1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomic.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
   87 #ifdef PRODUCT
   88 #define inc_counter_np(counter) ((void)0)
   89 #else
   90   void inc_counter_np_(uint& counter) {
   91     __ incrementw(ExternalAddress((address)&counter));
   92   }
   93 #define inc_counter_np(counter) \
   94   BLOCK_COMMENT("inc_counter " #counter); \
   95   inc_counter_np_(counter);
   96 #endif
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
  112   //
  113   // we save r30 (lr) as the return PC at the base of the frame and
  114   // link r29 (fp) below it as the frame pointer installing sp (r31)
  115   // into fp.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  124   // we don't need to save r8 which C uses as an indirect result location
  125   // return register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
  130   // we don't need to save r16-18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
  144   // -28 [ saved Floating-point Control Register ]
  145   // -26 [ saved v15            ] <--- sp_after_call
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
  173 
  174   // Call stub stack layout word offsets from fp
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
  199 
  200   address generate_call_stub(address& return_address) {
  201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  203            "adjust this code");
  204 
  205     StubGenStubId stub_id = StubGenStubId::call_stub_id;
  206     StubCodeMark mark(this, stub_id);
  207     address start = __ pc();
  208 
  209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  210 
  211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  213     const Address result        (rfp, result_off         * wordSize);
  214     const Address result_type   (rfp, result_type_off    * wordSize);
  215     const Address method        (rfp, method_off         * wordSize);
  216     const Address entry_point   (rfp, entry_point_off    * wordSize);
  217     const Address parameter_size(rfp, parameter_size_off * wordSize);
  218 
  219     const Address thread        (rfp, thread_off         * wordSize);
  220 
  221     const Address d15_save      (rfp, d15_off * wordSize);
  222     const Address d13_save      (rfp, d13_off * wordSize);
  223     const Address d11_save      (rfp, d11_off * wordSize);
  224     const Address d9_save       (rfp, d9_off * wordSize);
  225 
  226     const Address r28_save      (rfp, r28_off * wordSize);
  227     const Address r26_save      (rfp, r26_off * wordSize);
  228     const Address r24_save      (rfp, r24_off * wordSize);
  229     const Address r22_save      (rfp, r22_off * wordSize);
  230     const Address r20_save      (rfp, r20_off * wordSize);
  231 
  232     // stub code
  233 
  234     address aarch64_entry = __ pc();
  235 
  236     // set up frame and move sp to end of save area
  237     __ enter();
  238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  239 
  240     // save register parameters and Java scratch/global registers
  241     // n.b. we save thread even though it gets installed in
  242     // rthread because we want to sanity check rthread later
  243     __ str(c_rarg7,  thread);
  244     __ strw(c_rarg6, parameter_size);
  245     __ stp(c_rarg4, c_rarg5,  entry_point);
  246     __ stp(c_rarg2, c_rarg3,  result_type);
  247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  248 
  249     __ stp(r20, r19,   r20_save);
  250     __ stp(r22, r21,   r22_save);
  251     __ stp(r24, r23,   r24_save);
  252     __ stp(r26, r25,   r26_save);
  253     __ stp(r28, r27,   r28_save);
  254 
  255     __ stpd(v9,  v8,   d9_save);
  256     __ stpd(v11, v10,  d11_save);
  257     __ stpd(v13, v12,  d13_save);
  258     __ stpd(v15, v14,  d15_save);
  259 
  260     __ get_fpcr(rscratch1);
  261     __ str(rscratch1, fpcr_save);
  262     // Set FPCR to the state we need. We do want Round to Nearest. We
  263     // don't want non-IEEE rounding modes or floating-point traps.
  264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  266     __ set_fpcr(rscratch1);
  267 
  268     // install Java thread in global register now we have saved
  269     // whatever value it held
  270     __ mov(rthread, c_rarg7);
  271     // And method
  272     __ mov(rmethod, c_rarg3);
  273 
  274     // set up the heapbase register
  275     __ reinit_heapbase();
  276 
  277 #ifdef ASSERT
  278     // make sure we have no pending exceptions
  279     {
  280       Label L;
  281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  282       __ cmp(rscratch1, (u1)NULL_WORD);
  283       __ br(Assembler::EQ, L);
  284       __ stop("StubRoutines::call_stub: entered with pending exception");
  285       __ BIND(L);
  286     }
  287 #endif
  288     // pass parameters if any
  289     __ mov(esp, sp);
  290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  291     __ andr(sp, rscratch1, -2 * wordSize);
  292 
  293     BLOCK_COMMENT("pass parameters if any");
  294     Label parameters_done;
  295     // parameter count is still in c_rarg6
  296     // and parameter pointer identifying param 1 is in c_rarg5
  297     __ cbzw(c_rarg6, parameters_done);
  298 
  299     address loop = __ pc();
  300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  301     __ subsw(c_rarg6, c_rarg6, 1);
  302     __ push(rscratch1);
  303     __ br(Assembler::GT, loop);
  304 
  305     __ BIND(parameters_done);
  306 
  307     // call Java entry -- passing methdoOop, and current sp
  308     //      rmethod: Method*
  309     //      r19_sender_sp: sender sp
  310     BLOCK_COMMENT("call Java function");
  311     __ mov(r19_sender_sp, sp);
  312     __ blr(c_rarg4);
  313 
  314     // we do this here because the notify will already have been done
  315     // if we get to the next instruction via an exception
  316     //
  317     // n.b. adding this instruction here affects the calculation of
  318     // whether or not a routine returns to the call stub (used when
  319     // doing stack walks) since the normal test is to check the return
  320     // pc against the address saved below. so we may need to allow for
  321     // this extra instruction in the check.
  322 
  323     // save current address for use by exception handling code
  324 
  325     return_address = __ pc();
  326 
  327     // store result depending on type (everything that is not
  328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  329     // n.b. this assumes Java returns an integral result in r0
  330     // and a floating result in j_farg0
  331     // All of j_rargN may be used to return inline type fields so be careful
  332     // not to clobber those.
  333     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
  334     // assignment of Rresult below.
  335     Register Rresult = r14, Rresult_type = r15;
  336     __ ldr(Rresult, result);
  337     Label is_long, is_float, is_double, check_prim, exit;
  338     __ ldr(Rresult_type, result_type);
  339     __ cmp(Rresult_type, (u1)T_OBJECT);
  340     __ br(Assembler::EQ, check_prim);
  341     __ cmp(Rresult_type, (u1)T_LONG);
  342     __ br(Assembler::EQ, is_long);
  343     __ cmp(Rresult_type, (u1)T_FLOAT);
  344     __ br(Assembler::EQ, is_float);
  345     __ cmp(Rresult_type, (u1)T_DOUBLE);
  346     __ br(Assembler::EQ, is_double);
  347 
  348     // handle T_INT case
  349     __ strw(r0, Address(Rresult));
  350 
  351     __ BIND(exit);
  352 
  353     // pop parameters
  354     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  355 
  356 #ifdef ASSERT
  357     // verify that threads correspond
  358     {
  359       Label L, S;
  360       __ ldr(rscratch1, thread);
  361       __ cmp(rthread, rscratch1);
  362       __ br(Assembler::NE, S);
  363       __ get_thread(rscratch1);
  364       __ cmp(rthread, rscratch1);
  365       __ br(Assembler::EQ, L);
  366       __ BIND(S);
  367       __ stop("StubRoutines::call_stub: threads must correspond");
  368       __ BIND(L);
  369     }
  370 #endif
  371 
  372     __ pop_cont_fastpath(rthread);
  373 
  374     // restore callee-save registers
  375     __ ldpd(v15, v14,  d15_save);
  376     __ ldpd(v13, v12,  d13_save);
  377     __ ldpd(v11, v10,  d11_save);
  378     __ ldpd(v9,  v8,   d9_save);
  379 
  380     __ ldp(r28, r27,   r28_save);
  381     __ ldp(r26, r25,   r26_save);
  382     __ ldp(r24, r23,   r24_save);
  383     __ ldp(r22, r21,   r22_save);
  384     __ ldp(r20, r19,   r20_save);
  385 
  386     // restore fpcr
  387     __ ldr(rscratch1,  fpcr_save);
  388     __ set_fpcr(rscratch1);
  389 
  390     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  391     __ ldrw(c_rarg2, result_type);
  392     __ ldr(c_rarg3,  method);
  393     __ ldp(c_rarg4, c_rarg5,  entry_point);
  394     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  395 
  396     // leave frame and return to caller
  397     __ leave();
  398     __ ret(lr);
  399 
  400     // handle return types different from T_INT
  401     __ BIND(check_prim);
  402     if (InlineTypeReturnedAsFields) {
  403       // Check for scalarized return value
  404       __ tbz(r0, 0, is_long);
  405       // Load pack handler address
  406       __ andr(rscratch1, r0, -2);
  407       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
  408       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
  409       __ blr(rscratch1);
  410       __ b(exit);
  411     }
  412 
  413     __ BIND(is_long);
  414     __ str(r0, Address(Rresult, 0));
  415     __ br(Assembler::AL, exit);
  416 
  417     __ BIND(is_float);
  418     __ strs(j_farg0, Address(Rresult, 0));
  419     __ br(Assembler::AL, exit);
  420 
  421     __ BIND(is_double);
  422     __ strd(j_farg0, Address(Rresult, 0));
  423     __ br(Assembler::AL, exit);
  424 
  425     return start;
  426   }
  427 
  428   // Return point for a Java call if there's an exception thrown in
  429   // Java code.  The exception is caught and transformed into a
  430   // pending exception stored in JavaThread that can be tested from
  431   // within the VM.
  432   //
  433   // Note: Usually the parameters are removed by the callee. In case
  434   // of an exception crossing an activation frame boundary, that is
  435   // not the case if the callee is compiled code => need to setup the
  436   // rsp.
  437   //
  438   // r0: exception oop
  439 
  440   address generate_catch_exception() {
  441     StubGenStubId stub_id = StubGenStubId::catch_exception_id;
  442     StubCodeMark mark(this, stub_id);
  443     address start = __ pc();
  444 
  445     // same as in generate_call_stub():
  446     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  447     const Address thread        (rfp, thread_off         * wordSize);
  448 
  449 #ifdef ASSERT
  450     // verify that threads correspond
  451     {
  452       Label L, S;
  453       __ ldr(rscratch1, thread);
  454       __ cmp(rthread, rscratch1);
  455       __ br(Assembler::NE, S);
  456       __ get_thread(rscratch1);
  457       __ cmp(rthread, rscratch1);
  458       __ br(Assembler::EQ, L);
  459       __ bind(S);
  460       __ stop("StubRoutines::catch_exception: threads must correspond");
  461       __ bind(L);
  462     }
  463 #endif
  464 
  465     // set pending exception
  466     __ verify_oop(r0);
  467 
  468     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  469     __ mov(rscratch1, (address)__FILE__);
  470     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  471     __ movw(rscratch1, (int)__LINE__);
  472     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  473 
  474     // complete return to VM
  475     assert(StubRoutines::_call_stub_return_address != nullptr,
  476            "_call_stub_return_address must have been generated before");
  477     __ b(StubRoutines::_call_stub_return_address);
  478 
  479     return start;
  480   }
  481 
  482   // Continuation point for runtime calls returning with a pending
  483   // exception.  The pending exception check happened in the runtime
  484   // or native call stub.  The pending exception in Thread is
  485   // converted into a Java-level exception.
  486   //
  487   // Contract with Java-level exception handlers:
  488   // r0: exception
  489   // r3: throwing pc
  490   //
  491   // NOTE: At entry of this stub, exception-pc must be in LR !!
  492 
  493   // NOTE: this is always used as a jump target within generated code
  494   // so it just needs to be generated code with no x86 prolog
  495 
  496   address generate_forward_exception() {
  497     StubGenStubId stub_id = StubGenStubId::forward_exception_id;
  498     StubCodeMark mark(this, stub_id);
  499     address start = __ pc();
  500 
  501     // Upon entry, LR points to the return address returning into
  502     // Java (interpreted or compiled) code; i.e., the return address
  503     // becomes the throwing pc.
  504     //
  505     // Arguments pushed before the runtime call are still on the stack
  506     // but the exception handler will reset the stack pointer ->
  507     // ignore them.  A potential result in registers can be ignored as
  508     // well.
  509 
  510 #ifdef ASSERT
  511     // make sure this code is only executed if there is a pending exception
  512     {
  513       Label L;
  514       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  515       __ cbnz(rscratch1, L);
  516       __ stop("StubRoutines::forward exception: no pending exception (1)");
  517       __ bind(L);
  518     }
  519 #endif
  520 
  521     // compute exception handler into r19
  522 
  523     // call the VM to find the handler address associated with the
  524     // caller address. pass thread in r0 and caller pc (ret address)
  525     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  526     // the stack.
  527     __ mov(c_rarg1, lr);
  528     // lr will be trashed by the VM call so we move it to R19
  529     // (callee-saved) because we also need to pass it to the handler
  530     // returned by this call.
  531     __ mov(r19, lr);
  532     BLOCK_COMMENT("call exception_handler_for_return_address");
  533     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  534                          SharedRuntime::exception_handler_for_return_address),
  535                     rthread, c_rarg1);
  536     // Reinitialize the ptrue predicate register, in case the external runtime
  537     // call clobbers ptrue reg, as we may return to SVE compiled code.
  538     __ reinitialize_ptrue();
  539 
  540     // we should not really care that lr is no longer the callee
  541     // address. we saved the value the handler needs in r19 so we can
  542     // just copy it to r3. however, the C2 handler will push its own
  543     // frame and then calls into the VM and the VM code asserts that
  544     // the PC for the frame above the handler belongs to a compiled
  545     // Java method. So, we restore lr here to satisfy that assert.
  546     __ mov(lr, r19);
  547     // setup r0 & r3 & clear pending exception
  548     __ mov(r3, r19);
  549     __ mov(r19, r0);
  550     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  551     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  552 
  553 #ifdef ASSERT
  554     // make sure exception is set
  555     {
  556       Label L;
  557       __ cbnz(r0, L);
  558       __ stop("StubRoutines::forward exception: no pending exception (2)");
  559       __ bind(L);
  560     }
  561 #endif
  562 
  563     // continue at exception handler
  564     // r0: exception
  565     // r3: throwing pc
  566     // r19: exception handler
  567     __ verify_oop(r0);
  568     __ br(r19);
  569 
  570     return start;
  571   }
  572 
  573   // Non-destructive plausibility checks for oops
  574   //
  575   // Arguments:
  576   //    r0: oop to verify
  577   //    rscratch1: error message
  578   //
  579   // Stack after saving c_rarg3:
  580   //    [tos + 0]: saved c_rarg3
  581   //    [tos + 1]: saved c_rarg2
  582   //    [tos + 2]: saved lr
  583   //    [tos + 3]: saved rscratch2
  584   //    [tos + 4]: saved r0
  585   //    [tos + 5]: saved rscratch1
  586   address generate_verify_oop() {
  587     StubGenStubId stub_id = StubGenStubId::verify_oop_id;
  588     StubCodeMark mark(this, stub_id);
  589     address start = __ pc();
  590 
  591     Label exit, error;
  592 
  593     // save c_rarg2 and c_rarg3
  594     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  595 
  596     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  597     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  598     __ ldr(c_rarg3, Address(c_rarg2));
  599     __ add(c_rarg3, c_rarg3, 1);
  600     __ str(c_rarg3, Address(c_rarg2));
  601 
  602     // object is in r0
  603     // make sure object is 'reasonable'
  604     __ cbz(r0, exit); // if obj is null it is OK
  605 
  606     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  607     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  608 
  609     // return if everything seems ok
  610     __ bind(exit);
  611 
  612     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  613     __ ret(lr);
  614 
  615     // handle errors
  616     __ bind(error);
  617     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  618 
  619     __ push(RegSet::range(r0, r29), sp);
  620     // debug(char* msg, int64_t pc, int64_t regs[])
  621     __ mov(c_rarg0, rscratch1);      // pass address of error message
  622     __ mov(c_rarg1, lr);             // pass return address
  623     __ mov(c_rarg2, sp);             // pass address of regs on stack
  624 #ifndef PRODUCT
  625     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  626 #endif
  627     BLOCK_COMMENT("call MacroAssembler::debug");
  628     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  629     __ blr(rscratch1);
  630     __ hlt(0);
  631 
  632     return start;
  633   }
  634 
  635   // Generate indices for iota vector.
  636   address generate_iota_indices(StubGenStubId stub_id) {
  637     __ align(CodeEntryAlignment);
  638     StubCodeMark mark(this, stub_id);
  639     address start = __ pc();
  640     // B
  641     __ emit_data64(0x0706050403020100, relocInfo::none);
  642     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  643     // H
  644     __ emit_data64(0x0003000200010000, relocInfo::none);
  645     __ emit_data64(0x0007000600050004, relocInfo::none);
  646     // S
  647     __ emit_data64(0x0000000100000000, relocInfo::none);
  648     __ emit_data64(0x0000000300000002, relocInfo::none);
  649     // D
  650     __ emit_data64(0x0000000000000000, relocInfo::none);
  651     __ emit_data64(0x0000000000000001, relocInfo::none);
  652     // S - FP
  653     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  654     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  655     // D - FP
  656     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  657     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  658     return start;
  659   }
  660 
  661   // The inner part of zero_words().  This is the bulk operation,
  662   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  663   // caller is responsible for zeroing the last few words.
  664   //
  665   // Inputs:
  666   // r10: the HeapWord-aligned base address of an array to zero.
  667   // r11: the count in HeapWords, r11 > 0.
  668   //
  669   // Returns r10 and r11, adjusted for the caller to clear.
  670   // r10: the base address of the tail of words left to clear.
  671   // r11: the number of words in the tail.
  672   //      r11 < MacroAssembler::zero_words_block_size.
  673 
  674   address generate_zero_blocks() {
  675     Label done;
  676     Label base_aligned;
  677 
  678     Register base = r10, cnt = r11;
  679 
  680     __ align(CodeEntryAlignment);
  681     StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
  682     StubCodeMark mark(this, stub_id);
  683     address start = __ pc();
  684 
  685     if (UseBlockZeroing) {
  686       int zva_length = VM_Version::zva_length();
  687 
  688       // Ensure ZVA length can be divided by 16. This is required by
  689       // the subsequent operations.
  690       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  691 
  692       __ tbz(base, 3, base_aligned);
  693       __ str(zr, Address(__ post(base, 8)));
  694       __ sub(cnt, cnt, 1);
  695       __ bind(base_aligned);
  696 
  697       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  698       // alignment.
  699       Label small;
  700       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
  701       __ subs(rscratch1, cnt, low_limit >> 3);
  702       __ br(Assembler::LT, small);
  703       __ zero_dcache_blocks(base, cnt);
  704       __ bind(small);
  705     }
  706 
  707     {
  708       // Number of stp instructions we'll unroll
  709       const int unroll =
  710         MacroAssembler::zero_words_block_size / 2;
  711       // Clear the remaining blocks.
  712       Label loop;
  713       __ subs(cnt, cnt, unroll * 2);
  714       __ br(Assembler::LT, done);
  715       __ bind(loop);
  716       for (int i = 0; i < unroll; i++)
  717         __ stp(zr, zr, __ post(base, 16));
  718       __ subs(cnt, cnt, unroll * 2);
  719       __ br(Assembler::GE, loop);
  720       __ bind(done);
  721       __ add(cnt, cnt, unroll * 2);
  722     }
  723 
  724     __ ret(lr);
  725 
  726     return start;
  727   }
  728 
  729 
  730   typedef enum {
  731     copy_forwards = 1,
  732     copy_backwards = -1
  733   } copy_direction;
  734 
  735   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  736   // for arraycopy stubs.
  737   class ArrayCopyBarrierSetHelper : StackObj {
  738     BarrierSetAssembler* _bs_asm;
  739     MacroAssembler* _masm;
  740     DecoratorSet _decorators;
  741     BasicType _type;
  742     Register _gct1;
  743     Register _gct2;
  744     Register _gct3;
  745     FloatRegister _gcvt1;
  746     FloatRegister _gcvt2;
  747     FloatRegister _gcvt3;
  748 
  749   public:
  750     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  751                               DecoratorSet decorators,
  752                               BasicType type,
  753                               Register gct1,
  754                               Register gct2,
  755                               Register gct3,
  756                               FloatRegister gcvt1,
  757                               FloatRegister gcvt2,
  758                               FloatRegister gcvt3)
  759       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  760         _masm(masm),
  761         _decorators(decorators),
  762         _type(type),
  763         _gct1(gct1),
  764         _gct2(gct2),
  765         _gct3(gct3),
  766         _gcvt1(gcvt1),
  767         _gcvt2(gcvt2),
  768         _gcvt3(gcvt3) {
  769     }
  770 
  771     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  772       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  773                             dst1, dst2, src,
  774                             _gct1, _gct2, _gcvt1);
  775     }
  776 
  777     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  778       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  779                              dst, src1, src2,
  780                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  781     }
  782 
  783     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  784       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  785                             dst1, dst2, src,
  786                             _gct1);
  787     }
  788 
  789     void copy_store_at_16(Address dst, Register src1, Register src2) {
  790       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  791                              dst, src1, src2,
  792                              _gct1, _gct2, _gct3);
  793     }
  794 
  795     void copy_load_at_8(Register dst, Address src) {
  796       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  797                             dst, noreg, src,
  798                             _gct1);
  799     }
  800 
  801     void copy_store_at_8(Address dst, Register src) {
  802       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  803                              dst, src, noreg,
  804                              _gct1, _gct2, _gct3);
  805     }
  806   };
  807 
  808   // Bulk copy of blocks of 8 words.
  809   //
  810   // count is a count of words.
  811   //
  812   // Precondition: count >= 8
  813   //
  814   // Postconditions:
  815   //
  816   // The least significant bit of count contains the remaining count
  817   // of words to copy.  The rest of count is trash.
  818   //
  819   // s and d are adjusted to point to the remaining words to copy
  820   //
  821   void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
  822     BasicType type;
  823     copy_direction direction;
  824 
  825     switch (stub_id) {
  826     case copy_byte_f_id:
  827       direction = copy_forwards;
  828       type = T_BYTE;
  829       break;
  830     case copy_byte_b_id:
  831       direction = copy_backwards;
  832       type = T_BYTE;
  833       break;
  834     case copy_oop_f_id:
  835       direction = copy_forwards;
  836       type = T_OBJECT;
  837       break;
  838     case copy_oop_b_id:
  839       direction = copy_backwards;
  840       type = T_OBJECT;
  841       break;
  842     case copy_oop_uninit_f_id:
  843       direction = copy_forwards;
  844       type = T_OBJECT;
  845       break;
  846     case copy_oop_uninit_b_id:
  847       direction = copy_backwards;
  848       type = T_OBJECT;
  849       break;
  850     default:
  851       ShouldNotReachHere();
  852     }
  853 
  854     int unit = wordSize * direction;
  855     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
  856 
  857     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  858       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  859     const Register stride = r14;
  860     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  861     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  862     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  863 
  864     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  865     assert_different_registers(s, d, count, rscratch1, rscratch2);
  866 
  867     Label again, drain;
  868 
  869     __ align(CodeEntryAlignment);
  870 
  871     StubCodeMark mark(this, stub_id);
  872 
  873     __ bind(start);
  874 
  875     Label unaligned_copy_long;
  876     if (AvoidUnalignedAccesses) {
  877       __ tbnz(d, 3, unaligned_copy_long);
  878     }
  879 
  880     if (direction == copy_forwards) {
  881       __ sub(s, s, bias);
  882       __ sub(d, d, bias);
  883     }
  884 
  885 #ifdef ASSERT
  886     // Make sure we are never given < 8 words
  887     {
  888       Label L;
  889       __ cmp(count, (u1)8);
  890       __ br(Assembler::GE, L);
  891       __ stop("genrate_copy_longs called with < 8 words");
  892       __ bind(L);
  893     }
  894 #endif
  895 
  896     // Fill 8 registers
  897     if (UseSIMDForMemoryOps) {
  898       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  899       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  900     } else {
  901       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  902       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  903       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  904       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  905     }
  906 
  907     __ subs(count, count, 16);
  908     __ br(Assembler::LO, drain);
  909 
  910     int prefetch = PrefetchCopyIntervalInBytes;
  911     bool use_stride = false;
  912     if (direction == copy_backwards) {
  913        use_stride = prefetch > 256;
  914        prefetch = -prefetch;
  915        if (use_stride) __ mov(stride, prefetch);
  916     }
  917 
  918     __ bind(again);
  919 
  920     if (PrefetchCopyIntervalInBytes > 0)
  921       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  922 
  923     if (UseSIMDForMemoryOps) {
  924       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  925       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  926       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  927       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  928     } else {
  929       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  930       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  931       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  932       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  933       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  934       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  937     }
  938 
  939     __ subs(count, count, 8);
  940     __ br(Assembler::HS, again);
  941 
  942     // Drain
  943     __ bind(drain);
  944     if (UseSIMDForMemoryOps) {
  945       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  946       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  947     } else {
  948       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  949       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  950       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  951       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  952     }
  953 
  954     {
  955       Label L1, L2;
  956       __ tbz(count, exact_log2(4), L1);
  957       if (UseSIMDForMemoryOps) {
  958         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  959         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  960       } else {
  961         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  962         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  963         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  964         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  965       }
  966       __ bind(L1);
  967 
  968       if (direction == copy_forwards) {
  969         __ add(s, s, bias);
  970         __ add(d, d, bias);
  971       }
  972 
  973       __ tbz(count, 1, L2);
  974       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  975       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  976       __ bind(L2);
  977     }
  978 
  979     __ ret(lr);
  980 
  981     if (AvoidUnalignedAccesses) {
  982       Label drain, again;
  983       // Register order for storing. Order is different for backward copy.
  984 
  985       __ bind(unaligned_copy_long);
  986 
  987       // source address is even aligned, target odd aligned
  988       //
  989       // when forward copying word pairs we read long pairs at offsets
  990       // {0, 2, 4, 6} (in long words). when backwards copying we read
  991       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  992       // address by -2 in the forwards case so we can compute the
  993       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  994       // or -1.
  995       //
  996       // when forward copying we need to store 1 word, 3 pairs and
  997       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
  998       // zero offset We adjust the destination by -1 which means we
  999       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 1000       //
 1001       // When backwards copyng we need to store 1 word, 3 pairs and
 1002       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 1003       // offsets {1, 3, 5, 7, 8} * unit.
 1004 
 1005       if (direction == copy_forwards) {
 1006         __ sub(s, s, 16);
 1007         __ sub(d, d, 8);
 1008       }
 1009 
 1010       // Fill 8 registers
 1011       //
 1012       // for forwards copy s was offset by -16 from the original input
 1013       // value of s so the register contents are at these offsets
 1014       // relative to the 64 bit block addressed by that original input
 1015       // and so on for each successive 64 byte block when s is updated
 1016       //
 1017       // t0 at offset 0,  t1 at offset 8
 1018       // t2 at offset 16, t3 at offset 24
 1019       // t4 at offset 32, t5 at offset 40
 1020       // t6 at offset 48, t7 at offset 56
 1021 
 1022       // for backwards copy s was not offset so the register contents
 1023       // are at these offsets into the preceding 64 byte block
 1024       // relative to that original input and so on for each successive
 1025       // preceding 64 byte block when s is updated. this explains the
 1026       // slightly counter-intuitive looking pattern of register usage
 1027       // in the stp instructions for backwards copy.
 1028       //
 1029       // t0 at offset -16, t1 at offset -8
 1030       // t2 at offset -32, t3 at offset -24
 1031       // t4 at offset -48, t5 at offset -40
 1032       // t6 at offset -64, t7 at offset -56
 1033 
 1034       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1035       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1036       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1037       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1038 
 1039       __ subs(count, count, 16);
 1040       __ br(Assembler::LO, drain);
 1041 
 1042       int prefetch = PrefetchCopyIntervalInBytes;
 1043       bool use_stride = false;
 1044       if (direction == copy_backwards) {
 1045          use_stride = prefetch > 256;
 1046          prefetch = -prefetch;
 1047          if (use_stride) __ mov(stride, prefetch);
 1048       }
 1049 
 1050       __ bind(again);
 1051 
 1052       if (PrefetchCopyIntervalInBytes > 0)
 1053         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1054 
 1055       if (direction == copy_forwards) {
 1056        // allowing for the offset of -8 the store instructions place
 1057        // registers into the target 64 bit block at the following
 1058        // offsets
 1059        //
 1060        // t0 at offset 0
 1061        // t1 at offset 8,  t2 at offset 16
 1062        // t3 at offset 24, t4 at offset 32
 1063        // t5 at offset 40, t6 at offset 48
 1064        // t7 at offset 56
 1065 
 1066         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1067         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1068         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1069         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1070         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1071         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1072         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1073         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1074         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1075       } else {
 1076        // d was not offset when we started so the registers are
 1077        // written into the 64 bit block preceding d with the following
 1078        // offsets
 1079        //
 1080        // t1 at offset -8
 1081        // t3 at offset -24, t0 at offset -16
 1082        // t5 at offset -48, t2 at offset -32
 1083        // t7 at offset -56, t4 at offset -48
 1084        //                   t6 at offset -64
 1085        //
 1086        // note that this matches the offsets previously noted for the
 1087        // loads
 1088 
 1089         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1090         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1091         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1092         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1093         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1094         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1095         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1096         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1097         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1098       }
 1099 
 1100       __ subs(count, count, 8);
 1101       __ br(Assembler::HS, again);
 1102 
 1103       // Drain
 1104       //
 1105       // this uses the same pattern of offsets and register arguments
 1106       // as above
 1107       __ bind(drain);
 1108       if (direction == copy_forwards) {
 1109         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1110         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1111         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1112         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1113         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1114       } else {
 1115         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1116         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1117         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1118         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1119         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1120       }
 1121       // now we need to copy any remaining part block which may
 1122       // include a 4 word block subblock and/or a 2 word subblock.
 1123       // bits 2 and 1 in the count are the tell-tale for whether we
 1124       // have each such subblock
 1125       {
 1126         Label L1, L2;
 1127         __ tbz(count, exact_log2(4), L1);
 1128        // this is the same as above but copying only 4 longs hence
 1129        // with only one intervening stp between the str instructions
 1130        // but note that the offsets and registers still follow the
 1131        // same pattern
 1132         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1133         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1134         if (direction == copy_forwards) {
 1135           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1136           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1137           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1141           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1142         }
 1143         __ bind(L1);
 1144 
 1145         __ tbz(count, 1, L2);
 1146        // this is the same as above but copying only 2 longs hence
 1147        // there is no intervening stp between the str instructions
 1148        // but note that the offset and register patterns are still
 1149        // the same
 1150         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1151         if (direction == copy_forwards) {
 1152           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1153           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1154         } else {
 1155           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1156           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1157         }
 1158         __ bind(L2);
 1159 
 1160        // for forwards copy we need to re-adjust the offsets we
 1161        // applied so that s and d are follow the last words written
 1162 
 1163        if (direction == copy_forwards) {
 1164          __ add(s, s, 16);
 1165          __ add(d, d, 8);
 1166        }
 1167 
 1168       }
 1169 
 1170       __ ret(lr);
 1171       }
 1172   }
 1173 
 1174   // Small copy: less than 16 bytes.
 1175   //
 1176   // NB: Ignores all of the bits of count which represent more than 15
 1177   // bytes, so a caller doesn't have to mask them.
 1178 
 1179   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1180     bool is_backwards = step < 0;
 1181     size_t granularity = uabs(step);
 1182     int direction = is_backwards ? -1 : 1;
 1183 
 1184     Label Lword, Lint, Lshort, Lbyte;
 1185 
 1186     assert(granularity
 1187            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1188 
 1189     const Register t0 = r3;
 1190     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1191     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1192 
 1193     // ??? I don't know if this bit-test-and-branch is the right thing
 1194     // to do.  It does a lot of jumping, resulting in several
 1195     // mispredicted branches.  It might make more sense to do this
 1196     // with something like Duff's device with a single computed branch.
 1197 
 1198     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1199     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1200     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1201     __ bind(Lword);
 1202 
 1203     if (granularity <= sizeof (jint)) {
 1204       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1205       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1206       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1207       __ bind(Lint);
 1208     }
 1209 
 1210     if (granularity <= sizeof (jshort)) {
 1211       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1212       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1213       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1214       __ bind(Lshort);
 1215     }
 1216 
 1217     if (granularity <= sizeof (jbyte)) {
 1218       __ tbz(count, 0, Lbyte);
 1219       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1220       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1221       __ bind(Lbyte);
 1222     }
 1223   }
 1224 
 1225   Label copy_f, copy_b;
 1226   Label copy_obj_f, copy_obj_b;
 1227   Label copy_obj_uninit_f, copy_obj_uninit_b;
 1228 
 1229   // All-singing all-dancing memory copy.
 1230   //
 1231   // Copy count units of memory from s to d.  The size of a unit is
 1232   // step, which can be positive or negative depending on the direction
 1233   // of copy.  If is_aligned is false, we align the source address.
 1234   //
 1235 
 1236   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1237                    Register s, Register d, Register count, int step) {
 1238     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1239     bool is_backwards = step < 0;
 1240     unsigned int granularity = uabs(step);
 1241     const Register t0 = r3, t1 = r4;
 1242 
 1243     // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
 1244     // load all the data before writing anything
 1245     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1246     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1247     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1248     const Register send = r17, dend = r16;
 1249     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1250     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1251     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1252 
 1253     if (PrefetchCopyIntervalInBytes > 0)
 1254       __ prfm(Address(s, 0), PLDL1KEEP);
 1255     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1256     __ br(Assembler::HI, copy_big);
 1257 
 1258     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1259     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1260 
 1261     __ cmp(count, u1(16/granularity));
 1262     __ br(Assembler::LS, copy16);
 1263 
 1264     __ cmp(count, u1(64/granularity));
 1265     __ br(Assembler::HI, copy80);
 1266 
 1267     __ cmp(count, u1(32/granularity));
 1268     __ br(Assembler::LS, copy32);
 1269 
 1270     // 33..64 bytes
 1271     if (UseSIMDForMemoryOps) {
 1272       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1273       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1274       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1275       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1276     } else {
 1277       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1278       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1279       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1280       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1281 
 1282       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1283       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1284       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1285       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1286     }
 1287     __ b(finish);
 1288 
 1289     // 17..32 bytes
 1290     __ bind(copy32);
 1291     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1292     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1293 
 1294     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1295     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1296     __ b(finish);
 1297 
 1298     // 65..80/96 bytes
 1299     // (96 bytes if SIMD because we do 32 byes per instruction)
 1300     __ bind(copy80);
 1301     if (UseSIMDForMemoryOps) {
 1302       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1303       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1304       // Unaligned pointers can be an issue for copying.
 1305       // The issue has more chances to happen when granularity of data is
 1306       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
 1307       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1308       // The most performance drop has been seen for the range 65-80 bytes.
 1309       // For such cases using the pair of ldp/stp instead of the third pair of
 1310       // ldpq/stpq fixes the performance issue.
 1311       if (granularity < sizeof (jint)) {
 1312         Label copy96;
 1313         __ cmp(count, u1(80/granularity));
 1314         __ br(Assembler::HI, copy96);
 1315         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1316 
 1317         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1318         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1319 
 1320         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1321         __ b(finish);
 1322 
 1323         __ bind(copy96);
 1324       }
 1325       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1326 
 1327       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1328       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1329 
 1330       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1331     } else {
 1332       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1333       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1334       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1335       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1336       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1337 
 1338       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1339       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1340       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1341       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1342       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1343     }
 1344     __ b(finish);
 1345 
 1346     // 0..16 bytes
 1347     __ bind(copy16);
 1348     __ cmp(count, u1(8/granularity));
 1349     __ br(Assembler::LO, copy8);
 1350 
 1351     // 8..16 bytes
 1352     bs.copy_load_at_8(t0, Address(s, 0));
 1353     bs.copy_load_at_8(t1, Address(send, -8));
 1354     bs.copy_store_at_8(Address(d, 0), t0);
 1355     bs.copy_store_at_8(Address(dend, -8), t1);
 1356     __ b(finish);
 1357 
 1358     if (granularity < 8) {
 1359       // 4..7 bytes
 1360       __ bind(copy8);
 1361       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1362       __ ldrw(t0, Address(s, 0));
 1363       __ ldrw(t1, Address(send, -4));
 1364       __ strw(t0, Address(d, 0));
 1365       __ strw(t1, Address(dend, -4));
 1366       __ b(finish);
 1367       if (granularity < 4) {
 1368         // 0..3 bytes
 1369         __ bind(copy4);
 1370         __ cbz(count, finish); // get rid of 0 case
 1371         if (granularity == 2) {
 1372           __ ldrh(t0, Address(s, 0));
 1373           __ strh(t0, Address(d, 0));
 1374         } else { // granularity == 1
 1375           // Now 1..3 bytes. Handle the 1 and 2 byte cases by copying
 1376           // the first and last byte.
 1377           // Handle the 3 byte case by loading and storing base + count/2
 1378           // (count == 1: (s+0)->(d+0); count == 2,3: (s+1)->(d+1)).
 1379           // This does mean that in the 1 byte case we load/store the same
 1380           // byte 3 times.
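                // Example (count == 3): after the shift below count == 1, so we
                // copy s[0]->d[0], s[2]->d[2] (via send/dend - 1) and s[1]->d[1],
                // covering all three bytes.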
 1381           __ lsr(count, count, 1);
 1382           __ ldrb(t0, Address(s, 0));
 1383           __ ldrb(t1, Address(send, -1));
 1384           __ ldrb(t2, Address(s, count));
 1385           __ strb(t0, Address(d, 0));
 1386           __ strb(t1, Address(dend, -1));
 1387           __ strb(t2, Address(d, count));
 1388         }
 1389         __ b(finish);
 1390       }
 1391     }
 1392 
 1393     __ bind(copy_big);
 1394     if (is_backwards) {
 1395       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1396       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1397     }
 1398 
 1399     // Now we've got the small case out of the way, we can align the
 1400     // source address on a 2-word boundary.
 1401 
 1402     // Here we materialize a count in r15, which is used by copy_memory_small
 1403     // and the various generate_copy_longs stubs that we use for 2-word-aligned copies.
 1404     // Up until here we have used t9, which aliases r15, but from here on that register
 1405     // cannot be used as a temp register, as it contains the count.
 1406 
 1407     Label aligned;
 1408 
 1409     if (is_aligned) {
 1410       // We may have to adjust by 1 word to get s 2-word-aligned.
 1411       __ tbz(s, exact_log2(wordSize), aligned);
 1412       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1413       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1414       __ sub(count, count, wordSize/granularity);
 1415     } else {
 1416       if (is_backwards) {
 1417         __ andr(r15, s, 2 * wordSize - 1);
 1418       } else {
 1419         __ neg(r15, s);
 1420         __ andr(r15, r15, 2 * wordSize - 1);
 1421       }
 1422       // r15 is the byte adjustment needed to align s.
 1423       __ cbz(r15, aligned);
 1424       int shift = exact_log2(granularity);
 1425       if (shift > 0) {
 1426         __ lsr(r15, r15, shift);
 1427       }
 1428       __ sub(count, count, r15);
 1429 
 1430 #if 0
 1431       // ?? This code is only correct for a disjoint copy.  It may or
 1432       // may not make sense to use it in that case.
 1433 
 1434       // Copy the first pair; s and d may not be aligned.
 1435       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1436       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1437 
 1438       // Align s and d, adjust count
 1439       if (is_backwards) {
 1440         __ sub(s, s, r15);
 1441         __ sub(d, d, r15);
 1442       } else {
 1443         __ add(s, s, r15);
 1444         __ add(d, d, r15);
 1445       }
 1446 #else
 1447       copy_memory_small(decorators, type, s, d, r15, step);
 1448 #endif
 1449     }
 1450 
 1451     __ bind(aligned);
 1452 
 1453     // s is now 2-word-aligned.
 1454 
 1455     // We have a count of units and some trailing bytes. Adjust the
 1456     // count and do a bulk copy of words. If the shift is zero,
 1457     // perform a move instead to benefit from zero-latency moves.
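          // For a byte copy (granularity == 1), for example, shift == 3 and
          // r15 = count >> 3 is the number of 8-byte words passed to the bulk
          // copy stub.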
 1458     int shift = exact_log2(wordSize/granularity);
 1459     if (shift > 0) {
 1460       __ lsr(r15, count, shift);
 1461     } else {
 1462       __ mov(r15, count);
 1463     }
 1464     if (direction == copy_forwards) {
 1465       if (type != T_OBJECT) {
 1466         __ bl(copy_f);
 1467       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1468         __ bl(copy_obj_uninit_f);
 1469       } else {
 1470         __ bl(copy_obj_f);
 1471       }
 1472     } else {
 1473       if (type != T_OBJECT) {
 1474         __ bl(copy_b);
 1475       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1476         __ bl(copy_obj_uninit_b);
 1477       } else {
 1478         __ bl(copy_obj_b);
 1479       }
 1480     }
 1481 
 1482     // And the tail.
 1483     copy_memory_small(decorators, type, s, d, count, step);
 1484 
 1485     if (granularity >= 8) __ bind(copy8);
 1486     if (granularity >= 4) __ bind(copy4);
 1487     __ bind(finish);
 1488   }
 1489 
 1490 
 1491   void clobber_registers() {
 1492 #ifdef ASSERT
 1493     RegSet clobbered
 1494       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1495     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1496     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1497     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1498       __ mov(*it, rscratch1);
 1499     }
 1500 #endif
 1501 
 1502   }
 1503 
 1504   // Scan over array at a for count oops, verifying each one.
 1505   // Preserves a and count; clobbers temp, rscratch1 and rscratch2.
 1506   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1507     Label loop, end;
 1508     __ mov(rscratch1, a);
 1509     __ mov(rscratch2, zr);
 1510     __ bind(loop);
 1511     __ cmp(rscratch2, count);
 1512     __ br(Assembler::HS, end);
 1513     if (size == wordSize) {
 1514       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1515       __ verify_oop(temp);
 1516     } else {
 1517       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1518       __ decode_heap_oop(temp); // calls verify_oop
 1519     }
 1520     __ add(rscratch2, rscratch2, 1);
 1521     __ b(loop);
 1522     __ bind(end);
 1523   }
 1524 
 1525   // Arguments:
 1526   //   stub_id - is used to name the stub and identify all details of
 1527   //             how to perform the copy.
 1528   //
 1529   //   entry - is assigned to the stub's post push entry point unless
 1530   //           it is null
 1531   //
 1532   // Inputs:
 1533   //   c_rarg0   - source array address
 1534   //   c_rarg1   - destination array address
 1535   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1536   //
 1537   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1538   // the hardware handle it.  The two dwords within qwords that span
 1539   // cache line boundaries will still be loaded and stored atomically.
 1540   //
 1541   // Side Effects: entry is set to the (post push) entry point so it
 1542   //               can be used by the corresponding conjoint copy
 1543   //               method
 1544   //
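        // In rough C terms the generated stub behaves like the sketch below
        // (illustrative only; GC barriers, unsafe-access handling and the
        // SIMD/alignment dispatch in copy_memory are omitted):
        //
        //   int disjoint_copy(elem* s, elem* d, ssize_t count) {
        //     for (ssize_t i = 0; i < count; i++) d[i] = s[i];
        //     return 0;   // the stub always returns 0 in r0
        //   }
        //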
 1545   address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
 1546     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1547     RegSet saved_reg = RegSet::of(s, d, count);
 1548     int size;
 1549     bool aligned;
 1550     bool is_oop;
 1551     bool dest_uninitialized;
 1552     switch (stub_id) {
 1553     case jbyte_disjoint_arraycopy_id:
 1554       size = sizeof(jbyte);
 1555       aligned = false;
 1556       is_oop = false;
 1557       dest_uninitialized = false;
 1558       break;
 1559     case arrayof_jbyte_disjoint_arraycopy_id:
 1560       size = sizeof(jbyte);
 1561       aligned = true;
 1562       is_oop = false;
 1563       dest_uninitialized = false;
 1564       break;
 1565     case jshort_disjoint_arraycopy_id:
 1566       size = sizeof(jshort);
 1567       aligned = false;
 1568       is_oop = false;
 1569       dest_uninitialized = false;
 1570       break;
 1571     case arrayof_jshort_disjoint_arraycopy_id:
 1572       size = sizeof(jshort);
 1573       aligned = true;
 1574       is_oop = false;
 1575       dest_uninitialized = false;
 1576       break;
 1577     case jint_disjoint_arraycopy_id:
 1578       size = sizeof(jint);
 1579       aligned = false;
 1580       is_oop = false;
 1581       dest_uninitialized = false;
 1582       break;
 1583     case arrayof_jint_disjoint_arraycopy_id:
 1584       size = sizeof(jint);
 1585       aligned = true;
 1586       is_oop = false;
 1587       dest_uninitialized = false;
 1588       break;
 1589     case jlong_disjoint_arraycopy_id:
 1590       // since this is always aligned we can (should!) use the same
 1591       // stub as for case arrayof_jlong_disjoint_arraycopy
 1592       ShouldNotReachHere();
 1593       break;
 1594     case arrayof_jlong_disjoint_arraycopy_id:
 1595       size = sizeof(jlong);
 1596       aligned = true;
 1597       is_oop = false;
 1598       dest_uninitialized = false;
 1599       break;
 1600     case oop_disjoint_arraycopy_id:
 1601       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1602       aligned = !UseCompressedOops;
 1603       is_oop = true;
 1604       dest_uninitialized = false;
 1605       break;
 1606     case arrayof_oop_disjoint_arraycopy_id:
 1607       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1608       aligned = !UseCompressedOops;
 1609       is_oop = true;
 1610       dest_uninitialized = false;
 1611       break;
 1612     case oop_disjoint_arraycopy_uninit_id:
 1613       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1614       aligned = !UseCompressedOops;
 1615       is_oop = true;
 1616       dest_uninitialized = true;
 1617       break;
 1618     case arrayof_oop_disjoint_arraycopy_uninit_id:
 1619       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1620       aligned = !UseCompressedOops;
 1621       is_oop = true;
 1622       dest_uninitialized = true;
 1623       break;
 1624     default:
 1625       ShouldNotReachHere();
 1626       break;
 1627     }
 1628 
 1629     __ align(CodeEntryAlignment);
 1630     StubCodeMark mark(this, stub_id);
 1631     address start = __ pc();
 1632     __ enter();
 1633 
 1634     if (entry != nullptr) {
 1635       *entry = __ pc();
 1636       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1637       BLOCK_COMMENT("Entry:");
 1638     }
 1639 
 1640     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1641     if (dest_uninitialized) {
 1642       decorators |= IS_DEST_UNINITIALIZED;
 1643     }
 1644     if (aligned) {
 1645       decorators |= ARRAYCOPY_ALIGNED;
 1646     }
 1647 
 1648     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1649     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1650 
 1651     if (is_oop) {
 1652       // save regs before copy_memory
 1653       __ push(RegSet::of(d, count), sp);
 1654     }
 1655     {
 1656       // UnsafeMemoryAccess page error: continue after unsafe access
 1657       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1658       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1659       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1660     }
 1661 
 1662     if (is_oop) {
 1663       __ pop(RegSet::of(d, count), sp);
 1664       if (VerifyOops)
 1665         verify_oop_array(size, d, count, r16);
 1666     }
 1667 
 1668     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1669 
 1670     __ leave();
 1671     __ mov(r0, zr); // return 0
 1672     __ ret(lr);
 1673     return start;
 1674   }
 1675 
 1676   // Arguments:
 1677   //   stub_id - is used to name the stub and identify all details of
 1678   //             how to perform the copy.
 1679   //
 1680   //   nooverlap_target - identifies the (post push) entry for the
 1681   //             corresponding disjoint copy routine which can be
 1682   //             jumped to if the ranges do not actually overlap
 1683   //
 1684   //   entry - is assigned to the stub's post push entry point unless
 1685   //           it is null
 1686   //
 1687   //
 1688   // Inputs:
 1689   //   c_rarg0   - source array address
 1690   //   c_rarg1   - destination array address
 1691   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1692   //
 1693   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1694   // the hardware handle it.  The two dwords within qwords that span
 1695   // cache line boundaries will still be loaded and stored atomically.
 1696   //
 1697   // Side Effects:
 1698   //   entry is set to the no-overlap entry point so it can be used by
 1699   //   some other conjoint copy method
 1700   //
 1701   address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
 1702     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1703     RegSet saved_regs = RegSet::of(s, d, count);
 1704     int size;
 1705     bool aligned;
 1706     bool is_oop;
 1707     bool dest_uninitialized;
 1708     switch (stub_id) {
 1709     case jbyte_arraycopy_id:
 1710       size = sizeof(jbyte);
 1711       aligned = false;
 1712       is_oop = false;
 1713       dest_uninitialized = false;
 1714       break;
 1715     case arrayof_jbyte_arraycopy_id:
 1716       size = sizeof(jbyte);
 1717       aligned = true;
 1718       is_oop = false;
 1719       dest_uninitialized = false;
 1720       break;
 1721     case jshort_arraycopy_id:
 1722       size = sizeof(jshort);
 1723       aligned = false;
 1724       is_oop = false;
 1725       dest_uninitialized = false;
 1726       break;
 1727     case arrayof_jshort_arraycopy_id:
 1728       size = sizeof(jshort);
 1729       aligned = true;
 1730       is_oop = false;
 1731       dest_uninitialized = false;
 1732       break;
 1733     case jint_arraycopy_id:
 1734       size = sizeof(jint);
 1735       aligned = false;
 1736       is_oop = false;
 1737       dest_uninitialized = false;
 1738       break;
 1739     case arrayof_jint_arraycopy_id:
 1740       size = sizeof(jint);
 1741       aligned = true;
 1742       is_oop = false;
 1743       dest_uninitialized = false;
 1744       break;
 1745     case jlong_arraycopy_id:
 1746       // since this is always aligned we can (should!) use the same
 1747       // stub as for case arrayof_jlong_arraycopy
 1748       ShouldNotReachHere();
 1749       break;
 1750     case arrayof_jlong_arraycopy_id:
 1751       size = sizeof(jlong);
 1752       aligned = true;
 1753       is_oop = false;
 1754       dest_uninitialized = false;
 1755       break;
 1756     case oop_arraycopy_id:
 1757       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1758       aligned = !UseCompressedOops;
 1759       is_oop = true;
 1760       dest_uninitialized = false;
 1761       break;
 1762     case arrayof_oop_arraycopy_id:
 1763       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1764       aligned = !UseCompressedOops;
 1765       is_oop = true;
 1766       dest_uninitialized = false;
 1767       break;
 1768     case oop_arraycopy_uninit_id:
 1769       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1770       aligned = !UseCompressedOops;
 1771       is_oop = true;
 1772       dest_uninitialized = true;
 1773       break;
 1774     case arrayof_oop_arraycopy_uninit_id:
 1775       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1776       aligned = !UseCompressedOops;
 1777       is_oop = true;
 1778       dest_uninitialized = true;
 1779       break;
 1780     default:
 1781       ShouldNotReachHere();
 1782     }
 1783 
 1784     StubCodeMark mark(this, stub_id);
 1785     address start = __ pc();
 1786     __ enter();
 1787 
 1788     if (entry != nullptr) {
 1789       *entry = __ pc();
 1790       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1791       BLOCK_COMMENT("Entry:");
 1792     }
 1793 
 1794     // use fwd copy when (d-s) above_equal (count*size)
 1795     __ sub(rscratch1, d, s);
 1796     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1797     __ br(Assembler::HS, nooverlap_target);
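          // If d < s the subtraction wraps to a very large unsigned value
          // (>= count*size for any realistic count), so we take the forward
          // (disjoint) copy; if d >= s, forward copying is safe exactly when
          // d - s >= count*size, i.e. when the regions do not overlap.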
 1798 
 1799     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1800     if (dest_uninitialized) {
 1801       decorators |= IS_DEST_UNINITIALIZED;
 1802     }
 1803     if (aligned) {
 1804       decorators |= ARRAYCOPY_ALIGNED;
 1805     }
 1806 
 1807     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1808     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1809 
 1810     if (is_oop) {
 1811       // save regs before copy_memory
 1812       __ push(RegSet::of(d, count), sp);
 1813     }
 1814     {
 1815       // UnsafeMemoryAccess page error: continue after unsafe access
 1816       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1817       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1818       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1819     }
 1820     if (is_oop) {
 1821       __ pop(RegSet::of(d, count), sp);
 1822       if (VerifyOops)
 1823         verify_oop_array(size, d, count, r16);
 1824     }
 1825     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1826     __ leave();
 1827     __ mov(r0, zr); // return 0
 1828     __ ret(lr);
 1829     return start;
 1830   }
 1831 
 1832   // Helper for generating a dynamic type check.
 1833   // Smashes rscratch1, rscratch2.
 1834   void generate_type_check(Register sub_klass,
 1835                            Register super_check_offset,
 1836                            Register super_klass,
 1837                            Register temp1,
 1838                            Register temp2,
 1839                            Register result,
 1840                            Label& L_success) {
 1841     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1842 
 1843     BLOCK_COMMENT("type_check:");
 1844 
 1845     Label L_miss;
 1846 
 1847     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1848                                      super_check_offset);
 1849     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1850 
 1851     // Fall through on failure!
 1852     __ BIND(L_miss);
 1853   }
 1854 
 1855   //
 1856   //  Generate checkcasting array copy stub
 1857   //
 1858   //  Input:
 1859   //    c_rarg0   - source array address
 1860   //    c_rarg1   - destination array address
 1861   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1862   //    c_rarg3   - size_t ckoff (super_check_offset)
 1863   //    c_rarg4   - oop ckval (super_klass)
 1864   //
 1865   //  Output:
 1866   //    r0 ==  0  -  success
 1867   //    r0 == -1^K - failure, where K is partial transfer count
 1868   //
 1869   address generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
 1870     bool dest_uninitialized;
 1871     switch (stub_id) {
 1872     case checkcast_arraycopy_id:
 1873       dest_uninitialized = false;
 1874       break;
 1875     case checkcast_arraycopy_uninit_id:
 1876       dest_uninitialized = true;
 1877       break;
 1878     default:
 1879       ShouldNotReachHere();
 1880     }
 1881 
 1882     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1883 
 1884     // Input registers (after setup_arg_regs)
 1885     const Register from        = c_rarg0;   // source array address
 1886     const Register to          = c_rarg1;   // destination array address
 1887     const Register count       = c_rarg2;   // elements count
 1888     const Register ckoff       = c_rarg3;   // super_check_offset
 1889     const Register ckval       = c_rarg4;   // super_klass
 1890 
 1891     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1892     RegSet wb_post_saved_regs = RegSet::of(count);
 1893 
 1894     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1895     const Register copied_oop  = r22;       // actual oop copied
 1896     const Register count_save  = r21;       // orig elements count
 1897     const Register start_to    = r20;       // destination array start address
 1898     const Register r19_klass   = r19;       // oop._klass
 1899 
 1900     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1901     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1902 
 1903     //---------------------------------------------------------------
 1904     // Assembler stub will be used for this call to arraycopy
 1905     // if the two arrays are subtypes of Object[] but the
 1906     // destination array type is not equal to or a supertype
 1907     // of the source type.  Each element must be separately
 1908     // checked.
 1909 
 1910     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1911                                copied_oop, r19_klass, count_save);
 1912 
 1913     __ align(CodeEntryAlignment);
 1914     StubCodeMark mark(this, stub_id);
 1915     address start = __ pc();
 1916 
 1917     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1918 
 1919 #ifdef ASSERT
 1920     // caller guarantees that the arrays really are different
 1921     // otherwise, we would have to make conjoint checks
 1922     { Label L;
 1923       __ b(L);                  // conjoint check not yet implemented
 1924       __ stop("checkcast_copy within a single array");
 1925       __ bind(L);
 1926     }
 1927 #endif //ASSERT
 1928 
 1929     // Caller of this entry point must set up the argument registers.
 1930     if (entry != nullptr) {
 1931       *entry = __ pc();
 1932       BLOCK_COMMENT("Entry:");
 1933     }
 1934 
 1935      // Empty array:  Nothing to do.
 1936     __ cbz(count, L_done);
 1937     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1938 
 1939 #ifdef ASSERT
 1940     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1941     // The ckoff and ckval must be mutually consistent,
 1942     // even though caller generates both.
 1943     { Label L;
 1944       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1945       __ ldrw(start_to, Address(ckval, sco_offset));
 1946       __ cmpw(ckoff, start_to);
 1947       __ br(Assembler::EQ, L);
 1948       __ stop("super_check_offset inconsistent");
 1949       __ bind(L);
 1950     }
 1951 #endif //ASSERT
 1952 
 1953     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1954     bool is_oop = true;
 1955     int element_size = UseCompressedOops ? 4 : 8;
 1956     if (dest_uninitialized) {
 1957       decorators |= IS_DEST_UNINITIALIZED;
 1958     }
 1959 
 1960     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1961     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1962 
 1963     // save the original count
 1964     __ mov(count_save, count);
 1965 
 1966     // Copy from low to high addresses
 1967     __ mov(start_to, to);              // Save destination array start address
 1968     __ b(L_load_element);
 1969 
 1970     // ======== begin loop ========
 1971     // (Loop is rotated; its entry is L_load_element.)
 1972     // Loop control:
 1973     //   for (; count != 0; count--) {
 1974     //     copied_oop = load_heap_oop(from++);
 1975     //     ... generate_type_check ...;
 1976     //     store_heap_oop(to++, copied_oop);
 1977     //   }
 1978     __ align(OptoLoopAlignment);
 1979 
 1980     __ BIND(L_store_element);
 1981     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1982                       __ post(to, element_size), copied_oop, noreg,
 1983                       gct1, gct2, gct3);
 1984     __ sub(count, count, 1);
 1985     __ cbz(count, L_do_card_marks);
 1986 
 1987     // ======== loop entry is here ========
 1988     __ BIND(L_load_element);
 1989     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1990                      copied_oop, noreg, __ post(from, element_size),
 1991                      gct1);
 1992     __ cbz(copied_oop, L_store_element);
 1993 
 1994     __ load_klass(r19_klass, copied_oop);// query the object klass
 1995 
 1996     BLOCK_COMMENT("type_check:");
 1997     generate_type_check(/*sub_klass*/r19_klass,
 1998                         /*super_check_offset*/ckoff,
 1999                         /*super_klass*/ckval,
 2000                         /*r_array_base*/gct1,
 2001                         /*temp2*/gct2,
 2002                         /*result*/r10, L_store_element);
 2003 
 2004     // Fall through on failure!
 2005 
 2006     // ======== end loop ========
 2007 
 2008     // It was a real error; we must depend on the caller to finish the job.
 2009     // Register count = remaining oops, count_orig = total oops.
 2010     // Emit GC store barriers for the oops we have copied and report
 2011     // their number to the caller.
 2012 
 2013     __ subs(count, count_save, count);     // K = partially copied oop count
 2014     __ eon(count, count, zr);              // report (-1^K) to caller
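          // Example: if 5 oops were copied before the type check failed,
          // K == 5 and r0 == ~5 == -6; the caller recovers K as ~r0.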
 2015     __ br(Assembler::EQ, L_done_pop);
 2016 
 2017     __ BIND(L_do_card_marks);
 2018     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2019 
 2020     __ bind(L_done_pop);
 2021     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2022     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2023 
 2024     __ bind(L_done);
 2025     __ mov(r0, count);
 2026     __ leave();
 2027     __ ret(lr);
 2028 
 2029     return start;
 2030   }
 2031 
 2032   // Perform range checks on the proposed arraycopy.
 2033   // Kills temp, but nothing else.
 2034   // Also, clean the sign bits of src_pos and dst_pos.
 2035   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2036                               Register src_pos, // source position (c_rarg1)
 2037                               Register dst,     // destination array oop (c_rarg2)
 2038                               Register dst_pos, // destination position (c_rarg3)
 2039                               Register length,
 2040                               Register temp,
 2041                               Label& L_failed) {
 2042     BLOCK_COMMENT("arraycopy_range_checks:");
 2043 
 2044     assert_different_registers(rscratch1, temp);
 2045 
 2046     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2047     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2048     __ addw(temp, length, src_pos);
 2049     __ cmpw(temp, rscratch1);
 2050     __ br(Assembler::HI, L_failed);
 2051 
 2052     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2053     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2054     __ addw(temp, length, dst_pos);
 2055     __ cmpw(temp, rscratch1);
 2056     __ br(Assembler::HI, L_failed);
 2057 
 2058     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2059     __ movw(src_pos, src_pos);
 2060     __ movw(dst_pos, dst_pos);
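          // (writing a W register zeroes bits 63:32 of the corresponding X
          // register, so this is a 32-bit zero-extension in place)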
 2061 
 2062     BLOCK_COMMENT("arraycopy_range_checks done");
 2063   }
 2064 
 2065   // These stubs get called from some dumb test routine.
 2066   // I'll write them properly when they're called from
 2067   // something that's actually doing something.
 2068   static void fake_arraycopy_stub(address src, address dst, int count) {
 2069     assert(count == 0, "huh?");
 2070   }
 2071 
 2072 
 2073   //
 2074   //  Generate 'unsafe' array copy stub
 2075   //  Though just as safe as the other stubs, it takes an unscaled
 2076   //  size_t argument instead of an element count.
 2077   //
 2078   //  Input:
 2079   //    c_rarg0   - source array address
 2080   //    c_rarg1   - destination array address
 2081   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2082   //
 2083   // Examines the alignment of the operands and dispatches
 2084   // to a long, int, short, or byte copy loop.
 2085   //
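        // The dispatch is roughly:
        //
        //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
        //   if      ((bits & 7) == 0) goto long_copy;    // all 8-byte aligned
        //   else if ((bits & 3) == 0) goto int_copy;
        //   else if ((bits & 1) == 0) goto short_copy;
        //   else                      goto byte_copy;
        //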
 2086   address generate_unsafe_copy(address byte_copy_entry,
 2087                                address short_copy_entry,
 2088                                address int_copy_entry,
 2089                                address long_copy_entry) {
 2090     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
 2091 
 2092     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2093     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2094 
 2095     __ align(CodeEntryAlignment);
 2096     StubCodeMark mark(this, stub_id);
 2097     address start = __ pc();
 2098     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2099 
 2100     // bump this on entry, not on exit:
 2101     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2102 
 2103     __ orr(rscratch1, s, d);
 2104     __ orr(rscratch1, rscratch1, count);
 2105 
 2106     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2107     __ cbz(rscratch1, L_long_aligned);
 2108     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2109     __ cbz(rscratch1, L_int_aligned);
 2110     __ tbz(rscratch1, 0, L_short_aligned);
 2111     __ b(RuntimeAddress(byte_copy_entry));
 2112 
 2113     __ BIND(L_short_aligned);
 2114     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2115     __ b(RuntimeAddress(short_copy_entry));
 2116     __ BIND(L_int_aligned);
 2117     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2118     __ b(RuntimeAddress(int_copy_entry));
 2119     __ BIND(L_long_aligned);
 2120     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2121     __ b(RuntimeAddress(long_copy_entry));
 2122 
 2123     return start;
 2124   }
 2125 
 2126   //
 2127   //  Generate generic array copy stubs
 2128   //
 2129   //  Input:
 2130   //    c_rarg0    -  src oop
 2131   //    c_rarg1    -  src_pos (32-bits)
 2132   //    c_rarg2    -  dst oop
 2133   //    c_rarg3    -  dst_pos (32-bits)
 2134   //    c_rarg4    -  element count (32-bits)
 2135   //
 2136   //  Output:
 2137   //    r0 ==  0  -  success
 2138   //    r0 == -1^K - failure, where K is partial transfer count
 2139   //
 2140   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2141                                 address int_copy_entry, address oop_copy_entry,
 2142                                 address long_copy_entry, address checkcast_copy_entry) {
 2143     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
 2144 
 2145     Label L_failed, L_objArray;
 2146     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2147 
 2148     // Input registers
 2149     const Register src        = c_rarg0;  // source array oop
 2150     const Register src_pos    = c_rarg1;  // source position
 2151     const Register dst        = c_rarg2;  // destination array oop
 2152     const Register dst_pos    = c_rarg3;  // destination position
 2153     const Register length     = c_rarg4;
 2154 
 2155 
 2156     // Registers used as temps
 2157     const Register dst_klass  = c_rarg5;
 2158 
 2159     __ align(CodeEntryAlignment);
 2160 
 2161     StubCodeMark mark(this, stub_id);
 2162 
 2163     address start = __ pc();
 2164 
 2165     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2166 
 2167     // bump this on entry, not on exit:
 2168     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2169 
 2170     //-----------------------------------------------------------------------
 2171     // Assembler stub will be used for this call to arraycopy
 2172     // if the following conditions are met:
 2173     //
 2174     // (1) src and dst must not be null.
 2175     // (2) src_pos must not be negative.
 2176     // (3) dst_pos must not be negative.
 2177     // (4) length  must not be negative.
 2178     // (5) src klass and dst klass should be the same and not null.
 2179     // (6) src and dst should be arrays.
 2180     // (7) src_pos + length must not exceed length of src.
 2181     // (8) dst_pos + length must not exceed length of dst.
 2182     //
 2183 
 2184     //  if (src == nullptr) return -1;
 2185     __ cbz(src, L_failed);
 2186 
 2187     //  if (src_pos < 0) return -1;
 2188     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2189 
 2190     //  if (dst == nullptr) return -1;
 2191     __ cbz(dst, L_failed);
 2192 
 2193     //  if (dst_pos < 0) return -1;
 2194     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2195 
 2196     // registers used as temp
 2197     const Register scratch_length    = r16; // elements count to copy
 2198     const Register scratch_src_klass = r17; // array klass
 2199     const Register lh                = r15; // layout helper
 2200 
 2201     //  if (length < 0) return -1;
 2202     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2203     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2204 
 2205     __ load_klass(scratch_src_klass, src);
 2206 #ifdef ASSERT
 2207     //  assert(src->klass() != nullptr);
 2208     {
 2209       BLOCK_COMMENT("assert klasses not null {");
 2210       Label L1, L2;
 2211       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2212       __ bind(L1);
 2213       __ stop("broken null klass");
 2214       __ bind(L2);
 2215       __ load_klass(rscratch1, dst);
 2216       __ cbz(rscratch1, L1);     // this would be broken also
 2217       BLOCK_COMMENT("} assert klasses not null done");
 2218     }
 2219 #endif
 2220 
 2221     // Load layout helper (32-bits)
 2222     //
 2223     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2224     // 32        30    24            16              8     2                 0
 2225     //
 2226     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2227     //
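          // Illustrative decode of the fields:
          //
          //   array_tag         = ((juint)lh) >> Klass::_lh_array_tag_shift;  // 0x3 typeArray, 0x2 objArray
          //   header_size       = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
          //   log2_element_size = lh & Klass::_lh_log2_element_size_mask;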
 2228 
 2229     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2230 
 2231     // Handle objArrays completely differently...
 2232     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2233     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2234     __ movw(rscratch1, objArray_lh);
 2235     __ eorw(rscratch2, lh, rscratch1);
 2236     __ cbzw(rscratch2, L_objArray);
 2237 
 2238     //  if (src->klass() != dst->klass()) return -1;
 2239     __ load_klass(rscratch2, dst);
 2240     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2241     __ cbnz(rscratch2, L_failed);
 2242 
 2243     // Check for flat inline type array -> return -1
 2244     __ test_flat_array_oop(src, rscratch2, L_failed);
 2245 
 2246     // Check for null-free (non-flat) inline type array -> handle as object array
 2247     __ test_null_free_array_oop(src, rscratch2, L_objArray);
 2248 
 2249     //  if (!src->is_Array()) return -1;
 2250     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2251 
 2252     // At this point, it is known to be a typeArray (array_tag 0x3).
 2253 #ifdef ASSERT
 2254     {
 2255       BLOCK_COMMENT("assert primitive array {");
 2256       Label L;
 2257       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2258       __ cmpw(lh, rscratch2);
 2259       __ br(Assembler::GE, L);
 2260       __ stop("must be a primitive array");
 2261       __ bind(L);
 2262       BLOCK_COMMENT("} assert primitive array done");
 2263     }
 2264 #endif
 2265 
 2266     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2267                            rscratch2, L_failed);
 2268 
 2269     // TypeArrayKlass
 2270     //
 2271     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2272     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2273     //
 2274 
 2275     const Register rscratch1_offset = rscratch1;    // array offset
 2276     const Register r15_elsize = lh; // element size
 2277 
 2278     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2279            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2280     __ add(src, src, rscratch1_offset);           // src array offset
 2281     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2282     BLOCK_COMMENT("choose copy loop based on element size");
 2283 
 2284     // next registers should be set before the jump to corresponding stub
 2285     const Register from     = c_rarg0;  // source array address
 2286     const Register to       = c_rarg1;  // destination array address
 2287     const Register count    = c_rarg2;  // elements count
 2288 
 2289     // 'from', 'to', 'count' registers must be set in this order,
 2290     // since they alias 'src', 'src_pos' and 'dst' respectively.
 2291 
 2292     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2293 
 2294     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2295     // size in bytes).  We do a simple bitwise binary search.
 2296   __ BIND(L_copy_bytes);
 2297     __ tbnz(r15_elsize, 1, L_copy_ints);
 2298     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2299     __ lea(from, Address(src, src_pos));// src_addr
 2300     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2301     __ movw(count, scratch_length); // length
 2302     __ b(RuntimeAddress(byte_copy_entry));
 2303 
 2304   __ BIND(L_copy_shorts);
 2305     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2306     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2307     __ movw(count, scratch_length); // length
 2308     __ b(RuntimeAddress(short_copy_entry));
 2309 
 2310   __ BIND(L_copy_ints);
 2311     __ tbnz(r15_elsize, 0, L_copy_longs);
 2312     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2313     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2314     __ movw(count, scratch_length); // length
 2315     __ b(RuntimeAddress(int_copy_entry));
 2316 
 2317   __ BIND(L_copy_longs);
 2318 #ifdef ASSERT
 2319     {
 2320       BLOCK_COMMENT("assert long copy {");
 2321       Label L;
 2322       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2323       __ cmpw(r15_elsize, LogBytesPerLong);
 2324       __ br(Assembler::EQ, L);
 2325       __ stop("must be long copy, but elsize is wrong");
 2326       __ bind(L);
 2327       BLOCK_COMMENT("} assert long copy done");
 2328     }
 2329 #endif
 2330     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2331     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2332     __ movw(count, scratch_length); // length
 2333     __ b(RuntimeAddress(long_copy_entry));
 2334 
 2335     // ObjArrayKlass
 2336   __ BIND(L_objArray);
 2337     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2338 
 2339     Label L_plain_copy, L_checkcast_copy;
 2340     //  test array classes for subtyping
 2341     __ load_klass(r15, dst);
 2342     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2343     __ br(Assembler::NE, L_checkcast_copy);
 2344 
 2345     // Identically typed arrays can be copied without element-wise checks.
 2346     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2347                            rscratch2, L_failed);
 2348 
 2349     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2350     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2351     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2352     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2353     __ movw(count, scratch_length); // length
 2354   __ BIND(L_plain_copy);
 2355     __ b(RuntimeAddress(oop_copy_entry));
 2356 
 2357   __ BIND(L_checkcast_copy);
 2358     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2359     {
 2360       // Before looking at dst.length, make sure dst is also an objArray.
 2361       __ ldrw(rscratch1, Address(r15, lh_offset));
 2362       __ movw(rscratch2, objArray_lh);
 2363       __ eorw(rscratch1, rscratch1, rscratch2);
 2364       __ cbnzw(rscratch1, L_failed);
 2365 
 2366       // It is safe to examine both src.length and dst.length.
 2367       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2368                              r15, L_failed);
 2369 
 2370       __ load_klass(dst_klass, dst); // reload
 2371 
 2372       // Marshal the base address arguments now, freeing registers.
 2373       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2374       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2375       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2376       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2377       __ movw(count, length);           // length (reloaded)
 2378       Register sco_temp = c_rarg3;      // this register is free now
 2379       assert_different_registers(from, to, count, sco_temp,
 2380                                  dst_klass, scratch_src_klass);
 2381       // assert_clean_int(count, sco_temp);
 2382 
 2383       // Generate the type check.
 2384       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2385       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2386 
 2387       // Smashes rscratch1, rscratch2
 2388       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2389                           L_plain_copy);
 2390 
 2391       // Fetch destination element klass from the ObjArrayKlass header.
 2392       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2393       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2394       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2395 
 2396       // the checkcast_copy loop needs two extra arguments:
 2397       assert(c_rarg3 == sco_temp, "#3 already in place");
 2398       // Set up arguments for checkcast_copy_entry.
 2399       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2400       __ b(RuntimeAddress(checkcast_copy_entry));
 2401     }
 2402 
 2403   __ BIND(L_failed);
 2404     __ mov(r0, -1);
 2405     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2406     __ ret(lr);
 2407 
 2408     return start;
 2409   }
 2410 
 2411   //
 2412   // Generate stub for array fill. If "aligned" is true, the
 2413   // "to" address is assumed to be heapword aligned.
 2414   //
 2415   // Arguments for generated stub:
 2416   //   to:    c_rarg0
 2417   //   value: c_rarg1
 2418   //   count: c_rarg2 treated as signed
 2419   //
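        // The generated code first widens the fill value to 64 bits (e.g. a
        // byte value 0xAB becomes 0xABABABABABABABAB), aligns 'to' to an
        // 8-byte boundary, fills whole words (using zero_words when the value
        // is zero and UseBlockZeroing is enabled), and finally stores any
        // sub-word tail.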
 2420   address generate_fill(StubGenStubId stub_id) {
 2421     BasicType t;
 2422     bool aligned;
 2423 
 2424     switch (stub_id) {
 2425     case jbyte_fill_id:
 2426       t = T_BYTE;
 2427       aligned = false;
 2428       break;
 2429     case jshort_fill_id:
 2430       t = T_SHORT;
 2431       aligned = false;
 2432       break;
 2433     case jint_fill_id:
 2434       t = T_INT;
 2435       aligned = false;
 2436       break;
 2437     case arrayof_jbyte_fill_id:
 2438       t = T_BYTE;
 2439       aligned = true;
 2440       break;
 2441     case arrayof_jshort_fill_id:
 2442       t = T_SHORT;
 2443       aligned = true;
 2444       break;
 2445     case arrayof_jint_fill_id:
 2446       t = T_INT;
 2447       aligned = true;
 2448       break;
 2449     default:
 2450       ShouldNotReachHere();
 2451     };
 2452 
 2453     __ align(CodeEntryAlignment);
 2454     StubCodeMark mark(this, stub_id);
 2455     address start = __ pc();
 2456 
 2457     BLOCK_COMMENT("Entry:");
 2458 
 2459     const Register to        = c_rarg0;  // destination array address
 2460     const Register value     = c_rarg1;  // value
 2461     const Register count     = c_rarg2;  // elements count
 2462 
 2463     const Register bz_base = r10;        // base for block_zero routine
 2464     const Register cnt_words = r11;      // temp register
 2465 
 2466     __ enter();
 2467 
 2468     Label L_fill_elements, L_exit1;
 2469 
 2470     int shift = -1;
 2471     switch (t) {
 2472       case T_BYTE:
 2473         shift = 0;
 2474         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2475         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2476         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2477         __ br(Assembler::LO, L_fill_elements);
 2478         break;
 2479       case T_SHORT:
 2480         shift = 1;
 2481         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2482         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2483         __ br(Assembler::LO, L_fill_elements);
 2484         break;
 2485       case T_INT:
 2486         shift = 2;
 2487         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2488         __ br(Assembler::LO, L_fill_elements);
 2489         break;
 2490       default: ShouldNotReachHere();
 2491     }
 2492 
 2493     // Align source address at 8 bytes address boundary.
 2494     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2495     if (!aligned) {
 2496       switch (t) {
 2497         case T_BYTE:
 2498           // One-byte misalignment happens only for byte arrays.
 2499           __ tbz(to, 0, L_skip_align1);
 2500           __ strb(value, Address(__ post(to, 1)));
 2501           __ subw(count, count, 1);
 2502           __ bind(L_skip_align1);
 2503           // Fallthrough
 2504         case T_SHORT:
 2505           // Two-byte misalignment happens only for byte and short (char) arrays.
 2506           __ tbz(to, 1, L_skip_align2);
 2507           __ strh(value, Address(__ post(to, 2)));
 2508           __ subw(count, count, 2 >> shift);
 2509           __ bind(L_skip_align2);
 2510           // Fallthrough
 2511         case T_INT:
 2512           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2513           __ tbz(to, 2, L_skip_align4);
 2514           __ strw(value, Address(__ post(to, 4)));
 2515           __ subw(count, count, 4 >> shift);
 2516           __ bind(L_skip_align4);
 2517           break;
 2518         default: ShouldNotReachHere();
 2519       }
 2520     }
 2521 
 2522     //
 2523     //  Fill large chunks
 2524     //
 2525     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2526     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2527     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2528     if (UseBlockZeroing) {
 2529       Label non_block_zeroing, rest;
 2530       // If the fill value is zero we can use the fast zero_words().
 2531       __ cbnz(value, non_block_zeroing);
 2532       __ mov(bz_base, to);
 2533       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2534       address tpc = __ zero_words(bz_base, cnt_words);
 2535       if (tpc == nullptr) {
 2536         fatal("CodeCache is full at generate_fill");
 2537       }
 2538       __ b(rest);
 2539       __ bind(non_block_zeroing);
 2540       __ fill_words(to, cnt_words, value);
 2541       __ bind(rest);
 2542     } else {
 2543       __ fill_words(to, cnt_words, value);
 2544     }
 2545 
 2546     // Remaining count is less than 8 bytes. Fill it by a single store.
 2547     // Note that the total length is no less than 8 bytes.
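          // e.g. with 5 bytes left, the 8-byte store below covers those 5 bytes
          // plus the 3 already-filled bytes just before them (same value, so
          // the overlap is harmless).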
 2548     if (t == T_BYTE || t == T_SHORT) {
 2549       Label L_exit1;
 2550       __ cbzw(count, L_exit1);
 2551       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2552       __ str(value, Address(to, -8));    // overwrite some elements
 2553       __ bind(L_exit1);
 2554       __ leave();
 2555       __ ret(lr);
 2556     }
 2557 
 2558     // Handle fills of less than 8 bytes.
 2559     Label L_fill_2, L_fill_4, L_exit2;
 2560     __ bind(L_fill_elements);
 2561     switch (t) {
 2562       case T_BYTE:
 2563         __ tbz(count, 0, L_fill_2);
 2564         __ strb(value, Address(__ post(to, 1)));
 2565         __ bind(L_fill_2);
 2566         __ tbz(count, 1, L_fill_4);
 2567         __ strh(value, Address(__ post(to, 2)));
 2568         __ bind(L_fill_4);
 2569         __ tbz(count, 2, L_exit2);
 2570         __ strw(value, Address(to));
 2571         break;
 2572       case T_SHORT:
 2573         __ tbz(count, 0, L_fill_4);
 2574         __ strh(value, Address(__ post(to, 2)));
 2575         __ bind(L_fill_4);
 2576         __ tbz(count, 1, L_exit2);
 2577         __ strw(value, Address(to));
 2578         break;
 2579       case T_INT:
 2580         __ cbzw(count, L_exit2);
 2581         __ strw(value, Address(to));
 2582         break;
 2583       default: ShouldNotReachHere();
 2584     }
 2585     __ bind(L_exit2);
 2586     __ leave();
 2587     __ ret(lr);
 2588     return start;
 2589   }
 2590 
 2591   address generate_data_cache_writeback() {
 2592     const Register line        = c_rarg0;  // address of line to write back
 2593 
 2594     __ align(CodeEntryAlignment);
 2595 
 2596     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id;
 2597     StubCodeMark mark(this, stub_id);
 2598 
 2599     address start = __ pc();
 2600     __ enter();
 2601     __ cache_wb(Address(line, 0));
 2602     __ leave();
 2603     __ ret(lr);
 2604 
 2605     return start;
 2606   }
 2607 
 2608   address generate_data_cache_writeback_sync() {
 2609     const Register is_pre     = c_rarg0;  // pre or post sync
 2610 
 2611     __ align(CodeEntryAlignment);
 2612 
 2613     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id;
 2614     StubCodeMark mark(this, stub_id);
 2615 
 2616     // pre wbsync is a no-op
 2617     // post wbsync emits a memory barrier via cache_wbsync(false)
 2618 
 2619     Label skip;
 2620     address start = __ pc();
 2621     __ enter();
 2622     __ cbnz(is_pre, skip);
 2623     __ cache_wbsync(false);
 2624     __ bind(skip);
 2625     __ leave();
 2626     __ ret(lr);
 2627 
 2628     return start;
 2629   }
 2630 
 2631   void generate_arraycopy_stubs() {
 2632     address entry;
 2633     address entry_jbyte_arraycopy;
 2634     address entry_jshort_arraycopy;
 2635     address entry_jint_arraycopy;
 2636     address entry_oop_arraycopy;
 2637     address entry_jlong_arraycopy;
 2638     address entry_checkcast_arraycopy;
 2639 
 2640     generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
 2641     generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
 2642 
 2643     generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
 2644     generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
 2645 
 2646     generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
 2647     generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
 2648 
 2649     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2650 
 2651     //*** jbyte
 2652     // Always need aligned and unaligned versions
 2653     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
 2654     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
 2655     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
 2656     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
 2657 
 2658     //*** jshort
 2659     // Always need aligned and unaligned versions
 2660     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
 2661     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
 2662     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
 2663     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
 2664 
 2665     //*** jint
 2666     // Aligned versions
 2667     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
 2668     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2669     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2670     // entry_jint_arraycopy always points to the unaligned version
 2671     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
 2672     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2673 
 2674     //*** jlong
 2675     // It is always aligned
 2676     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
 2677     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
 2678     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2679     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2680 
 2681     //*** oops
 2682     {
 2683       // With compressed oops we need unaligned versions; notice that
 2684       // we overwrite entry_oop_arraycopy.
 2685       bool aligned = !UseCompressedOops;
 2686 
 2687       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2688         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
 2689       StubRoutines::_arrayof_oop_arraycopy
 2690         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
 2691       // Aligned versions without pre-barriers
 2692       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2693         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
 2694       StubRoutines::_arrayof_oop_arraycopy_uninit
 2695         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
 2696     }
 2697 
 2698     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2699     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2700     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2701     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2702 
 2703     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 2704     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
 2705 
 2706     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 2707                                                               entry_jshort_arraycopy,
 2708                                                               entry_jint_arraycopy,
 2709                                                               entry_jlong_arraycopy);
 2710 
 2711     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 2712                                                                entry_jshort_arraycopy,
 2713                                                                entry_jint_arraycopy,
 2714                                                                entry_oop_arraycopy,
 2715                                                                entry_jlong_arraycopy,
 2716                                                                entry_checkcast_arraycopy);
 2717 
 2718     StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
 2719     StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
 2720     StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
 2721     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
 2722     StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
 2723     StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
 2724   }
 2725 
 2726   void generate_math_stubs() { Unimplemented(); }
 2727 
 2728   // Arguments:
 2729   //
 2730   // Inputs:
 2731   //   c_rarg0   - source byte array address
 2732   //   c_rarg1   - destination byte array address
 2733   //   c_rarg2   - K (key) in little endian int array
 2734   //
 2735   address generate_aescrypt_encryptBlock() {
 2736     __ align(CodeEntryAlignment);
 2737     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
 2738     StubCodeMark mark(this, stub_id);
 2739 
 2740     const Register from        = c_rarg0;  // source array address
 2741     const Register to          = c_rarg1;  // destination array address
 2742     const Register key         = c_rarg2;  // key array address
 2743     const Register keylen      = rscratch1;
 2744 
 2745     address start = __ pc();
 2746     __ enter();
 2747 
 2748     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2749 
 2750     __ aesenc_loadkeys(key, keylen);
 2751     __ aesecb_encrypt(from, to, keylen);
 2752 
 2753     __ mov(r0, 0);
 2754 
 2755     __ leave();
 2756     __ ret(lr);
 2757 
 2758     return start;
 2759   }
 2760 
 2761   // Arguments:
 2762   //
 2763   // Inputs:
 2764   //   c_rarg0   - source byte array address
 2765   //   c_rarg1   - destination byte array address
 2766   //   c_rarg2   - K (key) in little endian int array
 2767   //
 2768   address generate_aescrypt_decryptBlock() {
 2769     assert(UseAES, "need AES cryptographic extension support");
 2770     __ align(CodeEntryAlignment);
 2771     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
 2772     StubCodeMark mark(this, stub_id);
 2773     Label L_doLast;
 2774 
 2775     const Register from        = c_rarg0;  // source array address
 2776     const Register to          = c_rarg1;  // destination array address
 2777     const Register key         = c_rarg2;  // key array address
 2778     const Register keylen      = rscratch1;
 2779 
 2780     address start = __ pc();
 2781     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2782 
 2783     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2784 
 2785     __ aesecb_decrypt(from, to, key, keylen);
 2786 
 2787     __ mov(r0, 0);
 2788 
 2789     __ leave();
 2790     __ ret(lr);
 2791 
 2792     return start;
 2793   }
 2794 
 2795   // Arguments:
 2796   //
 2797   // Inputs:
 2798   //   c_rarg0   - source byte array address
 2799   //   c_rarg1   - destination byte array address
 2800   //   c_rarg2   - K (key) in little endian int array
 2801   //   c_rarg3   - r vector byte array address
 2802   //   c_rarg4   - input length
 2803   //
 2804   // Output:
  //   r0        - input length
 2806   //
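  // For reference, a minimal C-style sketch of the CBC encryption this stub
  // performs (illustrative names; the stub keeps the chaining value in v0):
  //
  //   byte r[16] = load16(rvec);                // chaining value (IV at start)
  //   for (int i = 0; i < len; i += 16) {
  //     xor16(r, from + i);                     // r ^= plaintext block
  //     aes_encrypt_block(r, key);              // r = E_K(r)
  //     store16(to + i, r);                     // emit ciphertext block
  //   }
  //   store16(rvec, r);                         // save last ciphertext block
  //   return len;
  //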
 2807   address generate_cipherBlockChaining_encryptAESCrypt() {
 2808     assert(UseAES, "need AES cryptographic extension support");
 2809     __ align(CodeEntryAlignment);
 2810     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
 2811     StubCodeMark mark(this, stub_id);
 2812 
 2813     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2814 
 2815     const Register from        = c_rarg0;  // source array address
 2816     const Register to          = c_rarg1;  // destination array address
 2817     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
                                           // and left holding the last ciphertext block on exit
 2820     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2821     const Register keylen      = rscratch1;
 2822 
 2823     address start = __ pc();
 2824 
 2825       __ enter();
 2826 
 2827       __ movw(rscratch2, len_reg);
 2828 
 2829       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2830 
 2831       __ ld1(v0, __ T16B, rvec);
 2832 
 2833       __ cmpw(keylen, 52);
 2834       __ br(Assembler::CC, L_loadkeys_44);
 2835       __ br(Assembler::EQ, L_loadkeys_52);
 2836 
 2837       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2838       __ rev32(v17, __ T16B, v17);
 2839       __ rev32(v18, __ T16B, v18);
 2840     __ BIND(L_loadkeys_52);
 2841       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2842       __ rev32(v19, __ T16B, v19);
 2843       __ rev32(v20, __ T16B, v20);
 2844     __ BIND(L_loadkeys_44);
 2845       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2846       __ rev32(v21, __ T16B, v21);
 2847       __ rev32(v22, __ T16B, v22);
 2848       __ rev32(v23, __ T16B, v23);
 2849       __ rev32(v24, __ T16B, v24);
 2850       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2851       __ rev32(v25, __ T16B, v25);
 2852       __ rev32(v26, __ T16B, v26);
 2853       __ rev32(v27, __ T16B, v27);
 2854       __ rev32(v28, __ T16B, v28);
 2855       __ ld1(v29, v30, v31, __ T16B, key);
 2856       __ rev32(v29, __ T16B, v29);
 2857       __ rev32(v30, __ T16B, v30);
 2858       __ rev32(v31, __ T16B, v31);
 2859 
 2860     __ BIND(L_aes_loop);
 2861       __ ld1(v1, __ T16B, __ post(from, 16));
 2862       __ eor(v0, __ T16B, v0, v1);
 2863 
 2864       __ br(Assembler::CC, L_rounds_44);
 2865       __ br(Assembler::EQ, L_rounds_52);
 2866 
 2867       __ aese(v0, v17); __ aesmc(v0, v0);
 2868       __ aese(v0, v18); __ aesmc(v0, v0);
 2869     __ BIND(L_rounds_52);
 2870       __ aese(v0, v19); __ aesmc(v0, v0);
 2871       __ aese(v0, v20); __ aesmc(v0, v0);
 2872     __ BIND(L_rounds_44);
 2873       __ aese(v0, v21); __ aesmc(v0, v0);
 2874       __ aese(v0, v22); __ aesmc(v0, v0);
 2875       __ aese(v0, v23); __ aesmc(v0, v0);
 2876       __ aese(v0, v24); __ aesmc(v0, v0);
 2877       __ aese(v0, v25); __ aesmc(v0, v0);
 2878       __ aese(v0, v26); __ aesmc(v0, v0);
 2879       __ aese(v0, v27); __ aesmc(v0, v0);
 2880       __ aese(v0, v28); __ aesmc(v0, v0);
 2881       __ aese(v0, v29); __ aesmc(v0, v0);
 2882       __ aese(v0, v30);
 2883       __ eor(v0, __ T16B, v0, v31);
 2884 
 2885       __ st1(v0, __ T16B, __ post(to, 16));
 2886 
 2887       __ subw(len_reg, len_reg, 16);
 2888       __ cbnzw(len_reg, L_aes_loop);
 2889 
 2890       __ st1(v0, __ T16B, rvec);
 2891 
 2892       __ mov(r0, rscratch2);
 2893 
 2894       __ leave();
 2895       __ ret(lr);
 2896 
 2897       return start;
 2898   }
 2899 
 2900   // Arguments:
 2901   //
 2902   // Inputs:
 2903   //   c_rarg0   - source byte array address
 2904   //   c_rarg1   - destination byte array address
 2905   //   c_rarg2   - K (key) in little endian int array
 2906   //   c_rarg3   - r vector byte array address
 2907   //   c_rarg4   - input length
 2908   //
 2909   // Output:
 2910   //   r0        - input length
 2911   //
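  // For reference, a minimal C-style sketch of the CBC decryption this stub
  // performs (illustrative names; v2 holds the chaining value, v1 the saved
  // ciphertext block):
  //
  //   byte r[16] = load16(rvec);                // chaining value (IV at start)
  //   for (int i = 0; i < len; i += 16) {
  //     byte c[16] = load16(from + i);          // ciphertext block
  //     store16(to + i, aes_decrypt_block(c, key) ^ r);
  //     r = c;                                  // next chaining value
  //   }
  //   store16(rvec, r);
  //   return len;
  //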
 2912   address generate_cipherBlockChaining_decryptAESCrypt() {
 2913     assert(UseAES, "need AES cryptographic extension support");
 2914     __ align(CodeEntryAlignment);
 2915     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
 2916     StubCodeMark mark(this, stub_id);
 2917 
 2918     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2919 
 2920     const Register from        = c_rarg0;  // source array address
 2921     const Register to          = c_rarg1;  // destination array address
 2922     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
                                           // and left holding the last input ciphertext block on exit
 2925     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2926     const Register keylen      = rscratch1;
 2927 
 2928     address start = __ pc();
 2929 
 2930       __ enter();
 2931 
 2932       __ movw(rscratch2, len_reg);
 2933 
 2934       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2935 
 2936       __ ld1(v2, __ T16B, rvec);
 2937 
 2938       __ ld1(v31, __ T16B, __ post(key, 16));
 2939       __ rev32(v31, __ T16B, v31);
 2940 
 2941       __ cmpw(keylen, 52);
 2942       __ br(Assembler::CC, L_loadkeys_44);
 2943       __ br(Assembler::EQ, L_loadkeys_52);
 2944 
 2945       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2946       __ rev32(v17, __ T16B, v17);
 2947       __ rev32(v18, __ T16B, v18);
 2948     __ BIND(L_loadkeys_52);
 2949       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2950       __ rev32(v19, __ T16B, v19);
 2951       __ rev32(v20, __ T16B, v20);
 2952     __ BIND(L_loadkeys_44);
 2953       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2954       __ rev32(v21, __ T16B, v21);
 2955       __ rev32(v22, __ T16B, v22);
 2956       __ rev32(v23, __ T16B, v23);
 2957       __ rev32(v24, __ T16B, v24);
 2958       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2959       __ rev32(v25, __ T16B, v25);
 2960       __ rev32(v26, __ T16B, v26);
 2961       __ rev32(v27, __ T16B, v27);
 2962       __ rev32(v28, __ T16B, v28);
 2963       __ ld1(v29, v30, __ T16B, key);
 2964       __ rev32(v29, __ T16B, v29);
 2965       __ rev32(v30, __ T16B, v30);
 2966 
 2967     __ BIND(L_aes_loop);
 2968       __ ld1(v0, __ T16B, __ post(from, 16));
 2969       __ orr(v1, __ T16B, v0, v0);
 2970 
 2971       __ br(Assembler::CC, L_rounds_44);
 2972       __ br(Assembler::EQ, L_rounds_52);
 2973 
 2974       __ aesd(v0, v17); __ aesimc(v0, v0);
 2975       __ aesd(v0, v18); __ aesimc(v0, v0);
 2976     __ BIND(L_rounds_52);
 2977       __ aesd(v0, v19); __ aesimc(v0, v0);
 2978       __ aesd(v0, v20); __ aesimc(v0, v0);
 2979     __ BIND(L_rounds_44);
 2980       __ aesd(v0, v21); __ aesimc(v0, v0);
 2981       __ aesd(v0, v22); __ aesimc(v0, v0);
 2982       __ aesd(v0, v23); __ aesimc(v0, v0);
 2983       __ aesd(v0, v24); __ aesimc(v0, v0);
 2984       __ aesd(v0, v25); __ aesimc(v0, v0);
 2985       __ aesd(v0, v26); __ aesimc(v0, v0);
 2986       __ aesd(v0, v27); __ aesimc(v0, v0);
 2987       __ aesd(v0, v28); __ aesimc(v0, v0);
 2988       __ aesd(v0, v29); __ aesimc(v0, v0);
 2989       __ aesd(v0, v30);
 2990       __ eor(v0, __ T16B, v0, v31);
 2991       __ eor(v0, __ T16B, v0, v2);
 2992 
 2993       __ st1(v0, __ T16B, __ post(to, 16));
 2994       __ orr(v2, __ T16B, v1, v1);
 2995 
 2996       __ subw(len_reg, len_reg, 16);
 2997       __ cbnzw(len_reg, L_aes_loop);
 2998 
 2999       __ st1(v2, __ T16B, rvec);
 3000 
 3001       __ mov(r0, rscratch2);
 3002 
 3003       __ leave();
 3004       __ ret(lr);
 3005 
 3006     return start;
 3007   }
 3008 
  // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: in (the 128-bit value) and inc (the 64-bit increment); both are
  // preserved. The least-significant 64-bit word is in the upper dword of
  // each vector; inc's lower dword must be zero.
  // Output: result
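  //
  // For reference, the scalar equivalent (a rough sketch; hi/lo denote the
  // most/least significant 64-bit halves of the 128-bit value):
  //
  //   lo = in_lo + inc;
  //   hi = in_hi + (lo < inc ? 1 : 0);   // carry out of the low word
  //
  // The cm(HI)/ext/subv sequence below implements the carry: the unsigned
  // compare yields all-ones (-1) in the least-significant word's lane on
  // overflow, ext swaps it into the most-significant word's lane, and
  // subtracting -1 adds 1.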
 3014   void be_add_128_64(FloatRegister result, FloatRegister in,
 3015                      FloatRegister inc, FloatRegister tmp) {
 3016     assert_different_registers(result, tmp, inc);
 3017 
 3018     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3019                                            // input
    __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
 3021     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3022                                            // MSD == 0 (must be!) to LSD
 3023     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3024   }
 3025 
 3026   // CTR AES crypt.
 3027   // Arguments:
 3028   //
 3029   // Inputs:
 3030   //   c_rarg0   - source byte array address
 3031   //   c_rarg1   - destination byte array address
 3032   //   c_rarg2   - K (key) in little endian int array
 3033   //   c_rarg3   - counter vector byte array address
 3034   //   c_rarg4   - input length
 3035   //   c_rarg5   - saved encryptedCounter start
 3036   //   c_rarg6   - saved used length
 3037   //
 3038   // Output:
 3039   //   r0       - input length
 3040   //
 3041   address generate_counterMode_AESCrypt() {
 3042     const Register in = c_rarg0;
 3043     const Register out = c_rarg1;
 3044     const Register key = c_rarg2;
 3045     const Register counter = c_rarg3;
 3046     const Register saved_len = c_rarg4, len = r10;
 3047     const Register saved_encrypted_ctr = c_rarg5;
 3048     const Register used_ptr = c_rarg6, used = r12;
 3049 
 3050     const Register offset = r7;
 3051     const Register keylen = r11;
 3052 
 3053     const unsigned char block_size = 16;
 3054     const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8. 8 gives slightly better
    // performance with larger data sizes, but it also means that the
    // fast path isn't used until there are at least 8 blocks, and up
    // to 127 bytes of data will be processed on the slow path. For
    // that reason, and also so as not to blow away too much icache, 4
    // blocks seems like a sensible compromise.
 3061 
 3062     // Algorithm:
 3063     //
 3064     //    if (len == 0) {
 3065     //        goto DONE;
 3066     //    }
 3067     //    int result = len;
 3068     //    do {
 3069     //        if (used >= blockSize) {
 3070     //            if (len >= bulk_width * blockSize) {
 3071     //                CTR_large_block();
 3072     //                if (len == 0)
 3073     //                    goto DONE;
 3074     //            }
 3075     //            for (;;) {
 3076     //                16ByteVector v0 = counter;
 3077     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3078     //                used = 0;
 3079     //                if (len < blockSize)
 3080     //                    break;    /* goto NEXT */
 3081     //                16ByteVector v1 = load16Bytes(in, offset);
 3082     //                v1 = v1 ^ encryptedCounter;
 3083     //                store16Bytes(out, offset);
 3084     //                used = blockSize;
 3085     //                offset += blockSize;
 3086     //                len -= blockSize;
 3087     //                if (len == 0)
 3088     //                    goto DONE;
 3089     //            }
 3090     //        }
 3091     //      NEXT:
 3092     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3093     //        len--;
 3094     //    } while (len != 0);
 3095     //  DONE:
 3096     //    return result;
 3097     //
 3098     // CTR_large_block()
 3099     //    Wide bulk encryption of whole blocks.
 3100 
 3101     __ align(CodeEntryAlignment);
 3102     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
 3103     StubCodeMark mark(this, stub_id);
 3104     const address start = __ pc();
 3105     __ enter();
 3106 
 3107     Label DONE, CTR_large_block, large_block_return;
 3108     __ ldrw(used, Address(used_ptr));
 3109     __ cbzw(saved_len, DONE);
 3110 
 3111     __ mov(len, saved_len);
 3112     __ mov(offset, 0);
 3113 
 3114     // Compute #rounds for AES based on the length of the key array
 3115     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3116 
 3117     __ aesenc_loadkeys(key, keylen);
 3118 
 3119     {
 3120       Label L_CTR_loop, NEXT;
 3121 
 3122       __ bind(L_CTR_loop);
 3123 
 3124       __ cmp(used, block_size);
 3125       __ br(__ LO, NEXT);
 3126 
 3127       // Maybe we have a lot of data
 3128       __ subsw(rscratch1, len, bulk_width * block_size);
 3129       __ br(__ HS, CTR_large_block);
 3130       __ BIND(large_block_return);
 3131       __ cbzw(len, DONE);
 3132 
 3133       // Setup the counter
 3134       __ movi(v4, __ T4S, 0);
 3135       __ movi(v5, __ T4S, 1);
 3136       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3137 
 3138       // 128-bit big-endian increment
 3139       __ ld1(v0, __ T16B, counter);
 3140       __ rev64(v16, __ T16B, v0);
 3141       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3142       __ rev64(v16, __ T16B, v16);
 3143       __ st1(v16, __ T16B, counter);
 3144       // Previous counter value is in v0
 3145       // v4 contains { 0, 1 }
 3146 
 3147       {
 3148         // We have fewer than bulk_width blocks of data left. Encrypt
 3149         // them one by one until there is less than a full block
 3150         // remaining, being careful to save both the encrypted counter
 3151         // and the counter.
 3152 
 3153         Label inner_loop;
 3154         __ bind(inner_loop);
 3155         // Counter to encrypt is in v0
 3156         __ aesecb_encrypt(noreg, noreg, keylen);
 3157         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3158 
 3159         // Do we have a remaining full block?
 3160 
 3161         __ mov(used, 0);
 3162         __ cmp(len, block_size);
 3163         __ br(__ LO, NEXT);
 3164 
 3165         // Yes, we have a full block
 3166         __ ldrq(v1, Address(in, offset));
 3167         __ eor(v1, __ T16B, v1, v0);
 3168         __ strq(v1, Address(out, offset));
 3169         __ mov(used, block_size);
 3170         __ add(offset, offset, block_size);
 3171 
 3172         __ subw(len, len, block_size);
 3173         __ cbzw(len, DONE);
 3174 
 3175         // Increment the counter, store it back
 3176         __ orr(v0, __ T16B, v16, v16);
 3177         __ rev64(v16, __ T16B, v16);
 3178         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3179         __ rev64(v16, __ T16B, v16);
 3180         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3181 
 3182         __ b(inner_loop);
 3183       }
 3184 
 3185       __ BIND(NEXT);
 3186 
 3187       // Encrypt a single byte, and loop.
 3188       // We expect this to be a rare event.
 3189       __ ldrb(rscratch1, Address(in, offset));
 3190       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3191       __ eor(rscratch1, rscratch1, rscratch2);
 3192       __ strb(rscratch1, Address(out, offset));
 3193       __ add(offset, offset, 1);
 3194       __ add(used, used, 1);
      __ subw(len, len, 1);
 3196       __ cbnzw(len, L_CTR_loop);
 3197     }
 3198 
 3199     __ bind(DONE);
 3200     __ strw(used, Address(used_ptr));
 3201     __ mov(r0, saved_len);
 3202 
 3203     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3204     __ ret(lr);
 3205 
 3206     // Bulk encryption
 3207 
    __ BIND(CTR_large_block);
 3209     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3210 
 3211     if (bulk_width == 8) {
 3212       __ sub(sp, sp, 4 * 16);
 3213       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3214     }
 3215     __ sub(sp, sp, 4 * 16);
 3216     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3217     RegSet saved_regs = (RegSet::of(in, out, offset)
 3218                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3219     __ push(saved_regs, sp);
 3220     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3221     __ add(in, in, offset);
 3222     __ add(out, out, offset);
 3223 
 3224     // Keys should already be loaded into the correct registers
 3225 
 3226     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3227     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3228 
 3229     // AES/CTR loop
 3230     {
 3231       Label L_CTR_loop;
 3232       __ BIND(L_CTR_loop);
 3233 
 3234       // Setup the counters
 3235       __ movi(v8, __ T4S, 0);
 3236       __ movi(v9, __ T4S, 1);
 3237       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3238 
 3239       for (int i = 0; i < bulk_width; i++) {
 3240         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3241         __ rev64(v0_ofs, __ T16B, v16);
 3242         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3243       }
 3244 
 3245       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3246 
 3247       // Encrypt the counters
 3248       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3249 
 3250       if (bulk_width == 8) {
 3251         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3252       }
 3253 
 3254       // XOR the encrypted counters with the inputs
 3255       for (int i = 0; i < bulk_width; i++) {
 3256         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3257         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3258         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3259       }
 3260 
 3261       // Write the encrypted data
 3262       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3263       if (bulk_width == 8) {
 3264         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3265       }
 3266 
 3267       __ subw(len, len, 16 * bulk_width);
 3268       __ cbnzw(len, L_CTR_loop);
 3269     }
 3270 
 3271     // Save the counter back where it goes
 3272     __ rev64(v16, __ T16B, v16);
 3273     __ st1(v16, __ T16B, counter);
 3274 
 3275     __ pop(saved_regs, sp);
 3276 
 3277     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3278     if (bulk_width == 8) {
 3279       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3280     }
 3281 
 3282     __ andr(rscratch1, len, -16 * bulk_width);
 3283     __ sub(len, len, rscratch1);
 3284     __ add(offset, offset, rscratch1);
 3285     __ mov(used, 16);
 3286     __ strw(used, Address(used_ptr));
 3287     __ b(large_block_return);
 3288 
 3289     return start;
 3290   }
 3291 
 3292   // Vector AES Galois Counter Mode implementation. Parameters:
 3293   //
 3294   // in = c_rarg0
 3295   // len = c_rarg1
 3296   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3297   // out = c_rarg3
 3298   // key = c_rarg4
 3299   // state = c_rarg5 - GHASH.state
 3300   // subkeyHtbl = c_rarg6 - powers of H
 3301   // counter = c_rarg7 - 16 bytes of CTR
 3302   // return - number of processed bytes
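  //
  // Rough shape of the stub (a sketch only; the code processes len rounded
  // down to a multiple of 128 bytes, eight counter blocks per iteration):
  //
  //   for each group of 8 16-byte blocks:
  //     generate 8 counter values and AES-ECB encrypt them (into v0..v7)
  //     xor them with the input and store the result to out
  //   store the incremented counter back
  //   run ghash_processBlocks_wide over ct to update state
  //   return the number of bytes processed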
 3303   address generate_galoisCounterMode_AESCrypt() {
 3304     address ghash_polynomial = __ pc();
 3305     __ emit_int64(0x87);  // The low-order bits of the field
 3306                           // polynomial (i.e. p = z^7+z^2+z+1)
 3307                           // repeated in the low and high parts of a
 3308                           // 128-bit vector
 3309     __ emit_int64(0x87);
 3310 
 3311     __ align(CodeEntryAlignment);
 3312     StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id;
 3313     StubCodeMark mark(this, stub_id);
 3314     address start = __ pc();
 3315     __ enter();
 3316 
 3317     const Register in = c_rarg0;
 3318     const Register len = c_rarg1;
 3319     const Register ct = c_rarg2;
    const Register out = c_rarg3;
 3322 
 3323     const Register key = c_rarg4;
 3324     const Register state = c_rarg5;
 3325 
 3326     const Register subkeyHtbl = c_rarg6;
 3327 
    const Register counter = c_rarg7;  // updated with the incremented counter at the end
 3329 
 3330     const Register keylen = r10;
 3331     // Save state before entering routine
 3332     __ sub(sp, sp, 4 * 16);
 3333     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3334     __ sub(sp, sp, 4 * 16);
 3335     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3336 
 3338     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3339     __ str(len, __ pre(sp, -2 * wordSize));
 3340 
 3341     Label DONE;
 3342     __ cbz(len, DONE);
 3343 
 3344     // Compute #rounds for AES based on the length of the key array
 3345     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3346 
 3347     __ aesenc_loadkeys(key, keylen);
 3348     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3349     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3350 
 3351     // AES/CTR loop
 3352     {
 3353       Label L_CTR_loop;
 3354       __ BIND(L_CTR_loop);
 3355 
 3356       // Setup the counters
 3357       __ movi(v8, __ T4S, 0);
 3358       __ movi(v9, __ T4S, 1);
 3359       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3360 
      assert(v0->encoding() < v8->encoding(), "counter blocks must occupy registers below v8");
 3362       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3363         FloatRegister f = as_FloatRegister(i);
 3364         __ rev32(f, __ T16B, v16);
 3365         __ addv(v16, __ T4S, v16, v8);
 3366       }
 3367 
 3368       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3369 
 3370       // Encrypt the counters
 3371       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3372 
 3373       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3374 
 3375       // XOR the encrypted counters with the inputs
 3376       for (int i = 0; i < 8; i++) {
 3377         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3378         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3379         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3380       }
 3381       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3382       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3383 
 3384       __ subw(len, len, 16 * 8);
 3385       __ cbnzw(len, L_CTR_loop);
 3386     }
 3387 
 3388     __ rev32(v16, __ T16B, v16);
 3389     __ st1(v16, __ T16B, counter);
 3390 
 3391     __ ldr(len, Address(sp));
 3392     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3393 
 3394     // GHASH/CTR loop
 3395     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3396                                 len, /*unrolls*/4);
 3397 
 3398 #ifdef ASSERT
 3399     { Label L;
 3400       __ cmp(len, (unsigned char)0);
 3401       __ br(Assembler::EQ, L);
 3402       __ stop("stubGenerator: abort");
 3403       __ bind(L);
    }
 3405 #endif
 3406 
    __ bind(DONE);
 3408     // Return the number of bytes processed
 3409     __ ldr(r0, __ post(sp, 2 * wordSize));
 3410 
 3411     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3412     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3413 
 3414     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3415     __ ret(lr);
    return start;
 3417   }
 3418 
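  // Caches one 64-byte input block in 8 general-purpose registers (loaded
  // with four ldp instructions); extract_u32 then pulls out the i-th
  // little-endian 32-bit word with a single ubfx, keeping the MD5 round
  // helpers below free of memory accesses.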
 3419   class Cached64Bytes {
 3420   private:
 3421     MacroAssembler *_masm;
 3422     Register _regs[8];
 3423 
 3424   public:
 3425     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "need 8 registers to cache 16 4-byte words, have %u", rs.size());
 3427       auto it = rs.begin();
 3428       for (auto &r: _regs) {
 3429         r = *it;
 3430         ++it;
 3431       }
 3432     }
 3433 
 3434     void gen_loads(Register base) {
 3435       for (int i = 0; i < 8; i += 2) {
 3436         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3437       }
 3438     }
 3439 
 3440     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3441     void extract_u32(Register dest, int i) {
 3442       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3443     }
 3444   };
 3445 
 3446   // Utility routines for md5.
 3447   // Clobbers r10 and r11.
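  //
  // Each helper performs one MD5 operation,
  //
  //   r1 = r2 + rol32(r1 + f(r2, r3, r4) + x[k] + t, s)
  //
  // where f is, respectively:
  //   FF: F(b,c,d) = (b & c) | (~b & d)    (computed here as ((c ^ d) & b) ^ d)
  //   GG: G(b,c,d) = (b & d) | (c & ~d)    (the two terms are disjoint, so they
  //                                         are simply added)
  //   HH: H(b,c,d) = b ^ c ^ d
  //   II: I(b,c,d) = c ^ (b | ~d)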
 3448   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3449               int k, int s, int t) {
 3450     Register rscratch3 = r10;
 3451     Register rscratch4 = r11;
 3452 
 3453     __ eorw(rscratch3, r3, r4);
 3454     __ movw(rscratch2, t);
 3455     __ andw(rscratch3, rscratch3, r2);
 3456     __ addw(rscratch4, r1, rscratch2);
 3457     reg_cache.extract_u32(rscratch1, k);
 3458     __ eorw(rscratch3, rscratch3, r4);
 3459     __ addw(rscratch4, rscratch4, rscratch1);
 3460     __ addw(rscratch3, rscratch3, rscratch4);
 3461     __ rorw(rscratch2, rscratch3, 32 - s);
 3462     __ addw(r1, rscratch2, r2);
 3463   }
 3464 
 3465   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3466               int k, int s, int t) {
 3467     Register rscratch3 = r10;
 3468     Register rscratch4 = r11;
 3469 
 3470     reg_cache.extract_u32(rscratch1, k);
 3471     __ movw(rscratch2, t);
 3472     __ addw(rscratch4, r1, rscratch2);
 3473     __ addw(rscratch4, rscratch4, rscratch1);
 3474     __ bicw(rscratch2, r3, r4);
 3475     __ andw(rscratch3, r2, r4);
 3476     __ addw(rscratch2, rscratch2, rscratch4);
 3477     __ addw(rscratch2, rscratch2, rscratch3);
 3478     __ rorw(rscratch2, rscratch2, 32 - s);
 3479     __ addw(r1, rscratch2, r2);
 3480   }
 3481 
 3482   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3483               int k, int s, int t) {
 3484     Register rscratch3 = r10;
 3485     Register rscratch4 = r11;
 3486 
 3487     __ eorw(rscratch3, r3, r4);
 3488     __ movw(rscratch2, t);
 3489     __ addw(rscratch4, r1, rscratch2);
 3490     reg_cache.extract_u32(rscratch1, k);
 3491     __ eorw(rscratch3, rscratch3, r2);
 3492     __ addw(rscratch4, rscratch4, rscratch1);
 3493     __ addw(rscratch3, rscratch3, rscratch4);
 3494     __ rorw(rscratch2, rscratch3, 32 - s);
 3495     __ addw(r1, rscratch2, r2);
 3496   }
 3497 
 3498   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3499               int k, int s, int t) {
 3500     Register rscratch3 = r10;
 3501     Register rscratch4 = r11;
 3502 
 3503     __ movw(rscratch3, t);
 3504     __ ornw(rscratch2, r2, r4);
 3505     __ addw(rscratch4, r1, rscratch3);
 3506     reg_cache.extract_u32(rscratch1, k);
 3507     __ eorw(rscratch3, rscratch2, r3);
 3508     __ addw(rscratch4, rscratch4, rscratch1);
 3509     __ addw(rscratch3, rscratch3, rscratch4);
 3510     __ rorw(rscratch2, rscratch3, 32 - s);
 3511     __ addw(r1, rscratch2, r2);
 3512   }
 3513 
 3514   // Arguments:
 3515   //
 3516   // Inputs:
 3517   //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   MD5.state
 3519   //   c_rarg2   - int     offset
 3520   //   c_rarg3   - int     limit
 3521   //
 3522   address generate_md5_implCompress(StubGenStubId stub_id) {
 3523     bool multi_block;
 3524     switch (stub_id) {
 3525     case md5_implCompress_id:
 3526       multi_block = false;
 3527       break;
 3528     case md5_implCompressMB_id:
 3529       multi_block = true;
 3530       break;
 3531     default:
 3532       ShouldNotReachHere();
 3533     }
 3534     __ align(CodeEntryAlignment);
 3535 
 3536     StubCodeMark mark(this, stub_id);
 3537     address start = __ pc();
 3538 
 3539     Register buf       = c_rarg0;
 3540     Register state     = c_rarg1;
 3541     Register ofs       = c_rarg2;
 3542     Register limit     = c_rarg3;
 3543     Register a         = r4;
 3544     Register b         = r5;
 3545     Register c         = r6;
 3546     Register d         = r7;
 3547     Register rscratch3 = r10;
 3548     Register rscratch4 = r11;
 3549 
 3550     Register state_regs[2] = { r12, r13 };
 3551     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3552     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3553 
 3554     __ push(saved_regs, sp);
 3555 
 3556     __ ldp(state_regs[0], state_regs[1], Address(state));
 3557     __ ubfx(a, state_regs[0],  0, 32);
 3558     __ ubfx(b, state_regs[0], 32, 32);
 3559     __ ubfx(c, state_regs[1],  0, 32);
 3560     __ ubfx(d, state_regs[1], 32, 32);
 3561 
 3562     Label md5_loop;
 3563     __ BIND(md5_loop);
 3564 
 3565     reg_cache.gen_loads(buf);
 3566 
 3567     // Round 1
 3568     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3569     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3570     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3571     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3572     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3573     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3574     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3575     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3576     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3577     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3578     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3579     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3580     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3581     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3582     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3583     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3584 
 3585     // Round 2
 3586     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3587     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3588     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3589     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3590     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3591     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3592     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3593     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3594     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3595     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3596     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3597     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3598     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3599     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3600     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3601     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3602 
 3603     // Round 3
 3604     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3605     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3606     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3607     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3608     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3609     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3610     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3611     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3612     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3613     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3614     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3615     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3616     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3617     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3618     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3619     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3620 
 3621     // Round 4
 3622     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3623     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3624     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3625     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3626     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3627     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3628     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3629     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3630     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3631     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3632     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3633     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3634     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3635     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3636     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3637     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3638 
 3639     __ addw(a, state_regs[0], a);
 3640     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3641     __ addw(b, rscratch2, b);
 3642     __ addw(c, state_regs[1], c);
 3643     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3644     __ addw(d, rscratch4, d);
 3645 
 3646     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3647     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3648 
 3649     if (multi_block) {
 3650       __ add(buf, buf, 64);
 3651       __ add(ofs, ofs, 64);
 3652       __ cmp(ofs, limit);
 3653       __ br(Assembler::LE, md5_loop);
 3654       __ mov(c_rarg0, ofs); // return ofs
 3655     }
 3656 
 3657     // write hash values back in the correct order
 3658     __ stp(state_regs[0], state_regs[1], Address(state));
 3659 
 3660     __ pop(saved_regs, sp);
 3661 
 3662     __ ret(lr);
 3663 
 3664     return start;
 3665   }
 3666 
 3667   // Arguments:
 3668   //
 3669   // Inputs:
 3670   //   c_rarg0   - byte[]  source+offset
 3671   //   c_rarg1   - int[]   SHA.state
 3672   //   c_rarg2   - int     offset
 3673   //   c_rarg3   - int     limit
 3674   //
 3675   address generate_sha1_implCompress(StubGenStubId stub_id) {
 3676     bool multi_block;
 3677     switch (stub_id) {
 3678     case sha1_implCompress_id:
 3679       multi_block = false;
 3680       break;
 3681     case sha1_implCompressMB_id:
 3682       multi_block = true;
 3683       break;
 3684     default:
 3685       ShouldNotReachHere();
 3686     }
 3687 
 3688     __ align(CodeEntryAlignment);
 3689 
 3690     StubCodeMark mark(this, stub_id);
 3691     address start = __ pc();
 3692 
 3693     Register buf   = c_rarg0;
 3694     Register state = c_rarg1;
 3695     Register ofs   = c_rarg2;
 3696     Register limit = c_rarg3;
 3697 
 3698     Label keys;
 3699     Label sha1_loop;
 3700 
    // load the four round constants into v0..v3, one per register, replicated across all lanes
 3702     __ adr(rscratch1, keys);
 3703     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 160-bit state (5 words): 4 words into v6, the fifth into v7
 3705     __ ldrq(v6, Address(state, 0));
 3706     __ ldrs(v7, Address(state, 16));
 3707 
 3708 
 3709     __ BIND(sha1_loop);
 3710     // load 64 bytes of data into v16..v19
 3711     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3712     __ rev32(v16, __ T16B, v16);
 3713     __ rev32(v17, __ T16B, v17);
 3714     __ rev32(v18, __ T16B, v18);
 3715     __ rev32(v19, __ T16B, v19);
 3716 
 3717     // do the sha1
 3718     __ addv(v4, __ T4S, v16, v0);
 3719     __ orr(v20, __ T16B, v6, v6);
 3720 
 3721     FloatRegister d0 = v16;
 3722     FloatRegister d1 = v17;
 3723     FloatRegister d2 = v18;
 3724     FloatRegister d3 = v19;
 3725 
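    // 20 iterations of four SHA-1 rounds each cover all 80 rounds; sha1c,
    // sha1p and sha1m supply the Ch, Parity and Maj round functions for
    // rounds 0-19, 20-39/60-79 and 40-59 respectively.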
 3726     for (int round = 0; round < 20; round++) {
 3727       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3728       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3729       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3730       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3731       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3732 
 3733       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3734       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3735       __ sha1h(tmp2, __ T4S, v20);
 3736       if (round < 5)
 3737         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3738       else if (round < 10 || round >= 15)
 3739         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3740       else
 3741         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3742       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3743 
 3744       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3745     }
 3746 
 3747     __ addv(v7, __ T2S, v7, v21);
 3748     __ addv(v6, __ T4S, v6, v20);
 3749 
 3750     if (multi_block) {
 3751       __ add(ofs, ofs, 64);
 3752       __ cmp(ofs, limit);
 3753       __ br(Assembler::LE, sha1_loop);
 3754       __ mov(c_rarg0, ofs); // return ofs
 3755     }
 3756 
 3757     __ strq(v6, Address(state, 0));
 3758     __ strs(v7, Address(state, 16));
 3759 
 3760     __ ret(lr);
 3761 
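    // The four SHA-1 round constants K0..K3, splatted across v0..v3 by the
    // ld4r above.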
 3762     __ bind(keys);
 3763     __ emit_int32(0x5a827999);
 3764     __ emit_int32(0x6ed9eba1);
 3765     __ emit_int32(0x8f1bbcdc);
 3766     __ emit_int32(0xca62c1d6);
 3767 
 3768     return start;
 3769   }
 3770 
 3771 
 3772   // Arguments:
 3773   //
 3774   // Inputs:
 3775   //   c_rarg0   - byte[]  source+offset
 3776   //   c_rarg1   - int[]   SHA.state
 3777   //   c_rarg2   - int     offset
 3778   //   c_rarg3   - int     limit
 3779   //
 3780   address generate_sha256_implCompress(StubGenStubId stub_id) {
 3781     bool multi_block;
 3782     switch (stub_id) {
 3783     case sha256_implCompress_id:
 3784       multi_block = false;
 3785       break;
 3786     case sha256_implCompressMB_id:
 3787       multi_block = true;
 3788       break;
 3789     default:
 3790       ShouldNotReachHere();
 3791     }
 3792 
 3793     static const uint32_t round_consts[64] = {
 3794       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3795       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3796       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3797       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3798       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3799       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3800       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3801       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3802       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3803       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3804       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3805       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3806       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3807       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3808       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3809       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3810     };
 3811 
 3812     __ align(CodeEntryAlignment);
 3813 
 3814     StubCodeMark mark(this, stub_id);
 3815     address start = __ pc();
 3816 
 3817     Register buf   = c_rarg0;
 3818     Register state = c_rarg1;
 3819     Register ofs   = c_rarg2;
 3820     Register limit = c_rarg3;
 3821 
    Label sha256_loop;
 3823 
 3824     __ stpd(v8, v9, __ pre(sp, -32));
 3825     __ stpd(v10, v11, Address(sp, 16));
 3826 
 3827 // dga == v0
 3828 // dgb == v1
 3829 // dg0 == v2
 3830 // dg1 == v3
 3831 // dg2 == v4
 3832 // t0 == v6
 3833 // t1 == v7
 3834 
 3835     // load 16 keys to v16..v31
 3836     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3837     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3838     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3839     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3840     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3841 
 3842     // load 8 words (256 bits) state
 3843     __ ldpq(v0, v1, state);
 3844 
    __ BIND(sha256_loop);
 3846     // load 64 bytes of data into v8..v11
 3847     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3848     __ rev32(v8, __ T16B, v8);
 3849     __ rev32(v9, __ T16B, v9);
 3850     __ rev32(v10, __ T16B, v10);
 3851     __ rev32(v11, __ T16B, v11);
 3852 
 3853     __ addv(v6, __ T4S, v8, v16);
 3854     __ orr(v2, __ T16B, v0, v0);
 3855     __ orr(v3, __ T16B, v1, v1);
 3856 
 3857     FloatRegister d0 = v8;
 3858     FloatRegister d1 = v9;
 3859     FloatRegister d2 = v10;
 3860     FloatRegister d3 = v11;
 3861 
 3862 
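    // 16 iterations of four SHA-256 rounds each cover all 64 rounds; the
    // round constants were preloaded into v16..v31, four per register.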
 3863     for (int round = 0; round < 16; round++) {
 3864       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 3865       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 3866       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 3867       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 3868 
 3869       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 3870        __ orr(v4, __ T16B, v2, v2);
 3871       if (round < 15)
 3872         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 3873       __ sha256h(v2, __ T4S, v3, tmp2);
 3874       __ sha256h2(v3, __ T4S, v4, tmp2);
 3875       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 3876 
 3877       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3878     }
 3879 
 3880     __ addv(v0, __ T4S, v0, v2);
 3881     __ addv(v1, __ T4S, v1, v3);
 3882 
 3883     if (multi_block) {
 3884       __ add(ofs, ofs, 64);
 3885       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
 3887       __ mov(c_rarg0, ofs); // return ofs
 3888     }
 3889 
 3890     __ ldpd(v10, v11, Address(sp, 16));
 3891     __ ldpd(v8, v9, __ post(sp, 32));
 3892 
 3893     __ stpq(v0, v1, state);
 3894 
 3895     __ ret(lr);
 3896 
 3897     return start;
 3898   }
 3899 
  // One "double round" of SHA-512: two compression rounds via the sha512h and
  // sha512h2 instructions, plus (for the first 32 double rounds) one step of
  // the message schedule via sha512su0/sha512su1. vrc0/vrc1 hold round-constant
  // pairs; vin0..vin4 hold message-schedule words.
 3901   void sha512_dround(int dr,
 3902                      FloatRegister vi0, FloatRegister vi1,
 3903                      FloatRegister vi2, FloatRegister vi3,
 3904                      FloatRegister vi4, FloatRegister vrc0,
 3905                      FloatRegister vrc1, FloatRegister vin0,
 3906                      FloatRegister vin1, FloatRegister vin2,
 3907                      FloatRegister vin3, FloatRegister vin4) {
 3908       if (dr < 36) {
 3909         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 3910       }
 3911       __ addv(v5, __ T2D, vrc0, vin0);
 3912       __ ext(v6, __ T16B, vi2, vi3, 8);
 3913       __ ext(v5, __ T16B, v5, v5, 8);
 3914       __ ext(v7, __ T16B, vi1, vi2, 8);
 3915       __ addv(vi3, __ T2D, vi3, v5);
 3916       if (dr < 32) {
 3917         __ ext(v5, __ T16B, vin3, vin4, 8);
 3918         __ sha512su0(vin0, __ T2D, vin1);
 3919       }
 3920       __ sha512h(vi3, __ T2D, v6, v7);
 3921       if (dr < 32) {
 3922         __ sha512su1(vin0, __ T2D, vin2, v5);
 3923       }
 3924       __ addv(vi4, __ T2D, vi1, vi3);
 3925       __ sha512h2(vi3, __ T2D, vi1, vi0);
 3926   }
 3927 
 3928   // Arguments:
 3929   //
 3930   // Inputs:
 3931   //   c_rarg0   - byte[]  source+offset
 3932   //   c_rarg1   - int[]   SHA.state
 3933   //   c_rarg2   - int     offset
 3934   //   c_rarg3   - int     limit
 3935   //
 3936   address generate_sha512_implCompress(StubGenStubId stub_id) {
 3937     bool multi_block;
 3938     switch (stub_id) {
 3939     case sha512_implCompress_id:
 3940       multi_block = false;
 3941       break;
 3942     case sha512_implCompressMB_id:
 3943       multi_block = true;
 3944       break;
 3945     default:
 3946       ShouldNotReachHere();
 3947     }
 3948 
 3949     static const uint64_t round_consts[80] = {
 3950       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 3951       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 3952       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 3953       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 3954       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 3955       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 3956       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 3957       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 3958       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 3959       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 3960       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 3961       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 3962       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 3963       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 3964       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 3965       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 3966       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 3967       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 3968       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 3969       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 3970       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 3971       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 3972       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 3973       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 3974       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 3975       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 3976       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 3977     };
 3978 
 3979     __ align(CodeEntryAlignment);
 3980 
 3981     StubCodeMark mark(this, stub_id);
 3982     address start = __ pc();
 3983 
 3984     Register buf   = c_rarg0;
 3985     Register state = c_rarg1;
 3986     Register ofs   = c_rarg2;
 3987     Register limit = c_rarg3;
 3988 
 3989     __ stpd(v8, v9, __ pre(sp, -64));
 3990     __ stpd(v10, v11, Address(sp, 16));
 3991     __ stpd(v12, v13, Address(sp, 32));
 3992     __ stpd(v14, v15, Address(sp, 48));
 3993 
 3994     Label sha512_loop;
 3995 
 3996     // load state
 3997     __ ld1(v8, v9, v10, v11, __ T2D, state);
 3998 
 3999     // load first 4 round constants
 4000     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4001     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4002 
 4003     __ BIND(sha512_loop);
 4004     // load 128B of data into v12..v19
 4005     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4006     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4007     __ rev64(v12, __ T16B, v12);
 4008     __ rev64(v13, __ T16B, v13);
 4009     __ rev64(v14, __ T16B, v14);
 4010     __ rev64(v15, __ T16B, v15);
 4011     __ rev64(v16, __ T16B, v16);
 4012     __ rev64(v17, __ T16B, v17);
 4013     __ rev64(v18, __ T16B, v18);
 4014     __ rev64(v19, __ T16B, v19);
 4015 
 4016     __ mov(rscratch2, rscratch1);
 4017 
 4018     __ mov(v0, __ T16B, v8);
 4019     __ mov(v1, __ T16B, v9);
 4020     __ mov(v2, __ T16B, v10);
 4021     __ mov(v3, __ T16B, v11);
 4022 
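    // 40 double rounds = 80 SHA-512 rounds. The first 32 also advance the
    // message schedule (sha512su0/su1), and double rounds 0-35 prefetch the
    // next round-constant pair into vrc1.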
 4023     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4024     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4025     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4026     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4027     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4028     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4029     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4030     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4031     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4032     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4033     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4034     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4035     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4036     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4037     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4038     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4039     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4040     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4041     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4042     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4043     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4044     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4045     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4046     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4047     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4048     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4049     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4050     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4051     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4052     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4053     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4054     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4055     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4056     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4057     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4058     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4059     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4060     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4061     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4062     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4063 
 4064     __ addv(v8, __ T2D, v8, v0);
 4065     __ addv(v9, __ T2D, v9, v1);
 4066     __ addv(v10, __ T2D, v10, v2);
 4067     __ addv(v11, __ T2D, v11, v3);
 4068 
 4069     if (multi_block) {
 4070       __ add(ofs, ofs, 128);
 4071       __ cmp(ofs, limit);
 4072       __ br(Assembler::LE, sha512_loop);
 4073       __ mov(c_rarg0, ofs); // return ofs
 4074     }
 4075 
 4076     __ st1(v8, v9, v10, v11, __ T2D, state);
 4077 
 4078     __ ldpd(v14, v15, Address(sp, 48));
 4079     __ ldpd(v12, v13, Address(sp, 32));
 4080     __ ldpd(v10, v11, Address(sp, 16));
 4081     __ ldpd(v8, v9, __ post(sp, 64));
 4082 
 4083     __ ret(lr);
 4084 
 4085     return start;
 4086   }
 4087 
  // Execute one round of Keccak for two computations in parallel.
  // One of the states should be loaded into the lower halves of
  // the vector registers v0-v24, the other should be loaded into
  // the upper halves of those registers. The ld1r instruction loads
  // the round constant into both halves of register v31.
  // Intermediate results c0...c4 and d0...d4 are computed
  // in registers v25...v30.
  // All vector instructions that are used operate on both register
  // halves in parallel.
  // If only a single computation is needed, it suffices to load just the
  // lower halves.
 4098   void keccak_round(Register rscratch1) {
 4099   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4102   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4103   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4104   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4105   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4106   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4107   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4108   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4109 
 4110   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4111   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4112   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4113   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4114   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4115 
 4116   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4117   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4119   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4120   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4121   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4122   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4123   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4124   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4125   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4126   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4127   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4128   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4129   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4130   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4131   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4132   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4133   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4134   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4135   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4136   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4137   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4138   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4139   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4140   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4141 
 4142   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4143   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4144   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4145   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4146   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4147 
 4148   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4149 
 4150   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4151   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4152   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4153   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4154   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4155 
 4156   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4157   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4158   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4159   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4160   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4161 
 4162   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4163   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4164   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4165   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4166   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4167 
 4168   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4169   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4170   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4171   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4172   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4173 
 4174   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4175   }
 4176 
 4177   // Arguments:
 4178   //
 4179   // Inputs:
 4180   //   c_rarg0   - byte[]  source+offset
 4181   //   c_rarg1   - byte[]  SHA.state
 4182   //   c_rarg2   - int     block_size
 4183   //   c_rarg3   - int     offset
 4184   //   c_rarg4   - int     limit
 4185   //
 4186   address generate_sha3_implCompress(StubGenStubId stub_id) {
 4187     bool multi_block;
 4188     switch (stub_id) {
 4189     case sha3_implCompress_id:
 4190       multi_block = false;
 4191       break;
 4192     case sha3_implCompressMB_id:
 4193       multi_block = true;
 4194       break;
 4195     default:
 4196       ShouldNotReachHere();
 4197     }
 4198 
 4199     static const uint64_t round_consts[24] = {
 4200       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4201       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4202       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4203       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4204       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4205       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4206       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4207       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4208     };
 4209 
 4210     __ align(CodeEntryAlignment);
 4211 
 4212     StubCodeMark mark(this, stub_id);
 4213     address start = __ pc();
 4214 
 4215     Register buf           = c_rarg0;
 4216     Register state         = c_rarg1;
 4217     Register block_size    = c_rarg2;
 4218     Register ofs           = c_rarg3;
 4219     Register limit         = c_rarg4;
 4220 
 4221     Label sha3_loop, rounds24_loop;
 4222     Label sha3_512_or_sha3_384, shake128;
 4223 
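          // save callee-saved registers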
 4224     __ stpd(v8, v9, __ pre(sp, -64));
 4225     __ stpd(v10, v11, Address(sp, 16));
 4226     __ stpd(v12, v13, Address(sp, 32));
 4227     __ stpd(v14, v15, Address(sp, 48));
 4228 
 4229     // load state
 4230     __ add(rscratch1, state, 32);
 4231     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4232     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4233     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4234     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4235     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4236     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4237     __ ld1(v24, __ T1D, rscratch1);
 4238 
 4239     __ BIND(sha3_loop);
 4240 
 4241     // 24 keccak rounds
 4242     __ movw(rscratch2, 24);
 4243 
 4244     // load round_constants base
 4245     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4246 
 4247     // load input
 4248     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4249     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4250     __ eor(v0, __ T8B, v0, v25);
 4251     __ eor(v1, __ T8B, v1, v26);
 4252     __ eor(v2, __ T8B, v2, v27);
 4253     __ eor(v3, __ T8B, v3, v28);
 4254     __ eor(v4, __ T8B, v4, v29);
 4255     __ eor(v5, __ T8B, v5, v30);
 4256     __ eor(v6, __ T8B, v6, v31);
 4257 
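          // The first 56 bytes of the block have been absorbed above. How the
          // rest is absorbed depends on the rate: block_size is 72 (SHA3-512),
          // 104 (SHA3-384), 136 (SHA3-256/SHAKE256), 144 (SHA3-224) or
          // 168 (SHAKE128). The tests below on bits 7, 5 and 4 of block_size
          // are enough to tell these five values apart.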
 4258     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4259     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4260 
 4261     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4262     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4263     __ eor(v7, __ T8B, v7, v25);
 4264     __ eor(v8, __ T8B, v8, v26);
 4265     __ eor(v9, __ T8B, v9, v27);
 4266     __ eor(v10, __ T8B, v10, v28);
 4267     __ eor(v11, __ T8B, v11, v29);
 4268     __ eor(v12, __ T8B, v12, v30);
 4269     __ eor(v13, __ T8B, v13, v31);
 4270 
 4271     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4272     __ eor(v14, __ T8B, v14, v25);
 4273     __ eor(v15, __ T8B, v15, v26);
 4274     __ eor(v16, __ T8B, v16, v27);
 4275 
 4276     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4277     __ andw(c_rarg5, block_size, 48);
 4278     __ cbzw(c_rarg5, rounds24_loop);
 4279 
 4280     __ tbnz(block_size, 5, shake128);
 4281     // block_size == 144, bit5 == 0, SHA3-224
 4282     __ ldrd(v28, __ post(buf, 8));
 4283     __ eor(v17, __ T8B, v17, v28);
 4284     __ b(rounds24_loop);
 4285 
 4286     __ BIND(shake128);
 4287     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4288     __ eor(v17, __ T8B, v17, v28);
 4289     __ eor(v18, __ T8B, v18, v29);
 4290     __ eor(v19, __ T8B, v19, v30);
 4291     __ eor(v20, __ T8B, v20, v31);
 4292     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4293 
 4294     __ BIND(sha3_512_or_sha3_384);
 4295     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4296     __ eor(v7, __ T8B, v7, v25);
 4297     __ eor(v8, __ T8B, v8, v26);
 4298     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4299 
 4300     // SHA3-384
 4301     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4302     __ eor(v9,  __ T8B, v9,  v27);
 4303     __ eor(v10, __ T8B, v10, v28);
 4304     __ eor(v11, __ T8B, v11, v29);
 4305     __ eor(v12, __ T8B, v12, v30);
 4306 
 4307     __ BIND(rounds24_loop);
 4308     __ subw(rscratch2, rscratch2, 1);
 4309 
 4310     keccak_round(rscratch1);
 4311 
 4312     __ cbnzw(rscratch2, rounds24_loop);
 4313 
 4314     if (multi_block) {
 4315       __ add(ofs, ofs, block_size);
 4316       __ cmp(ofs, limit);
 4317       __ br(Assembler::LE, sha3_loop);
 4318       __ mov(c_rarg0, ofs); // return ofs
 4319     }
 4320 
 4321     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4322     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4323     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4324     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4325     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4326     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4327     __ st1(v24, __ T1D, state);
 4328 
 4329     // restore callee-saved registers
 4330     __ ldpd(v14, v15, Address(sp, 48));
 4331     __ ldpd(v12, v13, Address(sp, 32));
 4332     __ ldpd(v10, v11, Address(sp, 16));
 4333     __ ldpd(v8, v9, __ post(sp, 64));
 4334 
 4335     __ ret(lr);
 4336 
 4337     return start;
 4338   }
 4339 
 4340   // Inputs:
 4341   //   c_rarg0   - long[]  state0
 4342   //   c_rarg1   - long[]  state1
 4343   address generate_double_keccak() {
 4344     static const uint64_t round_consts[24] = {
 4345       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4346       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4347       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4348       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4349       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4350       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4351       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4352       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4353     };
 4354 
 4355     // Implements the double_keccak() method of the
 4356     // sun.security.provider.SHA3Parallel class
 4357     __ align(CodeEntryAlignment);
 4358     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4359     address start = __ pc();
 4360     __ enter();
 4361 
 4362     Register state0        = c_rarg0;
 4363     Register state1        = c_rarg1;
 4364 
 4365     Label rounds24_loop;
 4366 
 4367     // save callee-saved registers
 4368     __ stpd(v8, v9, __ pre(sp, -64));
 4369     __ stpd(v10, v11, Address(sp, 16));
 4370     __ stpd(v12, v13, Address(sp, 32));
 4371     __ stpd(v14, v15, Address(sp, 48));
 4372 
 4373     // load states
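          // Each ld4 below loads four consecutive 64-bit words from memory
          // into the selected doubleword lane of four successive vector
          // registers, so the 25 lanes of state0 end up in the lower halves
          // of v0-v24 and those of state1 in the upper halves, as required
          // by keccak_round.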
 4374     __ add(rscratch1, state0, 32);
 4375     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4376     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4377     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4378     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4379     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4380     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4381     __ ld1(v24, __ D, 0, rscratch1);
 4382     __ add(rscratch1, state1, 32);
 4383     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4384     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4385     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4386     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4387     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4388     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4389     __ ld1(v24, __ D, 1, rscratch1);
 4390 
 4391     // 24 keccak rounds
 4392     __ movw(rscratch2, 24);
 4393 
 4394     // load round_constants base
 4395     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4396 
 4397     __ BIND(rounds24_loop);
 4398     __ subw(rscratch2, rscratch2, 1);
 4399     keccak_round(rscratch1);
 4400     __ cbnzw(rscratch2, rounds24_loop);
 4401 
 4402     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4403     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4404     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4405     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4406     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4407     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4408     __ st1(v24, __ D, 0, state0);
 4409     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4410     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4411     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4412     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4413     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4414     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4415     __ st1(v24, __ D, 1, state1);
 4416 
 4417     // restore callee-saved vector registers
 4418     __ ldpd(v14, v15, Address(sp, 48));
 4419     __ ldpd(v12, v13, Address(sp, 32));
 4420     __ ldpd(v10, v11, Address(sp, 16));
 4421     __ ldpd(v8, v9, __ post(sp, 64));
 4422 
 4423     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4424     __ mov(r0, zr); // return 0
 4425     __ ret(lr);
 4426 
 4427     return start;
 4428   }
 4429 
 4430   // ChaCha20 block function.  This version parallelizes the 32-bit
 4431   // state elements on each of 16 vectors, producing 4 blocks of
 4432   // keystream at a time.
 4433   //
 4434   // state (int[16]) = c_rarg0
 4435   // keystream (byte[256]) = c_rarg1
 4436   // return - number of bytes of produced keystream (always 256)
 4437   //
 4438   // This implementation takes each 32-bit integer from the state
 4439   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4440   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4441   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4442   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4443   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4444   // operations represented by one QUARTERROUND function, we instead stack all
 4445   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4446   // and then do the same for the second set of 4 quarter rounds.  This removes
 4447   // some latency that would otherwise be incurred by waiting for an add to
 4448   // complete before performing an xor (which depends on the result of the
 4449   // add), etc. An adjustment happens between the first and second groups of 4
 4450   // quarter rounds, but this is done only in the inputs to the macro functions
 4451   // that generate the assembly instructions - these adjustments themselves are
 4452   // not part of the resulting assembly.
 4453   // The 4 registers v0-v3 are used during the quarter round operations as
 4454   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4455   // registers become the vectors involved in adding the start state back onto
 4456   // the post-QR working state.  After the adds are complete, each of the 16
 4457   // vectors write their first lane back to the keystream buffer, followed
 4458   // by the second lane from all vectors and so on.
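        // For reference, a single quarter round QUARTERROUND(a, b, c, d)
        // from RFC 7539 is:
        //   a += b; d ^= a; d <<<= 16;
        //   c += d; b ^= c; b <<<= 12;
        //   a += b; d ^= a; d <<<= 8;
        //   c += d; b ^= c; b <<<= 7;
        // The cc20_qr_add4/xor4/lrot4 calls in the loop below perform each of
        // these steps for four quarter rounds at a time.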
 4459   address generate_chacha20Block_blockpar() {
 4460     Label L_twoRounds, L_cc20_const;
 4461     // The constant data is broken into two 128-bit segments to be loaded
 4462     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4463     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 4464     // The second 128 bits are a table constant used for 8-bit left rotations.
 4465     __ BIND(L_cc20_const);
 4466     __ emit_int64(0x0000000100000000UL);
 4467     __ emit_int64(0x0000000300000002UL);
 4468     __ emit_int64(0x0605040702010003UL);
 4469     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4470 
 4471     __ align(CodeEntryAlignment);
 4472     StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
 4473     StubCodeMark mark(this, stub_id);
 4474     address start = __ pc();
 4475     __ enter();
 4476 
 4477     int i, j;
 4478     const Register state = c_rarg0;
 4479     const Register keystream = c_rarg1;
 4480     const Register loopCtr = r10;
 4481     const Register tmpAddr = r11;
 4482     const FloatRegister ctrAddOverlay = v28;
 4483     const FloatRegister lrot8Tbl = v29;
 4484 
 4485     // Organize SIMD registers in an array that facilitates
 4486     // putting repetitive opcodes into loop structures.  It is
 4487     // important that each grouping of 4 registers is monotonically
 4488     // increasing to support the requirements of multi-register
 4489     // instructions (e.g. ld4r, st4, etc.)
 4490     const FloatRegister workSt[16] = {
 4491          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4492         v20, v21, v22, v23, v24, v25, v26, v27
 4493     };
 4494 
 4495     // Pull in constant data.  The first 16 bytes are the add overlay
 4496     // which is applied to the vector holding the counter (state[12]).
 4497     // The second 16 bytes is the index register for the 8-bit left
 4498     // rotation tbl instruction.
 4499     __ adr(tmpAddr, L_cc20_const);
 4500     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4501 
 4502     // Load from memory and interlace across 16 SIMD registers,
 4503     // with each word from memory being broadcast to all lanes of
 4504     // each successive SIMD register.
 4505     //      Addr(0) -> All lanes in workSt[i]
 4506     //      Addr(4) -> All lanes in workSt[i + 1], etc.
 4507     __ mov(tmpAddr, state);
 4508     for (i = 0; i < 16; i += 4) {
 4509       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4510           __ post(tmpAddr, 16));
 4511     }
 4512     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4513 
 4514     // Before entering the loop, create 5 4-register arrays.  These
 4515     // will hold the 4 registers that represent the a/b/c/d fields
 4516     // in the quarter round operation.  For instance the "b" field
 4517     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4518     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4519     // since it is part of a diagonal organization.  The aSet and scratch
 4520     // register sets are defined at declaration time because they do not change
 4521     // organization at any point during the 20-round processing.
 4522     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4523     FloatRegister bSet[4];
 4524     FloatRegister cSet[4];
 4525     FloatRegister dSet[4];
 4526     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4527 
 4528     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4529     __ mov(loopCtr, 10);
 4530     __ BIND(L_twoRounds);
 4531 
 4532     // Set to columnar organization and do the following 4 quarter-rounds:
 4533     // QUARTERROUND(0, 4, 8, 12)
 4534     // QUARTERROUND(1, 5, 9, 13)
 4535     // QUARTERROUND(2, 6, 10, 14)
 4536     // QUARTERROUND(3, 7, 11, 15)
 4537     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4538     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4539     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4540 
 4541     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4542     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4543     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4544 
 4545     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4546     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4547     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4548 
 4549     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4550     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4551     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4552 
 4553     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4554     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4555     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4556 
 4557     // Set to diagonal organization and do the next 4 quarter-rounds:
 4558     // QUARTERROUND(0, 5, 10, 15)
 4559     // QUARTERROUND(1, 6, 11, 12)
 4560     // QUARTERROUND(2, 7, 8, 13)
 4561     // QUARTERROUND(3, 4, 9, 14)
 4562     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4563     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4564     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4565 
 4566     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4567     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4568     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4569 
 4570     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4571     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4572     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4573 
 4574     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4575     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4576     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4577 
 4578     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4579     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4580     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4581 
 4582     // Decrement and iterate
 4583     __ sub(loopCtr, loopCtr, 1);
 4584     __ cbnz(loopCtr, L_twoRounds);
 4585 
 4586     __ mov(tmpAddr, state);
 4587 
 4588     // Add the starting state back to the post-loop keystream
 4589     // state.  We read/interlace the state array from memory into
 4590     // 4 registers similar to what we did in the beginning.  Then
 4591     // add the counter overlay onto workSt[12] at the end.
 4592     for (i = 0; i < 16; i += 4) {
 4593       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4594       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4595       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4596       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4597       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4598     }
 4599     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4600 
 4601     // Write working state into the keystream buffer.  This is accomplished
 4602     // by taking the lane "i" from each of the four vectors and writing
 4603     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4604     // repeating with the next 4 vectors until all 16 vectors have been used.
 4605     // Then move to the next lane and repeat the process until all lanes have
 4606     // been written.
 4607     for (i = 0; i < 4; i++) {
 4608       for (j = 0; j < 16; j += 4) {
 4609         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4610             __ post(keystream, 16));
 4611       }
 4612     }
 4613 
 4614     __ mov(r0, 256);             // Return length of output keystream
 4615     __ leave();
 4616     __ ret(lr);
 4617 
 4618     return start;
 4619   }
 4620 
 4621   // Helpers to schedule parallel operation bundles across vector
 4622   // register sequences of size 2, 4 or 8.
 4623 
 4624   // Implement various primitive computations across vector sequences
 4625 
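        // As used below, a VSeq<N> names a sequence of N vector registers,
        // e.g. VSeq<8> vs1(0) denotes v0..v7 and vs1[i] is the i-th of them.
        // Each helper simply emits the same instruction once per element, so,
        // for example, vs_addv(va, T, vb, vc) expands to
        //   addv(va[0], T, vb[0], vc[0]); ...; addv(va[N-1], T, vb[N-1], vc[N-1]);
        // scheduling N independent copies of the operation across the
        // sequences.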
 4626   template<int N>
 4627   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4628                const VSeq<N>& v1, const VSeq<N>& v2) {
 4629     // output must not be constant
 4630     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4631     // output cannot overwrite pending inputs
 4632     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4633     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4634     for (int i = 0; i < N; i++) {
 4635       __ addv(v[i], T, v1[i], v2[i]);
 4636     }
 4637   }
 4638 
 4639   template<int N>
 4640   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4641                const VSeq<N>& v1, const VSeq<N>& v2) {
 4642     // output must not be constant
 4643     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4644     // output cannot overwrite pending inputs
 4645     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4646     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4647     for (int i = 0; i < N; i++) {
 4648       __ subv(v[i], T, v1[i], v2[i]);
 4649     }
 4650   }
 4651 
 4652   template<int N>
 4653   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4654                const VSeq<N>& v1, const VSeq<N>& v2) {
 4655     // output must not be constant
 4656     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4657     // output cannot overwrite pending inputs
 4658     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4659     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4660     for (int i = 0; i < N; i++) {
 4661       __ mulv(v[i], T, v1[i], v2[i]);
 4662     }
 4663   }
 4664 
 4665   template<int N>
 4666   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4667     // output must not be constant
 4668     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4669     // output cannot overwrite pending inputs
 4670     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4671     for (int i = 0; i < N; i++) {
 4672       __ negr(v[i], T, v1[i]);
 4673     }
 4674   }
 4675 
 4676   template<int N>
 4677   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4678                const VSeq<N>& v1, int shift) {
 4679     // output must not be constant
 4680     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4681     // output cannot overwrite pending inputs
 4682     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4683     for (int i = 0; i < N; i++) {
 4684       __ sshr(v[i], T, v1[i], shift);
 4685     }
 4686   }
 4687 
 4688   template<int N>
 4689   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4690     // output must not be constant
 4691     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4692     // output cannot overwrite pending inputs
 4693     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4694     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4695     for (int i = 0; i < N; i++) {
 4696       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4697     }
 4698   }
 4699 
 4700   template<int N>
 4701   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4702     // output must not be constant
 4703     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4704     // output cannot overwrite pending inputs
 4705     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4706     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4707     for (int i = 0; i < N; i++) {
 4708       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4709     }
 4710   }
 4711 
 4712   template<int N>
 4713   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4714     // output must not be constant
 4715     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4716     // output cannot overwrite pending inputs
 4717     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4718     for (int i = 0; i < N; i++) {
 4719       __ notr(v[i], __ T16B, v1[i]);
 4720     }
 4721   }
 4722 
 4723   template<int N>
 4724   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4725     // output must not be constant
 4726     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4727     // output cannot overwrite pending inputs
 4728     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4729     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4730     for (int i = 0; i < N; i++) {
 4731       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4732     }
 4733   }
 4734 
 4735   template<int N>
 4736   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4737     // output must not be constant
 4738     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4739     // output cannot overwrite pending inputs
 4740     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4741     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4742     for (int i = 0; i < N; i++) {
 4743       __ mlsv(v[i], T, v1[i], v2[i]);
 4744     }
 4745   }
 4746 
 4747   // load N/2 successive pairs of quadword values from memory in order
 4748   // into N successive vector registers of the sequence via the
 4749   // address supplied in base.
 4750   template<int N>
 4751   void vs_ldpq(const VSeq<N>& v, Register base) {
 4752     for (int i = 0; i < N; i += 2) {
 4753       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4754     }
 4755   }
 4756 
 4757   // load N/2 successive pairs of quadword values from memory in order
 4758   // into N vector registers of the sequence via the address supplied
 4759   // in base using post-increment addressing
 4760   template<int N>
 4761   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4762     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4763     for (int i = 0; i < N; i += 2) {
 4764       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4765     }
 4766   }
 4767 
 4768   // store N successive vector registers of the sequence into N/2
 4769   // successive pairs of quadword memory locations via the address
 4770   // supplied in base using post-increment addressing
 4771   template<int N>
 4772   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4773     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4774     for (int i = 0; i < N; i += 2) {
 4775       __ stpq(v[i], v[i+1], __ post(base, 32));
 4776     }
 4777   }
 4778 
 4779   // load N/2 pairs of quadword values from memory de-interleaved into
 4780   // N vector registers 2 at a time via the address supplied in base
 4781   // using post-increment addressing.
 4782   template<int N>
 4783   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4784     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4785     for (int i = 0; i < N; i += 2) {
 4786       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4787     }
 4788   }
 4789 
 4790   // store N vector registers interleaved into N/2 pairs of quadword
 4791   // memory locations via the address supplied in base using
 4792   // post-increment addressing.
 4793   template<int N>
 4794   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4795     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4796     for (int i = 0; i < N; i += 2) {
 4797       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4798     }
 4799   }
 4800 
 4801   // load N quadword values from memory de-interleaved into N vector
 4802   // registers 3 elements at a time via the address supplied in base.
 4803   template<int N>
 4804   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4805     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4806     for (int i = 0; i < N; i += 3) {
 4807       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4808     }
 4809   }
 4810 
 4811   // load N quadword values from memory de-interleaved into N vector
 4812   // registers 3 elements at a time via the address supplied in base
 4813   // using post-increment addressing.
 4814   template<int N>
 4815   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4816     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4817     for (int i = 0; i < N; i += 3) {
 4818       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4819     }
 4820   }
 4821 
 4822   // load N/2 pairs of quadword values from memory into N vector
 4823   // registers via the address supplied in base with each pair indexed
 4824   // using the start offset plus the corresponding entry in the
 4825   // offsets array
 4826   template<int N>
 4827   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4828     for (int i = 0; i < N/2; i++) {
 4829       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4830     }
 4831   }
 4832 
 4833   // store N vector registers into N/2 pairs of quadword memory
 4834   // locations via the address supplied in base with each pair indexed
 4835   // using the start offset plus the corresponding entry in the
 4836   // offsets array
 4837   template<int N>
 4838   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 4839     for (int i = 0; i < N/2; i++) {
 4840       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4841     }
 4842   }
 4843 
 4844   // load N single quadword values from memory into N vector registers
 4845   // via the address supplied in base with each value indexed using
 4846   // the start offset plus the corresponding entry in the offsets
 4847   // array
 4848   template<int N>
 4849   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4850                       int start, int (&offsets)[N]) {
 4851     for (int i = 0; i < N; i++) {
 4852       __ ldr(v[i], T, Address(base, start + offsets[i]));
 4853     }
 4854   }
 4855 
 4856   // store N vector registers into N single quadword memory locations
 4857   // via the address supplied in base with each value indexed using
 4858   // the start offset plus the corresponding entry in the offsets
 4859   // array
 4860   template<int N>
 4861   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4862                       int start, int (&offsets)[N]) {
 4863     for (int i = 0; i < N; i++) {
 4864       __ str(v[i], T, Address(base, start + offsets[i]));
 4865     }
 4866   }
 4867 
 4868   // load N/2 pairs of quadword values from memory de-interleaved into
 4869   // N vector registers 2 at a time via the address supplied in base
 4870   // with each pair indexed using the start offset plus the
 4871   // corresponding entry in the offsets array
 4872   template<int N>
 4873   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4874                       Register tmp, int start, int (&offsets)[N/2]) {
 4875     for (int i = 0; i < N/2; i++) {
 4876       __ add(tmp, base, start + offsets[i]);
 4877       __ ld2(v[2*i], v[2*i+1], T, tmp);
 4878     }
 4879   }
 4880 
 4881   // store N vector registers 2 at a time interleaved into N/2 pairs
 4882   // of quadword memory locations via the address supplied in base
 4883   // with each pair indexed using the start offset plus the
 4884   // corresponding entry in the offsets array
 4885   template<int N>
 4886   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4887                       Register tmp, int start, int (&offsets)[N/2]) {
 4888     for (int i = 0; i < N/2; i++) {
 4889       __ add(tmp, base, start + offsets[i]);
 4890       __ st2(v[2*i], v[2*i+1], T, tmp);
 4891     }
 4892   }
 4893 
 4894   // Helper routines for various flavours of Montgomery multiply
 4895 
 4896   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 4897   // multiplications in parallel
 4898   //
 4899 
 4900   // See the montMul() method of the sun.security.provider.ML_DSA
 4901   // class.
 4902   //
 4903   // Computes 4x4S results or 4x8H results
 4904   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 4905   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 4906   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 4907   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 4908   // Outputs: va - 4x4S or 4x8H vector register sequences
 4909   // vb, vc, vtmp and vq must all be disjoint
 4910   // va must be disjoint from all other inputs/temps or must equal vc
 4911   // va must have a non-zero delta i.e. it must not be a constant vseq.
 4912   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
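        // In outline, and matching the per-instruction comments below, each
        // lane computes
        //   aHigh = hi(2 * b * c),  aLow = lo(b * c)
        //   m = aLow * MONT_Q_INV_MOD_R   (low half)
        //   n = hi(2 * m * MONT_Q)
        //   a = (aHigh - n) / 2
        // i.e. the usual Montgomery reduction of b * c by R = 2^MONT_R_BITS.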
 4913   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 4914                    Assembler::SIMD_Arrangement T,
 4915                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 4916     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 4917     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 4918     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 4919     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 4920 
 4921     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 4922     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 4923 
 4924     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 4925 
 4926     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 4927     assert(vs_disjoint(va, vb), "va and vb overlap");
 4928     assert(vs_disjoint(va, vq), "va and vq overlap");
 4929     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 4930     assert(!va.is_constant(), "output vector must identify 4 different registers");
 4931 
 4932     // schedule 4 streams of instructions across the vector sequences
 4933     for (int i = 0; i < 4; i++) {
 4934       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 4935       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 4936     }
 4937 
 4938     for (int i = 0; i < 4; i++) {
 4939       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 4940     }
 4941 
 4942     for (int i = 0; i < 4; i++) {
 4943       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 4944     }
 4945 
 4946     for (int i = 0; i < 4; i++) {
 4947       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 4948     }
 4949   }
 4950 
 4951   // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
 4952   // multiplications in parallel
 4953   //
 4954 
 4955   // See the montMul() method of the sun.security.provider.ML_DSA
 4956   // class.
 4957   //
 4958   // Computes 2x4S results or 2x8H results
 4959   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 4960   // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
 4961   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 4962   // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
 4963   // Outputs: va - 2x4S or 2x8H vector register sequences
 4964   // vb, vc, vtmp and vq must all be disjoint
 4965   // va must be disjoint from all other inputs/temps or must equal vc
 4966   // va must have a non-zero delta i.e. it must not be a constant vseq.
 4967   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 4968   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 4969                    Assembler::SIMD_Arrangement T,
 4970                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 4971     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 4972     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 4973     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 4974     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 4975 
 4976     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 4977     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 4978 
 4979     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 4980 
 4981     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 4982     assert(vs_disjoint(va, vb), "va and vb overlap");
 4983     assert(vs_disjoint(va, vq), "va and vq overlap");
 4984     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 4985     assert(!va.is_constant(), "output vector must identify 2 different registers");
 4986 
 4987     // schedule 2 streams of instructions across the vector sequences
 4988     for (int i = 0; i < 2; i++) {
 4989       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 4990       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 4991     }
 4992 
 4993     for (int i = 0; i < 2; i++) {
 4994       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 4995     }
 4996 
 4997     for (int i = 0; i < 2; i++) {
 4998       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 4999     }
 5000 
 5001     for (int i = 0; i < 2; i++) {
 5002       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5003     }
 5004   }
 5005 
 5006   // Perform 16 16-bit Montgomery multiplications in parallel.
 5007   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5008                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5009     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5010     // It will assert that the register use is valid
 5011     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5012   }
 5013 
 5014   // Perform 32 16-bit Montgomery multiplications in parallel.
 5015   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5016                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5017     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5018     // It will assert that the register use is valid
 5019     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5020   }
 5021 
 5022   // Perform 64 16-bit Montgomery multiplications in parallel.
 5023   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5024                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5025     // Schedule two successive 4x8H multiplies via the montmul helper
 5026     // on the front and back halves of va, vb and vc. The helper will
 5027     // assert that the register use has no overlap conflicts on each
 5028     // individual call but we also need to ensure that the necessary
 5029     // disjoint/equality constraints are met across both calls.
 5030 
 5031     // vb, vc, vtmp and vq must be disjoint. va must either be
 5032     // disjoint from all other registers or equal vc
 5033 
 5034     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5035     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5036     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5037 
 5038     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5039     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5040 
 5041     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5042 
 5043     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5044     assert(vs_disjoint(va, vb), "va and vb overlap");
 5045     assert(vs_disjoint(va, vq), "va and vq overlap");
 5046     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5047 
 5048     // we multiply the front and back halves of each sequence 4 at a
 5049     // time because
 5050     //
 5051     // 1) we are currently only able to get 4-way instruction
 5052     // parallelism at best
 5053     //
 5054     // 2) we need registers for the constants in vq and temporary
 5055     // scratch registers to hold intermediate results so vtmp can only
 5056     // be a VSeq<4> which means we only have 4 scratch slots
 5057 
 5058     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5059     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5060   }
 5061 
 5062   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5063                                const VSeq<4>& vc,
 5064                                const VSeq<4>& vtmp,
 5065                                const VSeq<2>& vq) {
 5066     // compute a = montmul(a1, c)
 5067     kyber_montmul32(vc, va1, vc, vtmp, vq);
 5068     // output a1 = a0 - a
 5069     vs_subv(va1, __ T8H, va0, vc);
 5070     //    and a0 = a0 + a
 5071     vs_addv(va0, __ T8H, va0, vc);
 5072   }
 5073 
 5074   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5075                                const VSeq<4>& vb,
 5076                                const VSeq<4>& vtmp1,
 5077                                const VSeq<4>& vtmp2,
 5078                                const VSeq<2>& vq) {
 5079     // compute c = a0 - a1
 5080     vs_subv(vtmp1, __ T8H, va0, va1);
 5081     // output a0 = a0 + a1
 5082     vs_addv(va0, __ T8H, va0, va1);
 5083     // output a1 = b montmul c
 5084     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5085   }
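        // Together these two helpers provide the forward butterfly
        // (Montgomery multiply, then add/subtract) and the inverse butterfly
        // (add/subtract, then Montgomery multiply), each applied to 32
        // coefficient pairs at a time.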
 5086 
 5087   void load64shorts(const VSeq<8>& v, Register shorts) {
 5088     vs_ldpq_post(v, shorts);
 5089   }
 5090 
 5091   void load32shorts(const VSeq<4>& v, Register shorts) {
 5092     vs_ldpq_post(v, shorts);
 5093   }
 5094 
 5095   void store64shorts(const VSeq<8>& v, Register tmpAddr) {
 5096     vs_stpq_post(v, tmpAddr);
 5097   }
 5098 
 5099   // Kyber NTT function.
 5100   // Implements
 5101   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5102   //
 5103   // coeffs (short[256]) = c_rarg0
 5104   // ntt_zetas (short[256]) = c_rarg1
 5105   address generate_kyberNtt() {
 5106 
 5107     __ align(CodeEntryAlignment);
 5108     StubGenStubId stub_id = StubGenStubId::kyberNtt_id;
 5109     StubCodeMark mark(this, stub_id);
 5110     address start = __ pc();
 5111     __ enter();
 5112 
 5113     const Register coeffs = c_rarg0;
 5114     const Register zetas = c_rarg1;
 5115 
 5116     const Register kyberConsts = r10;
 5117     const Register tmpAddr = r11;
 5118 
 5119     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5120     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5121     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5122 
 5123     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5124     // load the montmul constants
 5125     vs_ldpq(vq, kyberConsts);
 5126 
 5127     // Each level corresponds to an iteration of the outermost loop of the
 5128     // Java method seilerNTT(int[] coeffs). There are some differences
 5129     // from what is done in the seilerNTT() method, though:
 5130     // 1. The computation uses 16-bit signed values; we do not convert them
 5131     // to ints here.
 5132     // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
 5133     // this array for each level, which makes it easier to fill up the vector
 5134     // registers.
 5135     // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
 5136     // multiplications (that choice ensures there cannot be any overflow
 5137     // during the inverse NTT computation); here we use R = 2^16 so
 5138     // that we can use the 16-bit arithmetic in the vector unit.
 5139     //
 5140     // On each level, we fill up the vector registers in such a way that the
 5141     // array elements that need to be multiplied by the zetas go into one
 5142     // set of vector registers while the corresponding ones that don't need to
 5143     // be multiplied go into another set.
 5144     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5145     // registers interleaving the steps of 4 identical computations,
 5146     // each done on 8 16-bit values per register.
 5147 
 5148     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5149     // to the zetas occur in discrete blocks whose size is some multiple
 5150     // of 32.
 5151 
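          // Every level below performs the same butterfly on a different
          // block layout: load the upper half of each coefficient pair,
          // Montgomery-multiply it by the matching zetas, then store back
          // the sum and the difference with the lower half in place of the
          // original pair.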
 5152     // level 0
 5153     __ add(tmpAddr, coeffs, 256);
 5154     load64shorts(vs1, tmpAddr);
 5155     load64shorts(vs2, zetas);
 5156     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5157     __ add(tmpAddr, coeffs, 0);
 5158     load64shorts(vs1, tmpAddr);
 5159     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5160     vs_addv(vs1, __ T8H, vs1, vs2);
 5161     __ add(tmpAddr, coeffs, 0);
 5162     vs_stpq_post(vs1, tmpAddr);
 5163     __ add(tmpAddr, coeffs, 256);
 5164     vs_stpq_post(vs3, tmpAddr);
 5165     // restore montmul constants
 5166     vs_ldpq(vq, kyberConsts);
 5167     load64shorts(vs1, tmpAddr);
 5168     load64shorts(vs2, zetas);
 5169     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5170     __ add(tmpAddr, coeffs, 128);
 5171     load64shorts(vs1, tmpAddr);
 5172     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5173     vs_addv(vs1, __ T8H, vs1, vs2);
 5174     __ add(tmpAddr, coeffs, 128);
 5175     store64shorts(vs1, tmpAddr);
 5176     __ add(tmpAddr, coeffs, 384);
 5177     store64shorts(vs3, tmpAddr);
 5178 
 5179     // level 1
 5180     // restore montmul constants
 5181     vs_ldpq(vq, kyberConsts);
 5182     __ add(tmpAddr, coeffs, 128);
 5183     load64shorts(vs1, tmpAddr);
 5184     load64shorts(vs2, zetas);
 5185     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5186     __ add(tmpAddr, coeffs, 0);
 5187     load64shorts(vs1, tmpAddr);
 5188     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5189     vs_addv(vs1, __ T8H, vs1, vs2);
 5190     __ add(tmpAddr, coeffs, 0);
 5191     store64shorts(vs1, tmpAddr);
 5192     store64shorts(vs3, tmpAddr);
 5193     vs_ldpq(vq, kyberConsts);
 5194     __ add(tmpAddr, coeffs, 384);
 5195     load64shorts(vs1, tmpAddr);
 5196     load64shorts(vs2, zetas);
 5197     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5198     __ add(tmpAddr, coeffs, 256);
 5199     load64shorts(vs1, tmpAddr);
 5200     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5201     vs_addv(vs1, __ T8H, vs1, vs2);
 5202     __ add(tmpAddr, coeffs, 256);
 5203     store64shorts(vs1, tmpAddr);
 5204     store64shorts(vs3, tmpAddr);
 5205 
 5206     // level 2
 5207     vs_ldpq(vq, kyberConsts);
 5208     int offsets1[4] = { 0, 32, 128, 160 };
 5209     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5210     load64shorts(vs2, zetas);
 5211     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5212     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5213     // kyber_subv_addv64();
 5214     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5215     vs_addv(vs1, __ T8H, vs1, vs2);
 5216     __ add(tmpAddr, coeffs, 0);
 5217     vs_stpq_post(vs_front(vs1), tmpAddr);
 5218     vs_stpq_post(vs_front(vs3), tmpAddr);
 5219     vs_stpq_post(vs_back(vs1), tmpAddr);
 5220     vs_stpq_post(vs_back(vs3), tmpAddr);
 5221     vs_ldpq(vq, kyberConsts);
 5222     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5223     load64shorts(vs2, zetas);
 5224     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5225     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5226     // kyber_subv_addv64();
 5227     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5228     vs_addv(vs1, __ T8H, vs1, vs2);
 5229     __ add(tmpAddr, coeffs, 256);
 5230     vs_stpq_post(vs_front(vs1), tmpAddr);
 5231     vs_stpq_post(vs_front(vs3), tmpAddr);
 5232     vs_stpq_post(vs_back(vs1), tmpAddr);
 5233     vs_stpq_post(vs_back(vs3), tmpAddr);
 5234 
 5235     // level 3
 5236     vs_ldpq(vq, kyberConsts);
 5237     int offsets2[4] = { 0, 64, 128, 192 };
 5238     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5239     load64shorts(vs2, zetas);
 5240     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5241     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5242     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5243     vs_addv(vs1, __ T8H, vs1, vs2);
 5244     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5245     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5246 
 5247     vs_ldpq(vq, kyberConsts);
 5248     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5249     load64shorts(vs2, zetas);
 5250     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5251     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5252     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5253     vs_addv(vs1, __ T8H, vs1, vs2);
 5254     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5255     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5256 
 5257     // level 4
 5258     // At level 4 coefficients occur in 8 discrete blocks of size 16
 5259     // so they are loaded using an ldr at 8 distinct offsets.
 5260 
 5261     vs_ldpq(vq, kyberConsts);
 5262     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5263     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5264     load64shorts(vs2, zetas);
 5265     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5266     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5267     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5268     vs_addv(vs1, __ T8H, vs1, vs2);
 5269     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5270     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5271 
 5272     vs_ldpq(vq, kyberConsts);
 5273     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5274     load64shorts(vs2, zetas);
 5275     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5276     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5277     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5278     vs_addv(vs1, __ T8H, vs1, vs2);
 5279     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5280     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5281 
 5282     // level 5
 5283     // At level 5 related coefficients occur in discrete blocks of size 8 so
 5284     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5285 
 5286     vs_ldpq(vq, kyberConsts);
 5287     int offsets4[4] = { 0, 32, 64, 96 };
 5288     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5289     load32shorts(vs_front(vs2), zetas);
 5290     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5291     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5292     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5293     load32shorts(vs_front(vs2), zetas);
 5294     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5295     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5296     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5297     load32shorts(vs_front(vs2), zetas);
 5298     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5299     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5300 
 5301     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5302     load32shorts(vs_front(vs2), zetas);
 5303     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5304     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5305 
 5306     // level 6
 5307     // At level 6 related coefficients occur in discrete blocks of size 4 so
 5308     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5309 
 5310     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5311     load32shorts(vs_front(vs2), zetas);
 5312     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5313     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5314     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5315     // __ ldpq(v18, v19, __ post(zetas, 32));
 5316     load32shorts(vs_front(vs2), zetas);
 5317     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5318     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5319 
 5320     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5321     load32shorts(vs_front(vs2), zetas);
 5322     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5323     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5324 
 5325     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5326     load32shorts(vs_front(vs2), zetas);
 5327     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5328     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5329 
 5330     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5331     __ mov(r0, zr); // return 0
 5332     __ ret(lr);
 5333 
 5334     return start;
 5335   }
 5336 
 5337   // Kyber Inverse NTT function
 5338   // Implements
 5339   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5340   //
 5341   // coeffs (short[256]) = c_rarg0
 5342   // zetas (short[256]) = c_rarg1
 5343   address generate_kyberInverseNtt() {
 5344 
 5345     __ align(CodeEntryAlignment);
 5346     StubGenStubId stub_id = StubGenStubId::kyberInverseNtt_id;
 5347     StubCodeMark mark(this, stub_id);
 5348     address start = __ pc();
 5349     __ enter();
 5350 
 5351     const Register coeffs = c_rarg0;
 5352     const Register zetas = c_rarg1;
 5353 
 5354     const Register kyberConsts = r10;
 5355     const Register tmpAddr = r11;
 5356     const Register tmpAddr2 = c_rarg2;
 5357 
 5358     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5359     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5360     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5361 
 5362     __ lea(kyberConsts,
 5363              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5364 
 5365     // level 0
    // At level 0 related coefficients occur in discrete blocks of size 4, so
    // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5368 
 5369     vs_ldpq(vq, kyberConsts);
 5370     int offsets4[4] = { 0, 32, 64, 96 };
 5371     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5372     load32shorts(vs_front(vs2), zetas);
 5373     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5374                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5375     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5376     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5377     load32shorts(vs_front(vs2), zetas);
 5378     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5379                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5380     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5381     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5382     load32shorts(vs_front(vs2), zetas);
 5383     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5384                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5385     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5386     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5387     load32shorts(vs_front(vs2), zetas);
 5388     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5389                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5390     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5391 
 5392     // level 1
    // At level 1 related coefficients occur in discrete blocks of size 8, so
    // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5395 
 5396     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5397     load32shorts(vs_front(vs2), zetas);
 5398     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5399                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5400     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5401     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5402     load32shorts(vs_front(vs2), zetas);
 5403     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5404                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5405     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5406 
 5407     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5408     load32shorts(vs_front(vs2), zetas);
 5409     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5410                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5411     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5412     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5413     load32shorts(vs_front(vs2), zetas);
 5414     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5415                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5416     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5417 
 5418     // level 2
    // At level 2 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
 5421 
 5422     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5423     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5424     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5425     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5426     vs_subv(vs1, __ T8H, vs1, vs2);
 5427     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5428     load64shorts(vs2, zetas);
 5429     vs_ldpq(vq, kyberConsts);
 5430     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5431     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5432 
 5433     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5434     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5435     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5436     vs_subv(vs1, __ T8H, vs1, vs2);
 5437     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5438     load64shorts(vs2, zetas);
 5439     vs_ldpq(vq, kyberConsts);
 5440     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5441     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5442 
 5443     // Barrett reduction at indexes where overflow may happen
 5444 
 5445     // load q and the multiplier for the Barrett reduction
 5446     __ add(tmpAddr, kyberConsts, 16);
 5447     vs_ldpq(vq, tmpAddr);
 5448 
 5449     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5450     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5451     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
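
    // Illustrative scalar sketch (names are ours, not the generator's) of the
    // per-lane Barrett step performed by each sqdmulh/sshr/mlsv triple below,
    // with q replicated in vq1 and the Barrett multiplier in vq2:
    //
    //   int t = (c * barrettMultiplier) >> 26; // sqdmulh gives (2*c*m) >> 16, sshr adds 11
    //   c = (short)(c - t * q);                // mlsv subtracts t * q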
 5452     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5453     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5454     vs_sshr(vs2, __ T8H, vs2, 11);
 5455     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5456     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5457     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5458     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5459     vs_sshr(vs2, __ T8H, vs2, 11);
 5460     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5461     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5462 
 5463     // level 3
    // From level 3 upwards coefficients occur in discrete blocks whose size is
    // some multiple of 32, so they can be loaded using ldpq and suitable indexes.
 5466 
 5467     int offsets2[4] = { 0, 64, 128, 192 };
 5468     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5469     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5470     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5471     vs_subv(vs1, __ T8H, vs1, vs2);
 5472     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5473     load64shorts(vs2, zetas);
 5474     vs_ldpq(vq, kyberConsts);
 5475     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5476     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5477 
 5478     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5479     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5480     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5481     vs_subv(vs1, __ T8H, vs1, vs2);
 5482     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5483     load64shorts(vs2, zetas);
 5484     vs_ldpq(vq, kyberConsts);
 5485     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5486     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5487 
 5488     // level 4
 5489 
 5490     int offsets1[4] = { 0, 32, 128, 160 };
 5491     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5492     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5493     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5494     vs_subv(vs1, __ T8H, vs1, vs2);
 5495     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5496     load64shorts(vs2, zetas);
 5497     vs_ldpq(vq, kyberConsts);
 5498     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5499     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5500 
 5501     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5502     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5503     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5504     vs_subv(vs1, __ T8H, vs1, vs2);
 5505     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5506     load64shorts(vs2, zetas);
 5507     vs_ldpq(vq, kyberConsts);
 5508     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5509     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5510 
 5511     // level 5
 5512 
 5513     __ add(tmpAddr, coeffs, 0);
 5514     load64shorts(vs1, tmpAddr);
 5515     __ add(tmpAddr, coeffs, 128);
 5516     load64shorts(vs2, tmpAddr);
 5517     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5518     vs_subv(vs1, __ T8H, vs1, vs2);
 5519     __ add(tmpAddr, coeffs, 0);
 5520     store64shorts(vs3, tmpAddr);
 5521     load64shorts(vs2, zetas);
 5522     vs_ldpq(vq, kyberConsts);
 5523     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5524     __ add(tmpAddr, coeffs, 128);
 5525     store64shorts(vs2, tmpAddr);
 5526 
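    // tmpAddr now contains coeffs + 256 because store64shorts post-increments it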
 5527     load64shorts(vs1, tmpAddr);
 5528     __ add(tmpAddr, coeffs, 384);
 5529     load64shorts(vs2, tmpAddr);
 5530     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5531     vs_subv(vs1, __ T8H, vs1, vs2);
 5532     __ add(tmpAddr, coeffs, 256);
 5533     store64shorts(vs3, tmpAddr);
 5534     load64shorts(vs2, zetas);
 5535     vs_ldpq(vq, kyberConsts);
 5536     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5537     __ add(tmpAddr, coeffs, 384);
 5538     store64shorts(vs2, tmpAddr);
 5539 
 5540     // Barrett reduction at indexes where overflow may happen
 5541 
 5542     // load q and the multiplier for the Barrett reduction
 5543     __ add(tmpAddr, kyberConsts, 16);
 5544     vs_ldpq(vq, tmpAddr);
 5545 
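    // Same per-lane reduction sketched above (t = (c * multiplier) >> 26;
    // c -= t * q), applied here only to the blocks loaded at offsets 0 and 256.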
 5546     int offsets0[2] = { 0, 256 };
 5547     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5548     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5549     vs_sshr(vs2, __ T8H, vs2, 11);
 5550     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5551     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5552 
 5553     // level 6
 5554 
 5555     __ add(tmpAddr, coeffs, 0);
 5556     load64shorts(vs1, tmpAddr);
 5557     __ add(tmpAddr, coeffs, 256);
 5558     load64shorts(vs2, tmpAddr);
 5559     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5560     vs_subv(vs1, __ T8H, vs1, vs2);
 5561     __ add(tmpAddr, coeffs, 0);
 5562     store64shorts(vs3, tmpAddr);
 5563     load64shorts(vs2, zetas);
 5564     vs_ldpq(vq, kyberConsts);
 5565     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5566     __ add(tmpAddr, coeffs, 256);
 5567     store64shorts(vs2, tmpAddr);
 5568 
 5569     __ add(tmpAddr, coeffs, 128);
 5570     load64shorts(vs1, tmpAddr);
 5571     __ add(tmpAddr, coeffs, 384);
 5572     load64shorts(vs2, tmpAddr);
 5573     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5574     vs_subv(vs1, __ T8H, vs1, vs2);
 5575     __ add(tmpAddr, coeffs, 128);
 5576     store64shorts(vs3, tmpAddr);
 5577     load64shorts(vs2, zetas);
 5578     vs_ldpq(vq, kyberConsts);
 5579     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5580     __ add(tmpAddr, coeffs, 384);
 5581     store64shorts(vs2, tmpAddr);
 5582 
 5583     // multiply by 2^-n
 5584 
 5585     // load toMont(2^-n mod q)
 5586     __ add(tmpAddr, kyberConsts, 48);
 5587     __ ldr(v29, __ Q, tmpAddr);
 5588 
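    // Per coefficient this computes montMul(c, toMont(2^-n mod q)); assuming
    // montMul(a, b) = a * b * R^-1 mod q with R the Montgomery radix, the net
    // effect is c * 2^-n mod q.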
 5589     vs_ldpq(vq, kyberConsts);
 5590     __ add(tmpAddr, coeffs, 0);
 5591     load64shorts(vs1, tmpAddr);
 5592     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5593     __ add(tmpAddr, coeffs, 0);
 5594     store64shorts(vs2, tmpAddr);
 5595 
    // tmpAddr now contains coeffs + 128 because store64shorts post-increments it
 5597     load64shorts(vs1, tmpAddr);
 5598     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5599     __ add(tmpAddr, coeffs, 128);
 5600     store64shorts(vs2, tmpAddr);
 5601 
 5602     // now tmpAddr contains coeffs + 256
 5603     load64shorts(vs1, tmpAddr);
 5604     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5605     __ add(tmpAddr, coeffs, 256);
 5606     store64shorts(vs2, tmpAddr);
 5607 
 5608     // now tmpAddr contains coeffs + 384
 5609     load64shorts(vs1, tmpAddr);
 5610     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5611     __ add(tmpAddr, coeffs, 384);
 5612     store64shorts(vs2, tmpAddr);
 5613 
 5614     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5615     __ mov(r0, zr); // return 0
 5616     __ ret(lr);
 5617 
 5618     return start;
 5619   }
 5620 
 5621   // Kyber multiply polynomials in the NTT domain.
 5622   // Implements
 5623   // static int implKyberNttMult(
 5624   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5625   //
 5626   // result (short[256]) = c_rarg0
 5627   // ntta (short[256]) = c_rarg1
 5628   // nttb (short[256]) = c_rarg2
 5629   // zetas (short[128]) = c_rarg3
 5630   address generate_kyberNttMult() {
 5631 
 5632     __ align(CodeEntryAlignment);
 5633     StubGenStubId stub_id = StubGenStubId::kyberNttMult_id;
 5634     StubCodeMark mark(this, stub_id);
 5635     address start = __ pc();
 5636     __ enter();
 5637 
 5638     const Register result = c_rarg0;
 5639     const Register ntta = c_rarg1;
 5640     const Register nttb = c_rarg2;
 5641     const Register zetas = c_rarg3;
 5642 
 5643     const Register kyberConsts = r10;
 5644     const Register limit = r11;
 5645 
 5646     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 5647     VSeq<4> vs3(16), vs4(20);
 5648     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5649     VSeq<2> vz(28);          // pair of zetas
 5650     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5651 
 5652     __ lea(kyberConsts,
 5653              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5654 
 5655     Label kyberNttMult_loop;
 5656 
 5657     __ add(limit, result, 512);
 5658 
 5659     // load q and qinv
 5660     vs_ldpq(vq, kyberConsts);
 5661 
 5662     // load R^2 mod q (to convert back from Montgomery representation)
 5663     __ add(kyberConsts, kyberConsts, 64);
 5664     __ ldr(v27, __ Q, kyberConsts);
 5665 
 5666     __ BIND(kyberNttMult_loop);
 5667 
 5668     // load 16 zetas
 5669     vs_ldpq_post(vz, zetas);
 5670 
 5671     // load 2 sets of 32 coefficients from the two input arrays
    // interleaved as shorts, i.e. pairs of shorts adjacent in memory
 5673     // are striped across pairs of vector registers
 5674     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5675     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5676     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5677     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 5678 
 5679     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5680     // i.e. montmul the first and second halves of vs1 in order and
 5681     // then with one sequence reversed storing the two results in vs3
 5682     //
 5683     // vs3[0] <- montmul(a0, b0)
 5684     // vs3[1] <- montmul(a1, b1)
 5685     // vs3[2] <- montmul(a0, b1)
 5686     // vs3[3] <- montmul(a1, b0)
 5687     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5688     kyber_montmul16(vs_back(vs3),
 5689                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5690 
 5691     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5692     // i.e. montmul the first and second halves of vs4 in order and
 5693     // then with one sequence reversed storing the two results in vs1
 5694     //
 5695     // vs1[0] <- montmul(a2, b2)
 5696     // vs1[1] <- montmul(a3, b3)
 5697     // vs1[2] <- montmul(a2, b3)
 5698     // vs1[3] <- montmul(a3, b2)
 5699     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5700     kyber_montmul16(vs_back(vs1),
 5701                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5702 
 5703     // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 5704     // We can schedule two montmuls at a time if we use a suitable vector
 5705     // sequence <vs3[1], vs1[1]>.
 5706     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5707     VSeq<2> vs5(vs3[1], delta);
 5708 
 5709     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5710     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5711     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5712 
 5713     // add results in pairs storing in vs3
 5714     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5715     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5716     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5717 
 5718     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5719     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5720     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 5721 
 5722     // vs1 <- montmul(vs3, montRSquareModQ)
 5723     kyber_montmul32(vs1, vs3, vc, vs2, vq);
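
    // Ignoring the Montgomery scaling factors (handled via the zetas and
    // montRSquareModQ), each output pair computes (cf. the assignments above)
    //
    //   r0 = a0*b0 + a1*b1*z
    //   r1 = a0*b1 + a1*b0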
 5724 
 5725     // store back the two pairs of result vectors de-interleaved as 8H elements
    // i.e. storing each pair of shorts striped across a register pair adjacent
 5727     // in memory
 5728     vs_st2_post(vs1, __ T8H, result);
 5729 
 5730     __ cmp(result, limit);
 5731     __ br(Assembler::NE, kyberNttMult_loop);
 5732 
 5733     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5734     __ mov(r0, zr); // return 0
 5735     __ ret(lr);
 5736 
 5737     return start;
 5738   }
 5739 
 5740   // Kyber add 2 polynomials.
 5741   // Implements
 5742   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5743   //
 5744   // result (short[256]) = c_rarg0
 5745   // a (short[256]) = c_rarg1
 5746   // b (short[256]) = c_rarg2
 5747   address generate_kyberAddPoly_2() {
 5748 
 5749     __ align(CodeEntryAlignment);
 5750     StubGenStubId stub_id = StubGenStubId::kyberAddPoly_2_id;
 5751     StubCodeMark mark(this, stub_id);
 5752     address start = __ pc();
 5753     __ enter();
 5754 
 5755     const Register result = c_rarg0;
 5756     const Register a = c_rarg1;
 5757     const Register b = c_rarg2;
 5758 
 5759     const Register kyberConsts = r11;
 5760 
 5761     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5762     // So, we can load, add and store the data in 3 groups of 11,
 5763     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5764     // registers. A further constraint is that the mapping needs
 5765     // to skip callee saves. So, we allocate the register
 5766     // sequences using two 8 sequences, two 2 sequences and two
 5767     // single registers.
 5768     VSeq<8> vs1_1(0);
 5769     VSeq<2> vs1_2(16);
 5770     FloatRegister vs1_3 = v28;
 5771     VSeq<8> vs2_1(18);
 5772     VSeq<2> vs2_2(26);
 5773     FloatRegister vs2_3 = v29;
 5774 
 5775     // two constant vector sequences
 5776     VSeq<8> vc_1(31, 0);
 5777     VSeq<2> vc_2(31, 0);
 5778 
 5779     FloatRegister vc_3 = v31;
 5780     __ lea(kyberConsts,
 5781              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5782 
 5783     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
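
    // Illustrative scalar sketch of the loop below (k is the 16-bit
    // constant replicated in v31 by the ldr above):
    //
    //   for (int i = 0; i < 256; i++) {
    //     result[i] = (short)(a[i] + b[i] + k);
    //   }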
 5784     for (int i = 0; i < 3; i++) {
 5785       // load 80 or 88 values from a into vs1_1/2/3
 5786       vs_ldpq_post(vs1_1, a);
 5787       vs_ldpq_post(vs1_2, a);
 5788       if (i < 2) {
 5789         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5790       }
 5791       // load 80 or 88 values from b into vs2_1/2/3
 5792       vs_ldpq_post(vs2_1, b);
 5793       vs_ldpq_post(vs2_2, b);
 5794       if (i < 2) {
 5795         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5796       }
 5797       // sum 80 or 88 values across vs1 and vs2 into vs1
 5798       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5799       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5800       if (i < 2) {
 5801         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5802       }
 5803       // add constant to all 80 or 88 results
 5804       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5805       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5806       if (i < 2) {
 5807         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5808       }
 5809       // store 80 or 88 values
 5810       vs_stpq_post(vs1_1, result);
 5811       vs_stpq_post(vs1_2, result);
 5812       if (i < 2) {
 5813         __ str(vs1_3, __ Q, __ post(result, 16));
 5814       }
 5815     }
 5816 
 5817     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5818     __ mov(r0, zr); // return 0
 5819     __ ret(lr);
 5820 
 5821     return start;
 5822   }
 5823 
 5824   // Kyber add 3 polynomials.
 5825   // Implements
 5826   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 5827   //
 5828   // result (short[256]) = c_rarg0
 5829   // a (short[256]) = c_rarg1
 5830   // b (short[256]) = c_rarg2
 5831   // c (short[256]) = c_rarg3
 5832   address generate_kyberAddPoly_3() {
 5833 
 5834     __ align(CodeEntryAlignment);
 5835     StubGenStubId stub_id = StubGenStubId::kyberAddPoly_3_id;
 5836     StubCodeMark mark(this, stub_id);
 5837     address start = __ pc();
 5838     __ enter();
 5839 
 5840     const Register result = c_rarg0;
 5841     const Register a = c_rarg1;
 5842     const Register b = c_rarg2;
 5843     const Register c = c_rarg3;
 5844 
 5845     const Register kyberConsts = r11;
 5846 
 5847     // As above we sum 256 sets of values in total i.e. 32 x 8H
 5848     // quadwords.  So, we can load, add and store the data in 3
 5849     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 5850     // of 10 or 11 registers. A further constraint is that the
 5851     // mapping needs to skip callee saves. So, we allocate the
 5852     // register sequences using two 8 sequences, two 2 sequences
 5853     // and two single registers.
 5854     VSeq<8> vs1_1(0);
 5855     VSeq<2> vs1_2(16);
 5856     FloatRegister vs1_3 = v28;
 5857     VSeq<8> vs2_1(18);
 5858     VSeq<2> vs2_2(26);
 5859     FloatRegister vs2_3 = v29;
 5860 
 5861     // two constant vector sequences
 5862     VSeq<8> vc_1(31, 0);
 5863     VSeq<2> vc_2(31, 0);
 5864 
 5865     FloatRegister vc_3 = v31;
 5866 
 5867     __ lea(kyberConsts,
 5868              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5869 
 5870     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
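
    // Illustrative scalar sketch of the loop below (k is the 16-bit
    // constant replicated in v31 by the ldr above):
    //
    //   for (int i = 0; i < 256; i++) {
    //     result[i] = (short)(a[i] + b[i] + c[i] + k);
    //   }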
 5871     for (int i = 0; i < 3; i++) {
 5872       // load 80 or 88 values from a into vs1_1/2/3
 5873       vs_ldpq_post(vs1_1, a);
 5874       vs_ldpq_post(vs1_2, a);
 5875       if (i < 2) {
 5876         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5877       }
 5878       // load 80 or 88 values from b into vs2_1/2/3
 5879       vs_ldpq_post(vs2_1, b);
 5880       vs_ldpq_post(vs2_2, b);
 5881       if (i < 2) {
 5882         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5883       }
 5884       // sum 80 or 88 values across vs1 and vs2 into vs1
 5885       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5886       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5887       if (i < 2) {
 5888         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5889       }
 5890       // load 80 or 88 values from c into vs2_1/2/3
 5891       vs_ldpq_post(vs2_1, c);
 5892       vs_ldpq_post(vs2_2, c);
 5893       if (i < 2) {
 5894         __ ldr(vs2_3, __ Q, __ post(c, 16));
 5895       }
 5896       // sum 80 or 88 values across vs1 and vs2 into vs1
 5897       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5898       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5899       if (i < 2) {
 5900         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5901       }
 5902       // add constant to all 80 or 88 results
 5903       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5904       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5905       if (i < 2) {
 5906         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5907       }
 5908       // store 80 or 88 values
 5909       vs_stpq_post(vs1_1, result);
 5910       vs_stpq_post(vs1_2, result);
 5911       if (i < 2) {
 5912         __ str(vs1_3, __ Q, __ post(result, 16));
 5913       }
 5914     }
 5915 
 5916     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5917     __ mov(r0, zr); // return 0
 5918     __ ret(lr);
 5919 
 5920     return start;
 5921   }
 5922 
 5923   // Kyber parse XOF output to polynomial coefficient candidates
 5924   // or decodePoly(12, ...).
 5925   // Implements
 5926   // static int implKyber12To16(
 5927   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 5928   //
 5929   // (parsedLength or (parsedLength - 48) must be divisible by 64.)
 5930   //
 5931   // condensed (byte[]) = c_rarg0
 5932   // condensedIndex = c_rarg1
 5933   // parsed (short[112 or 256]) = c_rarg2
 5934   // parsedLength (112 or 256) = c_rarg3
 5935   address generate_kyber12To16() {
 5936     Label L_F00, L_loop, L_end;
 5937 
 5938     __ BIND(L_F00);
 5939     __ emit_int64(0x0f000f000f000f00);
 5940     __ emit_int64(0x0f000f000f000f00);
 5941 
 5942     __ align(CodeEntryAlignment);
 5943     StubGenStubId stub_id = StubGenStubId::kyber12To16_id;
 5944     StubCodeMark mark(this, stub_id);
 5945     address start = __ pc();
 5946     __ enter();
 5947 
 5948     const Register condensed = c_rarg0;
 5949     const Register condensedOffs = c_rarg1;
 5950     const Register parsed = c_rarg2;
 5951     const Register parsedLength = c_rarg3;
 5952 
 5953     const Register tmpAddr = r11;
 5954 
    // Data is read 96 bytes at a time, i.e. in groups of 6 x 16B
    // quadwords, so we need a 6 vector sequence for the inputs.
    // Parsing produces 64 shorts, employing two 8 vector
    // sequences to store and combine the intermediate data.
 5959     VSeq<6> vin(24);
 5960     VSeq<8> va(0), vb(16);
 5961 
 5962     __ adr(tmpAddr, L_F00);
 5963     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 5964     __ add(condensed, condensed, condensedOffs);
 5965 
 5966     __ BIND(L_loop);
 5967     // load 96 (6 x 16B) byte values
 5968     vs_ld3_post(vin, __ T16B, condensed);
 5969 
 5970     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 5971     // holds 48 (16x3) contiguous bytes from memory striped
 5972     // horizontally across each of the 16 byte lanes. Equivalently,
 5973     // that is 16 pairs of 12-bit integers. Likewise the back half
 5974     // holds the next 48 bytes in the same arrangement.
 5975 
 5976     // Each vector in the front half can also be viewed as a vertical
 5977     // strip across the 16 pairs of 12 bit integers. Each byte in
 5978     // vin[0] stores the low 8 bits of the first int in a pair. Each
 5979     // byte in vin[1] stores the high 4 bits of the first int and the
 5980     // low 4 bits of the second int. Each byte in vin[2] stores the
 5981     // high 8 bits of the second int. Likewise the vectors in second
 5982     // half.
 5983 
 5984     // Converting the data to 16-bit shorts requires first of all
 5985     // expanding each of the 6 x 16B vectors into 6 corresponding
 5986     // pairs of 8H vectors. Mask, shift and add operations on the
 5987     // resulting vector pairs can be used to combine 4 and 8 bit
 5988     // parts of related 8H vector elements.
 5989     //
    // The middle vectors (vin[1] and vin[4]) are actually expanded
    // twice, one copy manipulated to provide the upper 4 bits
    // belonging to the first short in a pair and another copy
    // manipulated to provide the lower 4 bits belonging to the
    // second short in a pair. This is why the vector sequences va
    // and vb used to hold the expanded 8H elements are of length 8.
 5996 
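    // Illustrative sketch of the unpacking for one byte triple
    // (b0, b1, b2) holding a pair of 12-bit values s0 (first) and
    // s1 (second):
    //
    //   s0 = b0 | ((b1 & 0x0f) << 8);
    //   s1 = (b1 >> 4) | (b2 << 4);
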
 5997     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 5998     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 5999     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6000     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6001     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6002     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6003     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6004     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6005 
 6006     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6007     // and vb[4:5]
 6008     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6009     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6010     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6011     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6012     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6013     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6014 
 6015     // shift lo byte of copy 1 of the middle stripe into the high byte
 6016     __ shl(va[2], __ T8H, va[2], 8);
 6017     __ shl(va[3], __ T8H, va[3], 8);
 6018     __ shl(vb[2], __ T8H, vb[2], 8);
 6019     __ shl(vb[3], __ T8H, vb[3], 8);
 6020 
 6021     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6022     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6023     // are in bit positions [4..11].
 6024     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6025     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6026     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6027     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6028 
 6029     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6030     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6031     // copy2
 6032     __ andr(va[2], __ T16B, va[2], v31);
 6033     __ andr(va[3], __ T16B, va[3], v31);
 6034     __ ushr(va[4], __ T8H, va[4], 4);
 6035     __ ushr(va[5], __ T8H, va[5], 4);
 6036     __ andr(vb[2], __ T16B, vb[2], v31);
 6037     __ andr(vb[3], __ T16B, vb[3], v31);
 6038     __ ushr(vb[4], __ T8H, vb[4], 4);
 6039     __ ushr(vb[5], __ T8H, vb[5], 4);
 6040 
 6041     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6042     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6043     // n.b. the ordering ensures: i) inputs are consumed before they
 6044     // are overwritten ii) the order of 16-bit results across successive
 6045     // pairs of vectors in va and then vb reflects the order of the
 6046     // corresponding 12-bit inputs
 6047     __ addv(va[0], __ T8H, va[0], va[2]);
 6048     __ addv(va[2], __ T8H, va[1], va[3]);
 6049     __ addv(va[1], __ T8H, va[4], va[6]);
 6050     __ addv(va[3], __ T8H, va[5], va[7]);
 6051     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6052     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6053     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6054     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6055 
 6056     // store 64 results interleaved as shorts
 6057     vs_st2_post(vs_front(va), __ T8H, parsed);
 6058     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6059 
 6060     __ sub(parsedLength, parsedLength, 64);
 6061     __ cmp(parsedLength, (u1)64);
 6062     __ br(Assembler::GE, L_loop);
 6063     __ cbz(parsedLength, L_end);
 6064 
    // If anything is left it should be a final 72 bytes of input,
    // i.e. a final 48 12-bit values. We handle this by loading
    // 48 bytes into all 16B lanes of front(vin) and only 24
    // bytes into the lower 8B halves of back(vin).
 6069     vs_ld3_post(vs_front(vin), __ T16B, condensed);
 6070     vs_ld3(vs_back(vin), __ T8B, condensed);
 6071 
 6072     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6073     // n.b. target elements 2 and 3 of va duplicate elements 4 and
 6074     // 5 and target element 2 of vb duplicates element 4.
 6075     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6076     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6077     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6078     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6079     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6080     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6081 
 6082     // This time expand just the lower 8 lanes
 6083     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6084     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6085     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6086 
 6087     // shift lo byte of copy 1 of the middle stripe into the high byte
 6088     __ shl(va[2], __ T8H, va[2], 8);
 6089     __ shl(va[3], __ T8H, va[3], 8);
 6090     __ shl(vb[2], __ T8H, vb[2], 8);
 6091 
 6092     // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
 6093     // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
 6094     // int are in bit positions [4..11].
 6095     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6096     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6097     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6098 
 6099     // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
 6100     // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
 6101     // copy2
 6102     __ andr(va[2], __ T16B, va[2], v31);
 6103     __ andr(va[3], __ T16B, va[3], v31);
 6104     __ ushr(va[4], __ T8H, va[4], 4);
 6105     __ ushr(va[5], __ T8H, va[5], 4);
 6106     __ andr(vb[2], __ T16B, vb[2], v31);
    __ ushr(vb[4], __ T8H, vb[4], 4);

    // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
    // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
    // n.b. ordering ensures: i) inputs are consumed before they are
    // overwritten ii) order of 16-bit results across successive
    // pairs of vectors in va and then lower half of vb reflects order
    // of corresponding 12-bit inputs
 6118     __ addv(va[0], __ T8H, va[0], va[2]);
 6119     __ addv(va[2], __ T8H, va[1], va[3]);
 6120     __ addv(va[1], __ T8H, va[4], va[6]);
 6121     __ addv(va[3], __ T8H, va[5], va[7]);
 6122     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6123     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6124 
 6125     // store 48 results interleaved as shorts
 6126     vs_st2_post(vs_front(va), __ T8H, parsed);
 6127     vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
 6128 
 6129     __ BIND(L_end);
 6130 
 6131     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6132     __ mov(r0, zr); // return 0
 6133     __ ret(lr);
 6134 
 6135     return start;
 6136   }
 6137 
 6138   // Kyber Barrett reduce function.
 6139   // Implements
 6140   // static int implKyberBarrettReduce(short[] coeffs) {}
 6141   //
 6142   // coeffs (short[256]) = c_rarg0
 6143   address generate_kyberBarrettReduce() {
 6144 
 6145     __ align(CodeEntryAlignment);
 6146     StubGenStubId stub_id = StubGenStubId::kyberBarrettReduce_id;
 6147     StubCodeMark mark(this, stub_id);
 6148     address start = __ pc();
 6149     __ enter();
 6150 
 6151     const Register coeffs = c_rarg0;
 6152 
 6153     const Register kyberConsts = r10;
 6154     const Register result = r11;
 6155 
 6156     // As above we process 256 sets of values in total i.e. 32 x
 6157     // 8H quadwords. So, we can load, add and store the data in 3
 6158     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6159     // of 10 or 11 registers. A further constraint is that the
 6160     // mapping needs to skip callee saves. So, we allocate the
 6161     // register sequences using two 8 sequences, two 2 sequences
 6162     // and two single registers.
 6163     VSeq<8> vs1_1(0);
 6164     VSeq<2> vs1_2(16);
 6165     FloatRegister vs1_3 = v28;
 6166     VSeq<8> vs2_1(18);
 6167     VSeq<2> vs2_2(26);
 6168     FloatRegister vs2_3 = v29;
 6169 
 6170     // we also need a pair of corresponding constant sequences
 6171 
 6172     VSeq<8> vc1_1(30, 0);
 6173     VSeq<2> vc1_2(30, 0);
 6174     FloatRegister vc1_3 = v30; // for kyber_q
 6175 
 6176     VSeq<8> vc2_1(31, 0);
 6177     VSeq<2> vc2_2(31, 0);
 6178     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6179 
 6180     __ add(result, coeffs, 0);
 6181     __ lea(kyberConsts,
 6182              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6183 
 6184     // load q and the multiplier for the Barrett reduction
 6185     __ add(kyberConsts, kyberConsts, 16);
 6186     __ ldpq(vc1_3, vc2_3, kyberConsts);
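
    // Per-coefficient scalar sketch of the reduction in the loop below
    // (illustrative only; see the per-step comments for the exact vector ops):
    //
    //   int t = (c * kyberBarrettMultiplier) >> 26;
    //   c = (short)(c - t * kyber_q);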
 6187 
 6188     for (int i = 0; i < 3; i++) {
 6189       // load 80 or 88 coefficients
 6190       vs_ldpq_post(vs1_1, coeffs);
 6191       vs_ldpq_post(vs1_2, coeffs);
 6192       if (i < 2) {
 6193         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6194       }
 6195 
 6196       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6197       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6198       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6199       if (i < 2) {
 6200         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6201       }
 6202 
 6203       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6204       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6205       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6206       if (i < 2) {
 6207         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6208       }
 6209 
 6210       // vs1 <- vs1 - vs2 * kyber_q
 6211       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6212       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6213       if (i < 2) {
 6214         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6215       }
 6216 
 6217       vs_stpq_post(vs1_1, result);
 6218       vs_stpq_post(vs1_2, result);
 6219       if (i < 2) {
 6220         __ str(vs1_3, __ Q, __ post(result, 16));
 6221       }
 6222     }
 6223 
 6224     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6225     __ mov(r0, zr); // return 0
 6226     __ ret(lr);
 6227 
 6228     return start;
 6229   }
 6230 
 6231 
 6232   // Dilithium-specific montmul helper routines that generate parallel
 6233   // code for, respectively, a single 4x4s vector sequence montmul or
 6234   // two such multiplies in a row.
 6235 
 6236   // Perform 16 32-bit Montgomery multiplications in parallel
 6237   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6238                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6239     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6240     // It will assert that the register use is valid
 6241     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6242   }
 6243 
 6244   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6245   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6246                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6247     // Schedule two successive 4x4S multiplies via the montmul helper
 6248     // on the front and back halves of va, vb and vc. The helper will
 6249     // assert that the register use has no overlap conflicts on each
 6250     // individual call but we also need to ensure that the necessary
 6251     // disjoint/equality constraints are met across both calls.
 6252 
 6253     // vb, vc, vtmp and vq must be disjoint. va must either be
 6254     // disjoint from all other registers or equal vc
 6255 
 6256     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6257     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6258     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6259 
 6260     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6261     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6262 
 6263     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6264 
 6265     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6266     assert(vs_disjoint(va, vb), "va and vb overlap");
 6267     assert(vs_disjoint(va, vq), "va and vq overlap");
 6268     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6269 
 6270     // We multiply the front and back halves of each sequence 4 at a
 6271     // time because
 6272     //
 6273     // 1) we are currently only able to get 4-way instruction
 6274     // parallelism at best
 6275     //
 6276     // 2) we need registers for the constants in vq and temporary
 6277     // scratch registers to hold intermediate results so vtmp can only
 6278     // be a VSeq<4> which means we only have 4 scratch slots.
 6279 
 6280     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6281     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6282   }
 6283 
 6284   // Perform combined montmul then add/sub on 4x4S vectors.
 6285   void dilithium_montmul16_sub_add(
 6286           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6287           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6288     // compute a = montmul(a1, c)
 6289     dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
 6291     vs_subv(va1, __ T4S, va0, vc);
 6292     //    and a0 = a0 + a
 6293     vs_addv(va0, __ T4S, va0, vc);
 6294   }
 6295 
  // Perform combined add/sub then montmul on 4x4S vectors.
 6297   void dilithium_sub_add_montmul16(
 6298           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6299           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6300     // compute c = a0 - a1
 6301     vs_subv(vtmp1, __ T4S, va0, va1);
 6302     // output a0 = a0 + a1
 6303     vs_addv(va0, __ T4S, va0, va1);
 6304     // output a1 = b montmul c
 6305     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6306   }
 6307 
 6308   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6309   // in the Java implementation come in sequences of at least 8, so we
 6310   // can use ldpq to collect the corresponding data into pairs of vector
 6311   // registers.
 6312   // We collect the coefficients corresponding to the 'j+l' indexes into
 6313   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6314   // then we do the (Montgomery) multiplications by the zetas in parallel
 6315   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6316   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6317   // v0-v7 and finally save the results back to the coeffs array.
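  //
  // Illustrative sketch of the Java-level butterfly being vectorized here
  // (names chosen for exposition rather than taken from the ML_DSA source):
  //
  //   int t = montMul(zeta, coeffs[j + l]);
  //   coeffs[j + l] = coeffs[j] - t;
  //   coeffs[j]     = coeffs[j] + t;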
 6318   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6319     const Register coeffs, const Register zetas) {
 6320     int c1 = 0;
 6321     int c2 = 512;
 6322     int startIncr;
 6323     // don't use callee save registers v8 - v15
 6324     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6325     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6326     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6327     int offsets[4] = { 0, 32, 64, 96 };
 6328 
 6329     for (int level = 0; level < 5; level++) {
 6330       int c1Start = c1;
 6331       int c2Start = c2;
 6332       if (level == 3) {
 6333         offsets[1] = 32;
 6334         offsets[2] = 128;
 6335         offsets[3] = 160;
 6336       } else if (level == 4) {
 6337         offsets[1] = 64;
 6338         offsets[2] = 128;
 6339         offsets[3] = 192;
 6340       }
 6341 
      // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
 6343       // time at 4 different offsets and multiply them in order by the
 6344       // next set of input values. So we employ indexed load and store
 6345       // pair instructions with arrangement 4S.
 6346       for (int i = 0; i < 4; i++) {
 6347         // reload q and qinv
 6348         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6349         // load 8x4S coefficients via second start pos == c2
 6350         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6351         // load next 8x4S inputs == b
 6352         vs_ldpq_post(vs2, zetas);
 6353         // compute a == c2 * b mod MONT_Q
 6354         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6355         // load 8x4s coefficients via first start pos == c1
 6356         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6357         // compute a1 =  c1 + a
 6358         vs_addv(vs3, __ T4S, vs1, vs2);
 6359         // compute a2 =  c1 - a
 6360         vs_subv(vs1, __ T4S, vs1, vs2);
 6361         // output a1 and a2
 6362         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6363         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6364 
 6365         int k = 4 * level + i;
 6366 
 6367         if (k > 7) {
 6368           startIncr = 256;
 6369         } else if (k == 5) {
 6370           startIncr = 384;
 6371         } else {
 6372           startIncr = 128;
 6373         }
 6374 
 6375         c1Start += startIncr;
 6376         c2Start += startIncr;
 6377       }
 6378 
 6379       c2 /= 2;
 6380     }
 6381   }
 6382 
 6383   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6384   // Implements the method
 6385   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
  // of the Java class sun.security.provider.ML_DSA
 6387   //
 6388   // coeffs (int[256]) = c_rarg0
 6389   // zetas (int[256]) = c_rarg1
 6390   address generate_dilithiumAlmostNtt() {
 6391 
 6392     __ align(CodeEntryAlignment);
 6393     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
 6394     StubCodeMark mark(this, stub_id);
 6395     address start = __ pc();
 6396     __ enter();
 6397 
 6398     const Register coeffs = c_rarg0;
 6399     const Register zetas = c_rarg1;
 6400 
 6401     const Register tmpAddr = r9;
 6402     const Register dilithiumConsts = r10;
 6403     const Register result = r11;
 6404     // don't use callee save registers v8 - v15
 6405     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6406     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6407     VSeq<2> vq(30);                    // n.b. constants overlap vs3
    int offsets[4] = { 0, 32, 64, 96 };
 6409     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6410     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6411     __ add(result, coeffs, 0);
 6412     __ lea(dilithiumConsts,
 6413              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6414 
 6415     // Each level represents one iteration of the outer for loop of the Java version.
 6416 
 6417     // level 0-4
 6418     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6419 
 6420     // level 5
 6421 
 6422     // At level 5 the coefficients we need to combine with the zetas
 6423     // are grouped in memory in blocks of size 4. So, for both sets of
 6424     // coefficients we load 4 adjacent values at 8 different offsets
 6425     // using an indexed ldr with register variant Q and multiply them
 6426     // in sequence order by the next set of inputs. Likewise we store
    // the results using an indexed str with register variant Q.
 6428     for (int i = 0; i < 1024; i += 256) {
 6429       // reload constants q, qinv each iteration as they get clobbered later
 6430       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6431       // load 32 (8x4S) coefficients via first offsets = c1
 6432       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6433       // load next 32 (8x4S) inputs = b
 6434       vs_ldpq_post(vs2, zetas);
      // a = b montmul c1
 6436       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6437       // load 32 (8x4S) coefficients via second offsets = c2
 6438       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6439       // add/sub with result of multiply
      vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
      vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 6442       // write back new coefficients using same offsets
 6443       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6444       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6445     }
 6446 
 6447     // level 6
 6448     // At level 6 the coefficients we need to combine with the zetas
    // are grouped in memory in pairs, the first pair of each group of
    // four being the add/sub inputs and the second pair the montmul
    // inputs. We can still implement
 6451     // the montmul+sub+add using 4-way parallelism but only if we
 6452     // combine the coefficients with the zetas 16 at a time. We load 8
 6453     // adjacent values at 4 different offsets using an ld2 load with
 6454     // arrangement 2D. That interleaves the lower and upper halves of
 6455     // each pair of quadwords into successive vector registers. We
    // then need to montmul the 4 odd elements of the coefficients
    // register sequence by the zetas in order and then add/sub the 4
    // even elements of the coefficients register sequence. We use an
 6459     // equivalent st2 operation to store the results back into memory
 6460     // de-interleaved.
 6461     for (int i = 0; i < 1024; i += 128) {
 6462       // reload constants q, qinv each iteration as they get clobbered later
 6463       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6464       // load interleaved 16 (4x2D) coefficients via offsets
 6465       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6466       // load next 16 (4x4S) inputs
 6467       vs_ldpq_post(vs_front(vs2), zetas);
 6468       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6469       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6470                                   vs_front(vs2), vtmp, vq);
 6471       // store interleaved 16 (4x2D) coefficients via offsets
 6472       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6473     }
 6474 
 6475     // level 7
 6476     // At level 7 the coefficients we need to combine with the zetas
    // occur singly with montmul inputs alternating with add/sub
 6478     // inputs. Once again we can use 4-way parallelism to combine 16
 6479     // zetas at a time. However, we have to load 8 adjacent values at
 6480     // 4 different offsets using an ld2 load with arrangement 4S. That
    // interleaves the odd words of each pair into one
 6482     // coefficients vector register and the even words of the pair
    // into the next register. We then need to montmul the 4 odd
    // elements of the coefficients register sequence by the zetas in
    // order and then add/sub the 4 even elements of the coefficients
 6486     // register sequence. We use an equivalent st2 operation to store
 6487     // the results back into memory de-interleaved.
 6488 
 6489     for (int i = 0; i < 1024; i += 128) {
 6490       // reload constants q, qinv each iteration as they get clobbered later
 6491       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6492       // load interleaved 16 (4x4S) coefficients via offsets
 6493       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6494       // load next 16 (4x4S) inputs
 6495       vs_ldpq_post(vs_front(vs2), zetas);
 6496       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6497       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6498                                   vs_front(vs2), vtmp, vq);
 6499       // store interleaved 16 (4x4S) coefficients via offsets
 6500       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6501     }
 6502     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6503     __ mov(r0, zr); // return 0
 6504     __ ret(lr);
 6505 
 6506     return start;
 6507   }
 6508 
 6509   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6510   // in the Java implementation come in sequences of at least 8, so we
 6511   // can use ldpq to collect the corresponding data into pairs of vector
  // registers.
  // We collect the coefficients that correspond to the 'j's into vs1,
  // the coefficients that correspond to the 'j+l's into vs2, then
  // do the additions into vs3 and the subtractions into vs1, then
  // save the result of the additions, load the zetas into vs2,
  // do the (Montgomery) multiplications by zeta in parallel into vs2
  // and finally save the results back to the coeffs array.
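  //
  // Illustrative sketch of the Java-level inverse butterfly being vectorized
  // (names chosen for exposition rather than taken from the ML_DSA source):
  //
  //   int t = coeffs[j];
  //   coeffs[j]     = t + coeffs[j + l];
  //   coeffs[j + l] = montMul(t - coeffs[j + l], zeta);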
 6519   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6520     const Register coeffs, const Register zetas) {
 6521     int c1 = 0;
 6522     int c2 = 32;
 6523     int startIncr;
 6524     int offsets[4];
 6525     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6526     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6527     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6528 
 6529     offsets[0] = 0;
 6530 
 6531     for (int level = 3; level < 8; level++) {
 6532       int c1Start = c1;
 6533       int c2Start = c2;
 6534       if (level == 3) {
 6535         offsets[1] = 64;
 6536         offsets[2] = 128;
 6537         offsets[3] = 192;
 6538       } else if (level == 4) {
 6539         offsets[1] = 32;
 6540         offsets[2] = 128;
 6541         offsets[3] = 160;
 6542       } else {
 6543         offsets[1] = 32;
 6544         offsets[2] = 64;
 6545         offsets[3] = 96;
 6546       }
 6547 
 6548       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6549       // time at 4 different offsets and multiply them in order by the
 6550       // next set of input values. So we employ indexed load and store
 6551       // pair instructions with arrangement 4S.
 6552       for (int i = 0; i < 4; i++) {
 6553         // load v1 32 (8x4S) coefficients relative to first start index
 6554         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6555         // load v2 32 (8x4S) coefficients relative to second start index
 6556         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
        // a0 = v1 + v2 -- n.b. clobbers vq
 6558         vs_addv(vs3, __ T4S, vs1, vs2);
 6559         // a1 = v1 - v2
 6560         vs_subv(vs1, __ T4S, vs1, vs2);
 6561         // save a1 relative to first start index
 6562         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6563         // load constants q, qinv each iteration as they get clobbered above
 6564         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6565         // load b next 32 (8x4S) inputs
 6566         vs_ldpq_post(vs2, zetas);
 6567         // a = a1 montmul b
 6568         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6569         // save a relative to second start index
 6570         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6571 
 6572         int k = 4 * level + i;
 6573 
 6574         if (k < 24) {
 6575           startIncr = 256;
 6576         } else if (k == 25) {
 6577           startIncr = 384;
 6578         } else {
 6579           startIncr = 128;
 6580         }
 6581 
 6582         c1Start += startIncr;
 6583         c2Start += startIncr;
 6584       }
 6585 
 6586       c2 *= 2;
 6587     }
 6588   }
 6589 
 6590   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6591   // Implements the method
 6592   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6593   // the sun.security.provider.ML_DSA class.
 6594   //
 6595   // coeffs (int[256]) = c_rarg0
 6596   // zetas (int[256]) = c_rarg1
 6597   address generate_dilithiumAlmostInverseNtt() {
 6598 
 6599     __ align(CodeEntryAlignment);
 6600     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id;
 6601     StubCodeMark mark(this, stub_id);
 6602     address start = __ pc();
 6603     __ enter();
 6604 
 6605     const Register coeffs = c_rarg0;
 6606     const Register zetas = c_rarg1;
 6607 
 6608     const Register tmpAddr = r9;
 6609     const Register dilithiumConsts = r10;
 6610     const Register result = r11;
 6611     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6612     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 6613     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6614     int offsets[4] = { 0, 32, 64, 96 };
 6615     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6616     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6617 
 6618     __ add(result, coeffs, 0);
 6619     __ lea(dilithiumConsts,
 6620              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6621 
 6622     // Each level represents one iteration of the outer for loop of the Java version
 6623 
 6624     // level 0
 6625     // At level 0 we need to interleave adjacent quartets of
 6626     // coefficients before we multiply and add/sub by the next 16
 6627     // zetas just as we did for level 7 in the multiply code. So we
 6628     // load and store the values using an ld2/st2 with arrangement 4S.
 6629     for (int i = 0; i < 1024; i += 128) {
 6630       // load constants q, qinv
 6631       // n.b. this can be moved out of the loop as they do not get
 6632       // clobbered by first two loops
 6633       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6634       // a0/a1 load interleaved 32 (8x4S) coefficients
 6635       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6636       // b load next 32 (8x4S) inputs
 6637       vs_ldpq_post(vs_front(vs2), zetas);
 6638       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6639       // n.b. second half of vs2 provides temporary register storage
 6640       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6641                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6642       // a0/a1 store interleaved 32 (8x4S) coefficients
 6643       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6644     }
 6645 
 6646     // level 1
 6647     // At level 1 we need to interleave pairs of adjacent pairs of
 6648     // coefficients before we multiply by the next 16 zetas just as we
 6649     // did for level 6 in the multiply code. So we load and store the
  6650     // values using an ld2/st2 with arrangement 2D.
 6651     for (int i = 0; i < 1024; i += 128) {
 6652       // a0/a1 load interleaved 32 (8x2D) coefficients
 6653       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6654       // b load next 16 (4x4S) inputs
 6655       vs_ldpq_post(vs_front(vs2), zetas);
 6656       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6657       // n.b. second half of vs2 provides temporary register storage
 6658       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6659                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6660       // a0/a1 store interleaved 32 (8x2D) coefficients
 6661       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6662     }
 6663 
 6664     // level 2
 6665     // At level 2 coefficients come in blocks of 4. So, we load 4
 6666     // adjacent coefficients at 8 distinct offsets for both the first
 6667     // and second coefficient sequences, using an ldr with register
 6668     // variant Q then combine them with next set of 32 zetas. Likewise
 6669     // we store the results using an str with register variant Q.
 6670     for (int i = 0; i < 1024; i += 256) {
 6671       // c0 load 32 (8x4S) coefficients via first offsets
 6672       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6673       // c1 load 32 (8x4S) coefficients via second offsets
  6674       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6675       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6676       vs_addv(vs3, __ T4S, vs1, vs2);
 6677       // c = c0 - c1
 6678       vs_subv(vs1, __ T4S, vs1, vs2);
 6679       // store a0 32 (8x4S) coefficients via first offsets
 6680       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6681       // b load 32 (8x4S) next inputs
 6682       vs_ldpq_post(vs2, zetas);
 6683       // reload constants q, qinv -- they were clobbered earlier
 6684       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6685       // compute a1 = b montmul c
 6686       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6687       // store a1 32 (8x4S) coefficients via second offsets
 6688       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6689     }
 6690 
 6691     // level 3-7
 6692     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6693 
 6694     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6695     __ mov(r0, zr); // return 0
 6696     __ ret(lr);
 6697 
 6698     return start;
 6699   }
 6700 
 6701   // Dilithium multiply polynomials in the NTT domain.
 6702   // Straightforward implementation of the method
 6703   // static int implDilithiumNttMult(
  6704   //              int[] result, int[] ntta, int[] nttb) {} of
 6705   // the sun.security.provider.ML_DSA class.
 6706   //
 6707   // result (int[256]) = c_rarg0
 6708   // poly1 (int[256]) = c_rarg1
 6709   // poly2 (int[256]) = c_rarg2
 6710   address generate_dilithiumNttMult() {
 6711 
  6712     __ align(CodeEntryAlignment);
 6713     StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id;
 6714     StubCodeMark mark(this, stub_id);
 6715     address start = __ pc();
 6716     __ enter();
 6717 
 6718     Label L_loop;
 6719 
 6720     const Register result = c_rarg0;
 6721     const Register poly1 = c_rarg1;
 6722     const Register poly2 = c_rarg2;
 6723 
 6724     const Register dilithiumConsts = r10;
 6725     const Register len = r11;
 6726 
 6727     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6728     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6729     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6730     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6731 
 6732     __ lea(dilithiumConsts,
 6733              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6734 
 6735     // load constants q, qinv
 6736     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6737     // load constant rSquare into v29
 6738     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
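           // n.b. dilithium_montmul32 produces Montgomery products a * b * R^-1 mod q
           // (R being the Montgomery radix), so the plain product poly1[i] * poly2[i]
           // mod q is recovered below by a second montmul with rSquare = R^2 mod q,
           // which cancels the extra R^-1 factors.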
 6739 
 6740     __ mov(len, zr);
 6741     __ add(len, len, 1024);
 6742 
 6743     __ BIND(L_loop);
 6744 
 6745     // b load 32 (8x4S) next inputs from poly1
 6746     vs_ldpq_post(vs1, poly1);
 6747     // c load 32 (8x4S) next inputs from poly2
 6748     vs_ldpq_post(vs2, poly2);
 6749     // compute a = b montmul c
 6750     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6751     // compute a = rsquare montmul a
 6752     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6753     // save a 32 (8x4S) results
 6754     vs_stpq_post(vs2, result);
 6755 
 6756     __ sub(len, len, 128);
 6757     __ cmp(len, (u1)128);
 6758     __ br(Assembler::GE, L_loop);
 6759 
 6760     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6761     __ mov(r0, zr); // return 0
 6762     __ ret(lr);
 6763 
 6764     return start;
 6765   }
 6766 
  6767   // Dilithium Montgomery multiply an array by a constant.
 6768   // A straightforward implementation of the method
 6769   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  6770   // of the sun.security.provider.ML_DSA class
 6771   //
 6772   // coeffs (int[256]) = c_rarg0
 6773   // constant (int) = c_rarg1
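         // Each coefficient is replaced by its Montgomery product with the constant,
         // i.e. coeffs[i] = coeffs[i] * constant * R^-1 mod q; any leftover R^-1
         // factor is presumably accounted for in the constant chosen by the caller.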
 6774   address generate_dilithiumMontMulByConstant() {
 6775 
 6776     __ align(CodeEntryAlignment);
 6777     StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id;
 6778     StubCodeMark mark(this, stub_id);
 6779     address start = __ pc();
 6780     __ enter();
 6781 
 6782     Label L_loop;
 6783 
 6784     const Register coeffs = c_rarg0;
 6785     const Register constant = c_rarg1;
 6786 
 6787     const Register dilithiumConsts = r10;
 6788     const Register result = r11;
 6789     const Register len = r12;
 6790 
 6791     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6792     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6793     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6794     VSeq<8> vconst(29, 0);             // for montmul by constant
 6795 
 6796     // results track inputs
 6797     __ add(result, coeffs, 0);
 6798     __ lea(dilithiumConsts,
 6799              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6800 
 6801     // load constants q, qinv -- they do not get clobbered by first two loops
 6802     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6803     // copy caller supplied constant across vconst
 6804     __ dup(vconst[0], __ T4S, constant);
 6805     __ mov(len, zr);
 6806     __ add(len, len, 1024);
 6807 
 6808     __ BIND(L_loop);
 6809 
 6810     // load next 32 inputs
 6811     vs_ldpq_post(vs2, coeffs);
 6812     // mont mul by constant
 6813     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6814     // write next 32 results
 6815     vs_stpq_post(vs2, result);
 6816 
 6817     __ sub(len, len, 128);
 6818     __ cmp(len, (u1)128);
 6819     __ br(Assembler::GE, L_loop);
 6820 
 6821     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6822     __ mov(r0, zr); // return 0
 6823     __ ret(lr);
 6824 
 6825     return start;
 6826   }
 6827 
 6828   // Dilithium decompose poly.
 6829   // Implements the method
  6830   // static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
         //              int[] highPart, int twoGamma2, int multiplier) {}
 6831   // of the sun.security.provider.ML_DSA class
 6832   //
 6833   // input (int[256]) = c_rarg0
 6834   // lowPart (int[256]) = c_rarg1
 6835   // highPart (int[256]) = c_rarg2
 6836   // twoGamma2  (int) = c_rarg3
 6837   // multiplier (int) = c_rarg4
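         // Roughly, for each (already reduced) coefficient r this performs the
         // FIPS 204 Decompose split r = r1 * twoGamma2 + r0 with -gamma2 < r0 <= gamma2,
         // except for the border case r - r0 == q - 1, where r1 is forced to 0 and r0
         // is decremented; the per-line comments below track the Java code doing this.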
 6838   address generate_dilithiumDecomposePoly() {
 6839 
 6840     __ align(CodeEntryAlignment);
 6841     StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id;
 6842     StubCodeMark mark(this, stub_id);
 6843     address start = __ pc();
 6844     Label L_loop;
 6845 
 6846     const Register input = c_rarg0;
 6847     const Register lowPart = c_rarg1;
 6848     const Register highPart = c_rarg2;
 6849     const Register twoGamma2 = c_rarg3;
 6850     const Register multiplier = c_rarg4;
 6851 
 6852     const Register len = r9;
 6853     const Register dilithiumConsts = r10;
 6854     const Register tmp = r11;
 6855 
 6856     // 6 independent sets of 4x4s values
 6857     VSeq<4> vs1(0), vs2(4), vs3(8);
 6858     VSeq<4> vs4(12), vs5(16), vtmp(20);
 6859 
 6860     // 7 constants for cross-multiplying
 6861     VSeq<4> one(25, 0);
 6862     VSeq<4> qminus1(26, 0);
 6863     VSeq<4> g2(27, 0);
 6864     VSeq<4> twog2(28, 0);
 6865     VSeq<4> mult(29, 0);
 6866     VSeq<4> q(30, 0);
 6867     VSeq<4> qadd(31, 0);
 6868 
 6869     __ enter();
 6870 
 6871     __ lea(dilithiumConsts,
 6872              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6873 
 6874     // save callee-saved registers
 6875     __ stpd(v8, v9, __ pre(sp, -64));
 6876     __ stpd(v10, v11, Address(sp, 16));
 6877     __ stpd(v12, v13, Address(sp, 32));
 6878     __ stpd(v14, v15, Address(sp, 48));
 6879 
 6880     // populate constant registers
 6881     __ mov(tmp, zr);
 6882     __ add(tmp, tmp, 1);
 6883     __ dup(one[0], __ T4S, tmp); // 1
 6884     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 6885     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 6886     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 6887     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 6888     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 6889     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 6890 
 6891     __ mov(len, zr);
 6892     __ add(len, len, 1024);
 6893 
 6894     __ BIND(L_loop);
 6895 
 6896     // load next 4x4S inputs interleaved: rplus --> vs1
 6897     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 6898 
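           // The next two blocks reduce rplus into [0, q) without a division:
           // q = 8380417 is close to 2^23, so (rplus + qadd) >> 23 is a cheap
           // estimate of rplus / q; subtracting estimate * q leaves a value just
           // either side of zero, and the second block adds q back when it went
           // negative.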
 6899     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 6900     vs_addv(vtmp, __ T4S, vs1, qadd);
 6901     vs_sshr(vtmp, __ T4S, vtmp, 23);
 6902     vs_mulv(vtmp, __ T4S, vtmp, q);
 6903     vs_subv(vs1, __ T4S, vs1, vtmp);
 6904 
 6905     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 6906     vs_sshr(vtmp, __ T4S, vs1, 31);
 6907     vs_andr(vtmp, vtmp, q);
 6908     vs_addv(vs1, __ T4S, vs1, vtmp);
 6909 
 6910     // quotient --> vs2
 6911     // int quotient = (rplus * multiplier) >> 22;
 6912     vs_mulv(vtmp, __ T4S, vs1, mult);
 6913     vs_sshr(vs2, __ T4S, vtmp, 22);
 6914 
 6915     // r0 --> vs3
 6916     // int r0 = rplus - quotient * twoGamma2;
 6917     vs_mulv(vtmp, __ T4S, vs2, twog2);
 6918     vs_subv(vs3, __ T4S, vs1, vtmp);
 6919 
 6920     // mask --> vs4
 6921     // int mask = (twoGamma2 - r0) >> 22;
 6922     vs_subv(vtmp, __ T4S, twog2, vs3);
 6923     vs_sshr(vs4, __ T4S, vtmp, 22);
 6924 
 6925     // r0 -= (mask & twoGamma2);
 6926     vs_andr(vtmp, vs4, twog2);
 6927     vs_subv(vs3, __ T4S, vs3, vtmp);
 6928 
 6929     //  quotient += (mask & 1);
 6930     vs_andr(vtmp, vs4, one);
 6931     vs_addv(vs2, __ T4S, vs2, vtmp);
 6932 
 6933     // mask = (twoGamma2 / 2 - r0) >> 31;
 6934     vs_subv(vtmp, __ T4S, g2, vs3);
 6935     vs_sshr(vs4, __ T4S, vtmp, 31);
 6936 
 6937     // r0 -= (mask & twoGamma2);
 6938     vs_andr(vtmp, vs4, twog2);
 6939     vs_subv(vs3, __ T4S, vs3, vtmp);
 6940 
 6941     // quotient += (mask & 1);
 6942     vs_andr(vtmp, vs4, one);
 6943     vs_addv(vs2, __ T4S, vs2, vtmp);
 6944 
 6945     // r1 --> vs5
 6946     // int r1 = rplus - r0 - (dilithium_q - 1);
 6947     vs_subv(vtmp, __ T4S, vs1, vs3);
 6948     vs_subv(vs5, __ T4S, vtmp, qminus1);
 6949 
 6950     // r1 --> vs1 (overwriting rplus)
 6951     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 6952     vs_negr(vtmp, __ T4S, vs5);
 6953     vs_orr(vtmp, vs5, vtmp);
 6954     vs_sshr(vs1, __ T4S, vtmp, 31);
 6955 
 6956     // r0 += ~r1;
 6957     vs_notr(vtmp, vs1);
 6958     vs_addv(vs3, __ T4S, vs3, vtmp);
 6959 
 6960     // r1 = r1 & quotient;
 6961     vs_andr(vs1, vs2, vs1);
 6962 
  6963     // store results interleaved
 6964     // lowPart[m] = r0;
 6965     // highPart[m] = r1;
 6966     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 6967     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 6968 
 6969     __ sub(len, len, 64);
 6970     __ cmp(len, (u1)64);
 6971     __ br(Assembler::GE, L_loop);
 6972 
 6973     // restore callee-saved vector registers
 6974     __ ldpd(v14, v15, Address(sp, 48));
 6975     __ ldpd(v12, v13, Address(sp, 32));
 6976     __ ldpd(v10, v11, Address(sp, 16));
 6977     __ ldpd(v8, v9, __ post(sp, 64));
 6978 
 6979     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6980     __ mov(r0, zr); // return 0
 6981     __ ret(lr);
 6982 
 6983     return start;
 6984   }
 6985 
 6986   /**
 6987    *  Arguments:
 6988    *
 6989    * Inputs:
 6990    *   c_rarg0   - int crc
 6991    *   c_rarg1   - byte* buf
 6992    *   c_rarg2   - int length
 6993    *
 6994    * Output:
  6995    *       r0   - int crc result
 6996    */
 6997   address generate_updateBytesCRC32() {
 6998     assert(UseCRC32Intrinsics, "what are we doing here?");
 6999 
 7000     __ align(CodeEntryAlignment);
 7001     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
 7002     StubCodeMark mark(this, stub_id);
 7003 
 7004     address start = __ pc();
 7005 
 7006     const Register crc   = c_rarg0;  // crc
 7007     const Register buf   = c_rarg1;  // source java byte array address
 7008     const Register len   = c_rarg2;  // length
 7009     const Register table0 = c_rarg3; // crc_table address
 7010     const Register table1 = c_rarg4;
 7011     const Register table2 = c_rarg5;
 7012     const Register table3 = c_rarg6;
 7013     const Register tmp3 = c_rarg7;
 7014 
 7015     BLOCK_COMMENT("Entry:");
 7016     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7017 
 7018     __ kernel_crc32(crc, buf, len,
 7019               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7020 
 7021     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7022     __ ret(lr);
 7023 
 7024     return start;
 7025   }
 7026 
 7027   /**
 7028    *  Arguments:
 7029    *
 7030    * Inputs:
 7031    *   c_rarg0   - int crc
 7032    *   c_rarg1   - byte* buf
 7033    *   c_rarg2   - int length
 7034    *   c_rarg3   - int* table
 7035    *
 7036    * Output:
 7037    *       r0   - int crc result
 7038    */
 7039   address generate_updateBytesCRC32C() {
 7040     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7041 
 7042     __ align(CodeEntryAlignment);
 7043     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id;
 7044     StubCodeMark mark(this, stub_id);
 7045 
 7046     address start = __ pc();
 7047 
 7048     const Register crc   = c_rarg0;  // crc
 7049     const Register buf   = c_rarg1;  // source java byte array address
 7050     const Register len   = c_rarg2;  // length
 7051     const Register table0 = c_rarg3; // crc_table address
 7052     const Register table1 = c_rarg4;
 7053     const Register table2 = c_rarg5;
 7054     const Register table3 = c_rarg6;
 7055     const Register tmp3 = c_rarg7;
 7056 
 7057     BLOCK_COMMENT("Entry:");
 7058     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7059 
 7060     __ kernel_crc32c(crc, buf, len,
 7061               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7062 
 7063     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7064     __ ret(lr);
 7065 
 7066     return start;
 7067   }
 7068 
 7069   /***
 7070    *  Arguments:
 7071    *
 7072    *  Inputs:
 7073    *   c_rarg0   - int   adler
 7074    *   c_rarg1   - byte* buff
 7075    *   c_rarg2   - int   len
 7076    *
 7077    * Output:
 7078    *   c_rarg0   - int adler result
 7079    */
 7080   address generate_updateBytesAdler32() {
 7081     __ align(CodeEntryAlignment);
 7082     StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
 7083     StubCodeMark mark(this, stub_id);
 7084     address start = __ pc();
 7085 
 7086     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7087 
 7088     // Aliases
 7089     Register adler  = c_rarg0;
 7090     Register s1     = c_rarg0;
 7091     Register s2     = c_rarg3;
 7092     Register buff   = c_rarg1;
 7093     Register len    = c_rarg2;
 7094     Register nmax  = r4;
 7095     Register base  = r5;
 7096     Register count = r6;
 7097     Register temp0 = rscratch1;
 7098     Register temp1 = rscratch2;
 7099     FloatRegister vbytes = v0;
 7100     FloatRegister vs1acc = v1;
 7101     FloatRegister vs2acc = v2;
 7102     FloatRegister vtable = v3;
 7103 
 7104     // Max number of bytes we can process before having to take the mod
 7105     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7106     uint64_t BASE = 0xfff1;
 7107     uint64_t NMAX = 0x15B0;
 7108 
 7109     __ mov(base, BASE);
 7110     __ mov(nmax, NMAX);
 7111 
 7112     // Load accumulation coefficients for the upper 16 bits
 7113     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7114     __ ld1(vtable, __ T16B, Address(temp0));
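           // vtable supplies the 16 per-byte weights for the
           // (b1, ..., b16) dot (16, 15, ..., 1) product computed in
           // generate_updateBytesAdler32_accum below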
 7115 
 7116     // s1 is initialized to the lower 16 bits of adler
 7117     // s2 is initialized to the upper 16 bits of adler
 7118     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7119     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7120 
 7121     // The pipelined loop needs at least 16 elements for 1 iteration
 7122     // It does check this, but it is more effective to skip to the cleanup loop
 7123     __ cmp(len, (u1)16);
 7124     __ br(Assembler::HS, L_nmax);
 7125     __ cbz(len, L_combine);
 7126 
 7127     __ bind(L_simple_by1_loop);
 7128     __ ldrb(temp0, Address(__ post(buff, 1)));
 7129     __ add(s1, s1, temp0);
 7130     __ add(s2, s2, s1);
 7131     __ subs(len, len, 1);
 7132     __ br(Assembler::HI, L_simple_by1_loop);
 7133 
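           // The mod-BASE reductions below avoid division: since 2^16 = 15 (mod 65521),
           // s can be folded as (s & 0xffff) + 15 * (s >> 16), with 15 * t formed as
           // (t << 4) - t. Depending on how large s may have grown, zero, one or two
           // folds bring it below 2 * BASE, after which a compare plus conditional
           // subtract (csel) finishes the reduction.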
 7134     // s1 = s1 % BASE
 7135     __ subs(temp0, s1, base);
 7136     __ csel(s1, temp0, s1, Assembler::HS);
 7137 
 7138     // s2 = s2 % BASE
 7139     __ lsr(temp0, s2, 16);
 7140     __ lsl(temp1, temp0, 4);
 7141     __ sub(temp1, temp1, temp0);
 7142     __ add(s2, temp1, s2, ext::uxth);
 7143 
 7144     __ subs(temp0, s2, base);
 7145     __ csel(s2, temp0, s2, Assembler::HS);
 7146 
 7147     __ b(L_combine);
 7148 
 7149     __ bind(L_nmax);
 7150     __ subs(len, len, nmax);
 7151     __ sub(count, nmax, 16);
 7152     __ br(Assembler::LO, L_by16);
 7153 
 7154     __ bind(L_nmax_loop);
 7155 
 7156     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7157                                       vbytes, vs1acc, vs2acc, vtable);
 7158 
 7159     __ subs(count, count, 16);
 7160     __ br(Assembler::HS, L_nmax_loop);
 7161 
 7162     // s1 = s1 % BASE
 7163     __ lsr(temp0, s1, 16);
 7164     __ lsl(temp1, temp0, 4);
 7165     __ sub(temp1, temp1, temp0);
 7166     __ add(temp1, temp1, s1, ext::uxth);
 7167 
 7168     __ lsr(temp0, temp1, 16);
 7169     __ lsl(s1, temp0, 4);
 7170     __ sub(s1, s1, temp0);
  7171     __ add(s1, s1, temp1, ext::uxth);
 7172 
 7173     __ subs(temp0, s1, base);
 7174     __ csel(s1, temp0, s1, Assembler::HS);
 7175 
 7176     // s2 = s2 % BASE
 7177     __ lsr(temp0, s2, 16);
 7178     __ lsl(temp1, temp0, 4);
 7179     __ sub(temp1, temp1, temp0);
 7180     __ add(temp1, temp1, s2, ext::uxth);
 7181 
 7182     __ lsr(temp0, temp1, 16);
 7183     __ lsl(s2, temp0, 4);
 7184     __ sub(s2, s2, temp0);
  7185     __ add(s2, s2, temp1, ext::uxth);
 7186 
 7187     __ subs(temp0, s2, base);
 7188     __ csel(s2, temp0, s2, Assembler::HS);
 7189 
 7190     __ subs(len, len, nmax);
 7191     __ sub(count, nmax, 16);
 7192     __ br(Assembler::HS, L_nmax_loop);
 7193 
 7194     __ bind(L_by16);
 7195     __ adds(len, len, count);
 7196     __ br(Assembler::LO, L_by1);
 7197 
 7198     __ bind(L_by16_loop);
 7199 
 7200     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7201                                       vbytes, vs1acc, vs2acc, vtable);
 7202 
 7203     __ subs(len, len, 16);
 7204     __ br(Assembler::HS, L_by16_loop);
 7205 
 7206     __ bind(L_by1);
 7207     __ adds(len, len, 15);
 7208     __ br(Assembler::LO, L_do_mod);
 7209 
 7210     __ bind(L_by1_loop);
 7211     __ ldrb(temp0, Address(__ post(buff, 1)));
 7212     __ add(s1, temp0, s1);
 7213     __ add(s2, s2, s1);
 7214     __ subs(len, len, 1);
 7215     __ br(Assembler::HS, L_by1_loop);
 7216 
 7217     __ bind(L_do_mod);
 7218     // s1 = s1 % BASE
 7219     __ lsr(temp0, s1, 16);
 7220     __ lsl(temp1, temp0, 4);
 7221     __ sub(temp1, temp1, temp0);
 7222     __ add(temp1, temp1, s1, ext::uxth);
 7223 
 7224     __ lsr(temp0, temp1, 16);
 7225     __ lsl(s1, temp0, 4);
 7226     __ sub(s1, s1, temp0);
  7227     __ add(s1, s1, temp1, ext::uxth);
 7228 
 7229     __ subs(temp0, s1, base);
 7230     __ csel(s1, temp0, s1, Assembler::HS);
 7231 
 7232     // s2 = s2 % BASE
 7233     __ lsr(temp0, s2, 16);
 7234     __ lsl(temp1, temp0, 4);
 7235     __ sub(temp1, temp1, temp0);
 7236     __ add(temp1, temp1, s2, ext::uxth);
 7237 
 7238     __ lsr(temp0, temp1, 16);
 7239     __ lsl(s2, temp0, 4);
 7240     __ sub(s2, s2, temp0);
  7241     __ add(s2, s2, temp1, ext::uxth);
 7242 
 7243     __ subs(temp0, s2, base);
 7244     __ csel(s2, temp0, s2, Assembler::HS);
 7245 
 7246     // Combine lower bits and higher bits
 7247     __ bind(L_combine);
 7248     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7249 
 7250     __ ret(lr);
 7251 
 7252     return start;
 7253   }
 7254 
 7255   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7256           Register temp0, Register temp1, FloatRegister vbytes,
 7257           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7258     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7259     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7260     // In non-vectorized code, we update s1 and s2 as:
 7261     //   s1 <- s1 + b1
 7262     //   s2 <- s2 + s1
 7263     //   s1 <- s1 + b2
  7264     //   s2 <- s2 + s1
 7265     //   ...
 7266     //   s1 <- s1 + b16
 7267     //   s2 <- s2 + s1
 7268     // Putting above assignments together, we have:
 7269     //   s1_new = s1 + b1 + b2 + ... + b16
 7270     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7271     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7272     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 7273     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7274 
 7275     // s2 = s2 + s1 * 16
 7276     __ add(s2, s2, s1, Assembler::LSL, 4);
 7277 
 7278     // vs1acc = b1 + b2 + b3 + ... + b16
 7279     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7280     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7281     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7282     __ uaddlv(vs1acc, __ T16B, vbytes);
 7283     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7284 
 7285     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7286     __ fmovd(temp0, vs1acc);
 7287     __ fmovd(temp1, vs2acc);
 7288     __ add(s1, s1, temp0);
 7289     __ add(s2, s2, temp1);
 7290   }
 7291 
 7292   /**
 7293    *  Arguments:
 7294    *
 7295    *  Input:
 7296    *    c_rarg0   - x address
 7297    *    c_rarg1   - x length
 7298    *    c_rarg2   - y address
 7299    *    c_rarg3   - y length
 7300    *    c_rarg4   - z address
 7301    */
 7302   address generate_multiplyToLen() {
 7303     __ align(CodeEntryAlignment);
 7304     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
 7305     StubCodeMark mark(this, stub_id);
 7306 
 7307     address start = __ pc();
 7308     const Register x     = r0;
 7309     const Register xlen  = r1;
 7310     const Register y     = r2;
 7311     const Register ylen  = r3;
 7312     const Register z     = r4;
 7313 
 7314     const Register tmp0  = r5;
 7315     const Register tmp1  = r10;
 7316     const Register tmp2  = r11;
 7317     const Register tmp3  = r12;
 7318     const Register tmp4  = r13;
 7319     const Register tmp5  = r14;
 7320     const Register tmp6  = r15;
 7321     const Register tmp7  = r16;
 7322 
 7323     BLOCK_COMMENT("Entry:");
 7324     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7325     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7326     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7327     __ ret(lr);
 7328 
 7329     return start;
 7330   }
 7331 
 7332   address generate_squareToLen() {
  7333     // The squareToLen algorithm for sizes 1..127 described in the Java code
  7334     // runs faster than multiply_to_len on some CPUs and slower on others, but
  7335     // multiply_to_len shows slightly better results overall.
 7336     __ align(CodeEntryAlignment);
 7337     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
 7338     StubCodeMark mark(this, stub_id);
 7339     address start = __ pc();
 7340 
 7341     const Register x     = r0;
 7342     const Register xlen  = r1;
 7343     const Register z     = r2;
 7344     const Register y     = r4; // == x
 7345     const Register ylen  = r5; // == xlen
 7346 
 7347     const Register tmp0  = r3;
 7348     const Register tmp1  = r10;
 7349     const Register tmp2  = r11;
 7350     const Register tmp3  = r12;
 7351     const Register tmp4  = r13;
 7352     const Register tmp5  = r14;
 7353     const Register tmp6  = r15;
 7354     const Register tmp7  = r16;
 7355 
 7356     RegSet spilled_regs = RegSet::of(y, ylen);
 7357     BLOCK_COMMENT("Entry:");
 7358     __ enter();
 7359     __ push(spilled_regs, sp);
 7360     __ mov(y, x);
 7361     __ mov(ylen, xlen);
 7362     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7363     __ pop(spilled_regs, sp);
 7364     __ leave();
 7365     __ ret(lr);
 7366     return start;
 7367   }
 7368 
 7369   address generate_mulAdd() {
 7370     __ align(CodeEntryAlignment);
 7371     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
 7372     StubCodeMark mark(this, stub_id);
 7373 
 7374     address start = __ pc();
 7375 
 7376     const Register out     = r0;
 7377     const Register in      = r1;
 7378     const Register offset  = r2;
 7379     const Register len     = r3;
 7380     const Register k       = r4;
 7381 
 7382     BLOCK_COMMENT("Entry:");
 7383     __ enter();
 7384     __ mul_add(out, in, offset, len, k);
 7385     __ leave();
 7386     __ ret(lr);
 7387 
 7388     return start;
 7389   }
 7390 
 7391   // Arguments:
 7392   //
 7393   // Input:
 7394   //   c_rarg0   - newArr address
 7395   //   c_rarg1   - oldArr address
 7396   //   c_rarg2   - newIdx
 7397   //   c_rarg3   - shiftCount
 7398   //   c_rarg4   - numIter
 7399   //
 7400   address generate_bigIntegerRightShift() {
 7401     __ align(CodeEntryAlignment);
 7402     StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
 7403     StubCodeMark mark(this, stub_id);
 7404     address start = __ pc();
 7405 
 7406     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7407 
 7408     Register newArr        = c_rarg0;
 7409     Register oldArr        = c_rarg1;
 7410     Register newIdx        = c_rarg2;
 7411     Register shiftCount    = c_rarg3;
 7412     Register numIter       = c_rarg4;
 7413     Register idx           = numIter;
 7414 
 7415     Register newArrCur     = rscratch1;
 7416     Register shiftRevCount = rscratch2;
 7417     Register oldArrCur     = r13;
 7418     Register oldArrNext    = r14;
 7419 
 7420     FloatRegister oldElem0        = v0;
 7421     FloatRegister oldElem1        = v1;
 7422     FloatRegister newElem         = v2;
 7423     FloatRegister shiftVCount     = v3;
 7424     FloatRegister shiftVRevCount  = v4;
 7425 
 7426     __ cbz(idx, Exit);
 7427 
 7428     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7429 
 7430     // left shift count
 7431     __ movw(shiftRevCount, 32);
 7432     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7433 
 7434     // numIter too small to allow a 4-words SIMD loop, rolling back
 7435     __ cmp(numIter, (u1)4);
 7436     __ br(Assembler::LT, ShiftThree);
 7437 
 7438     __ dup(shiftVCount,    __ T4S, shiftCount);
 7439     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7440     __ negr(shiftVCount,   __ T4S, shiftVCount);
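           // n.b. ushl shifts a lane left for a positive count and right for a
           // negative one, so the element shift count is negated to obtain the
           // logical right shift needed here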
 7441 
 7442     __ BIND(ShiftSIMDLoop);
 7443 
 7444     // Calculate the load addresses
 7445     __ sub(idx, idx, 4);
 7446     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7447     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7448     __ add(oldArrCur,  oldArrNext, 4);
 7449 
 7450     // Load 4 words and process
 7451     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7452     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7453     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7454     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7455     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7456     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7457 
 7458     __ cmp(idx, (u1)4);
 7459     __ br(Assembler::LT, ShiftTwoLoop);
 7460     __ b(ShiftSIMDLoop);
 7461 
 7462     __ BIND(ShiftTwoLoop);
 7463     __ cbz(idx, Exit);
 7464     __ cmp(idx, (u1)1);
 7465     __ br(Assembler::EQ, ShiftOne);
 7466 
 7467     // Calculate the load addresses
 7468     __ sub(idx, idx, 2);
 7469     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7470     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7471     __ add(oldArrCur,  oldArrNext, 4);
 7472 
 7473     // Load 2 words and process
 7474     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 7475     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 7476     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 7477     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 7478     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 7479     __ st1(newElem,   __ T2S, Address(newArrCur));
 7480     __ b(ShiftTwoLoop);
 7481 
 7482     __ BIND(ShiftThree);
 7483     __ tbz(idx, 1, ShiftOne);
 7484     __ tbz(idx, 0, ShiftTwo);
 7485     __ ldrw(r10,  Address(oldArr, 12));
 7486     __ ldrw(r11,  Address(oldArr, 8));
 7487     __ lsrvw(r10, r10, shiftCount);
 7488     __ lslvw(r11, r11, shiftRevCount);
 7489     __ orrw(r12,  r10, r11);
 7490     __ strw(r12,  Address(newArr, 8));
 7491 
 7492     __ BIND(ShiftTwo);
 7493     __ ldrw(r10,  Address(oldArr, 8));
 7494     __ ldrw(r11,  Address(oldArr, 4));
 7495     __ lsrvw(r10, r10, shiftCount);
 7496     __ lslvw(r11, r11, shiftRevCount);
 7497     __ orrw(r12,  r10, r11);
 7498     __ strw(r12,  Address(newArr, 4));
 7499 
 7500     __ BIND(ShiftOne);
 7501     __ ldrw(r10,  Address(oldArr, 4));
 7502     __ ldrw(r11,  Address(oldArr));
 7503     __ lsrvw(r10, r10, shiftCount);
 7504     __ lslvw(r11, r11, shiftRevCount);
 7505     __ orrw(r12,  r10, r11);
 7506     __ strw(r12,  Address(newArr));
 7507 
 7508     __ BIND(Exit);
 7509     __ ret(lr);
 7510 
 7511     return start;
 7512   }
 7513 
 7514   // Arguments:
 7515   //
 7516   // Input:
 7517   //   c_rarg0   - newArr address
 7518   //   c_rarg1   - oldArr address
 7519   //   c_rarg2   - newIdx
 7520   //   c_rarg3   - shiftCount
 7521   //   c_rarg4   - numIter
 7522   //
 7523   address generate_bigIntegerLeftShift() {
 7524     __ align(CodeEntryAlignment);
 7525     StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
 7526     StubCodeMark mark(this, stub_id);
 7527     address start = __ pc();
 7528 
 7529     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7530 
 7531     Register newArr        = c_rarg0;
 7532     Register oldArr        = c_rarg1;
 7533     Register newIdx        = c_rarg2;
 7534     Register shiftCount    = c_rarg3;
 7535     Register numIter       = c_rarg4;
 7536 
 7537     Register shiftRevCount = rscratch1;
 7538     Register oldArrNext    = rscratch2;
 7539 
 7540     FloatRegister oldElem0        = v0;
 7541     FloatRegister oldElem1        = v1;
 7542     FloatRegister newElem         = v2;
 7543     FloatRegister shiftVCount     = v3;
 7544     FloatRegister shiftVRevCount  = v4;
 7545 
 7546     __ cbz(numIter, Exit);
 7547 
 7548     __ add(oldArrNext, oldArr, 4);
 7549     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7550 
 7551     // right shift count
 7552     __ movw(shiftRevCount, 32);
 7553     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7554 
 7555     // numIter too small to allow a 4-words SIMD loop, rolling back
 7556     __ cmp(numIter, (u1)4);
 7557     __ br(Assembler::LT, ShiftThree);
 7558 
 7559     __ dup(shiftVCount,     __ T4S, shiftCount);
 7560     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 7561     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 7562 
 7563     __ BIND(ShiftSIMDLoop);
 7564 
 7565     // load 4 words and process
 7566     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 7567     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 7568     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7569     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7570     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7571     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 7572     __ sub(numIter,   numIter, 4);
 7573 
 7574     __ cmp(numIter, (u1)4);
 7575     __ br(Assembler::LT, ShiftTwoLoop);
 7576     __ b(ShiftSIMDLoop);
 7577 
 7578     __ BIND(ShiftTwoLoop);
 7579     __ cbz(numIter, Exit);
 7580     __ cmp(numIter, (u1)1);
 7581     __ br(Assembler::EQ, ShiftOne);
 7582 
 7583     // load 2 words and process
 7584     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 7585     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 7586     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 7587     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 7588     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 7589     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 7590     __ sub(numIter,   numIter, 2);
 7591     __ b(ShiftTwoLoop);
 7592 
 7593     __ BIND(ShiftThree);
 7594     __ ldrw(r10,  __ post(oldArr, 4));
 7595     __ ldrw(r11,  __ post(oldArrNext, 4));
 7596     __ lslvw(r10, r10, shiftCount);
 7597     __ lsrvw(r11, r11, shiftRevCount);
 7598     __ orrw(r12,  r10, r11);
 7599     __ strw(r12,  __ post(newArr, 4));
 7600     __ tbz(numIter, 1, Exit);
 7601     __ tbz(numIter, 0, ShiftOne);
 7602 
 7603     __ BIND(ShiftTwo);
 7604     __ ldrw(r10,  __ post(oldArr, 4));
 7605     __ ldrw(r11,  __ post(oldArrNext, 4));
 7606     __ lslvw(r10, r10, shiftCount);
 7607     __ lsrvw(r11, r11, shiftRevCount);
 7608     __ orrw(r12,  r10, r11);
 7609     __ strw(r12,  __ post(newArr, 4));
 7610 
 7611     __ BIND(ShiftOne);
 7612     __ ldrw(r10,  Address(oldArr));
 7613     __ ldrw(r11,  Address(oldArrNext));
 7614     __ lslvw(r10, r10, shiftCount);
 7615     __ lsrvw(r11, r11, shiftRevCount);
 7616     __ orrw(r12,  r10, r11);
 7617     __ strw(r12,  Address(newArr));
 7618 
 7619     __ BIND(Exit);
 7620     __ ret(lr);
 7621 
 7622     return start;
 7623   }
 7624 
 7625   address generate_count_positives(address &count_positives_long) {
 7626     const u1 large_loop_size = 64;
 7627     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
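           // UPPER_BIT_MASK selects the sign bit of each of the 8 bytes packed in a
           // 64-bit word: tst'ing a word against it is non-zero iff at least one of
           // those bytes is negative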
 7628     int dcache_line = VM_Version::dcache_line_size();
 7629 
 7630     Register ary1 = r1, len = r2, result = r0;
 7631 
 7632     __ align(CodeEntryAlignment);
 7633 
 7634     StubGenStubId stub_id = StubGenStubId::count_positives_id;
 7635     StubCodeMark mark(this, stub_id);
 7636 
 7637     address entry = __ pc();
 7638 
 7639     __ enter();
 7640     // precondition: a copy of len is already in result
 7641     // __ mov(result, len);
 7642 
 7643   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 7644         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 7645 
 7646   __ cmp(len, (u1)15);
 7647   __ br(Assembler::GT, LEN_OVER_15);
  7648   // The only case when execution falls into this code is when the pointer is
  7649   // near the end of a memory page and we have to avoid reading past it
 7650   __ add(ary1, ary1, len);
 7651   __ subs(len, len, 8);
 7652   __ br(Assembler::GT, LEN_OVER_8);
 7653   __ ldr(rscratch2, Address(ary1, -8));
 7654   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 7655   __ lsrv(rscratch2, rscratch2, rscratch1);
 7656   __ tst(rscratch2, UPPER_BIT_MASK);
 7657   __ csel(result, zr, result, Assembler::NE);
 7658   __ leave();
 7659   __ ret(lr);
 7660   __ bind(LEN_OVER_8);
 7661   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 7662   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
 7663   __ tst(rscratch2, UPPER_BIT_MASK);
 7664   __ br(Assembler::NE, RET_NO_POP);
 7665   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 7666   __ lsrv(rscratch1, rscratch1, rscratch2);
 7667   __ tst(rscratch1, UPPER_BIT_MASK);
 7668   __ bind(RET_NO_POP);
 7669   __ csel(result, zr, result, Assembler::NE);
 7670   __ leave();
 7671   __ ret(lr);
 7672 
 7673   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 7674   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 7675 
 7676   count_positives_long = __ pc(); // 2nd entry point
 7677 
 7678   __ enter();
 7679 
 7680   __ bind(LEN_OVER_15);
 7681     __ push(spilled_regs, sp);
 7682     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 7683     __ cbz(rscratch2, ALIGNED);
 7684     __ ldp(tmp6, tmp1, Address(ary1));
 7685     __ mov(tmp5, 16);
 7686     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 7687     __ add(ary1, ary1, rscratch1);
 7688     __ orr(tmp6, tmp6, tmp1);
 7689     __ tst(tmp6, UPPER_BIT_MASK);
 7690     __ br(Assembler::NE, RET_ADJUST);
 7691     __ sub(len, len, rscratch1);
 7692 
 7693   __ bind(ALIGNED);
 7694     __ cmp(len, large_loop_size);
 7695     __ br(Assembler::LT, CHECK_16);
 7696     // Perform 16-byte load as early return in pre-loop to handle situation
 7697     // when initially aligned large array has negative values at starting bytes,
 7698     // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
 7699     // slower. Cases with negative bytes further ahead won't be affected that
 7700     // much. In fact, it'll be faster due to early loads, less instructions and
 7701     // less branches in LARGE_LOOP.
 7702     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 7703     __ sub(len, len, 16);
 7704     __ orr(tmp6, tmp6, tmp1);
 7705     __ tst(tmp6, UPPER_BIT_MASK);
 7706     __ br(Assembler::NE, RET_ADJUST_16);
 7707     __ cmp(len, large_loop_size);
 7708     __ br(Assembler::LT, CHECK_16);
 7709 
 7710     if (SoftwarePrefetchHintDistance >= 0
 7711         && SoftwarePrefetchHintDistance >= dcache_line) {
 7712       // initial prefetch
 7713       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 7714     }
 7715   __ bind(LARGE_LOOP);
 7716     if (SoftwarePrefetchHintDistance >= 0) {
 7717       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 7718     }
  7719     // Issue the load instructions first, since that can save a few CPU/MEM
  7720     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);"
  7721     // (one per ldp), generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which
  7722     // saves 3 instructions and has fewer branches. The downside is that early
  7723     // return is disabled, so all 64 bytes are loaded and checked every time.
 7724     __ ldp(tmp2, tmp3, Address(ary1));
 7725     __ ldp(tmp4, tmp5, Address(ary1, 16));
 7726     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 7727     __ ldp(tmp6, tmp1, Address(ary1, 48));
 7728     __ add(ary1, ary1, large_loop_size);
 7729     __ sub(len, len, large_loop_size);
 7730     __ orr(tmp2, tmp2, tmp3);
 7731     __ orr(tmp4, tmp4, tmp5);
 7732     __ orr(rscratch1, rscratch1, rscratch2);
 7733     __ orr(tmp6, tmp6, tmp1);
 7734     __ orr(tmp2, tmp2, tmp4);
 7735     __ orr(rscratch1, rscratch1, tmp6);
 7736     __ orr(tmp2, tmp2, rscratch1);
 7737     __ tst(tmp2, UPPER_BIT_MASK);
 7738     __ br(Assembler::NE, RET_ADJUST_LONG);
 7739     __ cmp(len, large_loop_size);
 7740     __ br(Assembler::GE, LARGE_LOOP);
 7741 
 7742   __ bind(CHECK_16); // small 16-byte load pre-loop
 7743     __ cmp(len, (u1)16);
 7744     __ br(Assembler::LT, POST_LOOP16);
 7745 
 7746   __ bind(LOOP16); // small 16-byte load loop
 7747     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 7748     __ sub(len, len, 16);
 7749     __ orr(tmp2, tmp2, tmp3);
 7750     __ tst(tmp2, UPPER_BIT_MASK);
 7751     __ br(Assembler::NE, RET_ADJUST_16);
 7752     __ cmp(len, (u1)16);
 7753     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 7754 
 7755   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 7756     __ cmp(len, (u1)8);
 7757     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 7758     __ ldr(tmp3, Address(__ post(ary1, 8)));
 7759     __ tst(tmp3, UPPER_BIT_MASK);
 7760     __ br(Assembler::NE, RET_ADJUST);
 7761     __ sub(len, len, 8);
 7762 
 7763   __ bind(POST_LOOP16_LOAD_TAIL);
 7764     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 7765     __ ldr(tmp1, Address(ary1));
 7766     __ mov(tmp2, 64);
 7767     __ sub(tmp4, tmp2, len, __ LSL, 3);
 7768     __ lslv(tmp1, tmp1, tmp4);
 7769     __ tst(tmp1, UPPER_BIT_MASK);
 7770     __ br(Assembler::NE, RET_ADJUST);
 7771     // Fallthrough
 7772 
 7773   __ bind(RET_LEN);
 7774     __ pop(spilled_regs, sp);
 7775     __ leave();
 7776     __ ret(lr);
 7777 
  7778     // the difference (result - len) is the count of bytes guaranteed
  7779     // to be positive
 7780 
 7781   __ bind(RET_ADJUST_LONG);
 7782     __ add(len, len, (u1)(large_loop_size - 16));
 7783   __ bind(RET_ADJUST_16);
 7784     __ add(len, len, 16);
 7785   __ bind(RET_ADJUST);
 7786     __ pop(spilled_regs, sp);
 7787     __ leave();
 7788     __ sub(result, result, len);
 7789     __ ret(lr);
 7790 
 7791     return entry;
 7792   }
 7793 
 7794   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 7795         bool usePrefetch, Label &NOT_EQUAL) {
 7796     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 7797         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 7798         tmp7 = r12, tmp8 = r13;
 7799     Label LOOP;
 7800 
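           // Each iteration compares 8 words (64 bytes) per array. The ldp for the
           // next word pair is issued before the eor/orr checks of the previous pair
           // so loads and compares overlap; the first pair is pre-loaded just below,
           // before entering the loop.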
 7801     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 7802     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 7803     __ bind(LOOP);
 7804     if (usePrefetch) {
 7805       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 7806       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 7807     }
 7808     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 7809     __ eor(tmp1, tmp1, tmp2);
 7810     __ eor(tmp3, tmp3, tmp4);
 7811     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 7812     __ orr(tmp1, tmp1, tmp3);
 7813     __ cbnz(tmp1, NOT_EQUAL);
 7814     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 7815     __ eor(tmp5, tmp5, tmp6);
 7816     __ eor(tmp7, tmp7, tmp8);
 7817     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 7818     __ orr(tmp5, tmp5, tmp7);
 7819     __ cbnz(tmp5, NOT_EQUAL);
 7820     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 7821     __ eor(tmp1, tmp1, tmp2);
 7822     __ eor(tmp3, tmp3, tmp4);
 7823     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 7824     __ orr(tmp1, tmp1, tmp3);
 7825     __ cbnz(tmp1, NOT_EQUAL);
 7826     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 7827     __ eor(tmp5, tmp5, tmp6);
 7828     __ sub(cnt1, cnt1, 8 * wordSize);
 7829     __ eor(tmp7, tmp7, tmp8);
 7830     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 7831     // tmp6 is not used. MacroAssembler::subs is used here (rather than
 7832     // cmp) because subs allows an unlimited range of immediate operand.
 7833     __ subs(tmp6, cnt1, loopThreshold);
 7834     __ orr(tmp5, tmp5, tmp7);
 7835     __ cbnz(tmp5, NOT_EQUAL);
 7836     __ br(__ GE, LOOP);
 7837     // post-loop
 7838     __ eor(tmp1, tmp1, tmp2);
 7839     __ eor(tmp3, tmp3, tmp4);
 7840     __ orr(tmp1, tmp1, tmp3);
 7841     __ sub(cnt1, cnt1, 2 * wordSize);
 7842     __ cbnz(tmp1, NOT_EQUAL);
 7843   }
 7844 
 7845   void generate_large_array_equals_loop_simd(int loopThreshold,
 7846         bool usePrefetch, Label &NOT_EQUAL) {
 7847     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 7848         tmp2 = rscratch2;
 7849     Label LOOP;
 7850 
 7851     __ bind(LOOP);
 7852     if (usePrefetch) {
 7853       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 7854       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 7855     }
 7856     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 7857     __ sub(cnt1, cnt1, 8 * wordSize);
 7858     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 7859     __ subs(tmp1, cnt1, loopThreshold);
 7860     __ eor(v0, __ T16B, v0, v4);
 7861     __ eor(v1, __ T16B, v1, v5);
 7862     __ eor(v2, __ T16B, v2, v6);
 7863     __ eor(v3, __ T16B, v3, v7);
 7864     __ orr(v0, __ T16B, v0, v1);
 7865     __ orr(v1, __ T16B, v2, v3);
 7866     __ orr(v0, __ T16B, v0, v1);
 7867     __ umov(tmp1, v0, __ D, 0);
 7868     __ umov(tmp2, v0, __ D, 1);
 7869     __ orr(tmp1, tmp1, tmp2);
 7870     __ cbnz(tmp1, NOT_EQUAL);
 7871     __ br(__ GE, LOOP);
 7872   }
 7873 
 7874   // a1 = r1 - array1 address
 7875   // a2 = r2 - array2 address
 7876   // result = r0 - return value. Already contains "false"
 7877   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 7878   // r3-r5 are reserved temporary registers
 7879   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
 7880   address generate_large_array_equals() {
 7881     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 7882         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 7883         tmp7 = r12, tmp8 = r13;
 7884     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 7885         SMALL_LOOP, POST_LOOP;
 7886     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 7887     // calculate if at least 32 prefetched bytes are used
 7888     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 7889     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 7890     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 7891     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 7892         tmp5, tmp6, tmp7, tmp8);
 7893 
 7894     __ align(CodeEntryAlignment);
 7895 
 7896     StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
 7897     StubCodeMark mark(this, stub_id);
 7898 
 7899     address entry = __ pc();
 7900     __ enter();
 7901     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 7902     // also advance pointers to use post-increment instead of pre-increment
 7903     __ add(a1, a1, wordSize);
 7904     __ add(a2, a2, wordSize);
 7905     if (AvoidUnalignedAccesses) {
  7906       // both implementations (SIMD/nonSIMD) use relatively large load
  7907       // instructions (ld1/ldp), which carry a huge penalty (up to 2x exec time)
  7908       // on some CPUs when the address is not at least 16-byte aligned.
  7909       // Arrays are currently 8-byte aligned, so do an additional 8-byte load
  7910       // if needed to make at least the first address 16-byte aligned.
 7911       Label ALIGNED16;
 7912       __ tbz(a1, 3, ALIGNED16);
 7913       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 7914       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 7915       __ sub(cnt1, cnt1, wordSize);
 7916       __ eor(tmp1, tmp1, tmp2);
 7917       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 7918       __ bind(ALIGNED16);
 7919     }
 7920     if (UseSIMDForArrayEquals) {
 7921       if (SoftwarePrefetchHintDistance >= 0) {
 7922         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 7923         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 7924         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 7925             /* prfm = */ true, NOT_EQUAL);
 7926         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 7927         __ br(__ LT, TAIL);
 7928       }
 7929       __ bind(NO_PREFETCH_LARGE_LOOP);
 7930       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 7931           /* prfm = */ false, NOT_EQUAL);
 7932     } else {
 7933       __ push(spilled_regs, sp);
 7934       if (SoftwarePrefetchHintDistance >= 0) {
 7935         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 7936         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 7937         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 7938             /* prfm = */ true, NOT_EQUAL);
 7939         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 7940         __ br(__ LT, TAIL);
 7941       }
 7942       __ bind(NO_PREFETCH_LARGE_LOOP);
 7943       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 7944           /* prfm = */ false, NOT_EQUAL);
 7945     }
 7946     __ bind(TAIL);
 7947       __ cbz(cnt1, EQUAL);
 7948       __ subs(cnt1, cnt1, wordSize);
 7949       __ br(__ LE, POST_LOOP);
 7950     __ bind(SMALL_LOOP);
 7951       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 7952       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 7953       __ subs(cnt1, cnt1, wordSize);
 7954       __ eor(tmp1, tmp1, tmp2);
 7955       __ cbnz(tmp1, NOT_EQUAL);
 7956       __ br(__ GT, SMALL_LOOP);
 7957     __ bind(POST_LOOP);
 7958       __ ldr(tmp1, Address(a1, cnt1));
 7959       __ ldr(tmp2, Address(a2, cnt1));
 7960       __ eor(tmp1, tmp1, tmp2);
 7961       __ cbnz(tmp1, NOT_EQUAL);
 7962     __ bind(EQUAL);
 7963       __ mov(result, true);
 7964     __ bind(NOT_EQUAL);
 7965       if (!UseSIMDForArrayEquals) {
 7966         __ pop(spilled_regs, sp);
 7967       }
 7968     __ bind(NOT_EQUAL_NO_POP);
 7969     __ leave();
 7970     __ ret(lr);
 7971     return entry;
 7972   }
 7973 
 7974   // result = r0 - return value. Contains initial hashcode value on entry.
 7975   // ary = r1 - array address
 7976   // cnt = r2 - elements count
 7977   // Clobbers: v0-v13, rscratch1, rscratch2
 7978   address generate_large_arrays_hashcode(BasicType eltype) {
 7979     const Register result = r0, ary = r1, cnt = r2;
 7980     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 7981     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 7982     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 7983     const FloatRegister vpowm = v13;
 7984 
 7985     ARRAYS_HASHCODE_REGISTERS;
 7986 
 7987     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 7988 
 7989     unsigned int vf; // vectorization factor
 7990     bool multiply_by_halves;
 7991     Assembler::SIMD_Arrangement load_arrangement;
 7992     switch (eltype) {
 7993     case T_BOOLEAN:
 7994     case T_BYTE:
 7995       load_arrangement = Assembler::T8B;
 7996       multiply_by_halves = true;
 7997       vf = 8;
 7998       break;
 7999     case T_CHAR:
 8000     case T_SHORT:
 8001       load_arrangement = Assembler::T8H;
 8002       multiply_by_halves = true;
 8003       vf = 8;
 8004       break;
 8005     case T_INT:
 8006       load_arrangement = Assembler::T4S;
 8007       multiply_by_halves = false;
 8008       vf = 4;
 8009       break;
 8010     default:
 8011       ShouldNotReachHere();
 8012     }
 8013 
 8014     // Unroll factor
 8015     const unsigned uf = 4;
 8016 
 8017     // Effective vectorization factor
 8018     const unsigned evf = vf * uf;
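           // Sketch of the scheme: partial hashes are kept in the 4S lanes of vmul0
           // (and vmul1..vmul3 in the large loop). Every pass scales the accumulated
           // lanes by the appropriate power of 31 (held in vpowm) and adds the next
           // batch of elements, one per lane. The epilogues then weight the lanes by
           // vpow = <31^3, 31^2, 31, 1> and sum across lanes, recombining the partial
           // hashes into the scalar polynomial hash the Java code computes.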
 8019 
 8020     __ align(CodeEntryAlignment);
 8021 
 8022     StubGenStubId stub_id;
 8023     switch (eltype) {
 8024     case T_BOOLEAN:
 8025       stub_id = StubGenStubId::large_arrays_hashcode_boolean_id;
 8026       break;
 8027     case T_BYTE:
 8028       stub_id = StubGenStubId::large_arrays_hashcode_byte_id;
 8029       break;
 8030     case T_CHAR:
 8031       stub_id = StubGenStubId::large_arrays_hashcode_char_id;
 8032       break;
 8033     case T_SHORT:
 8034       stub_id = StubGenStubId::large_arrays_hashcode_short_id;
 8035       break;
 8036     case T_INT:
 8037       stub_id = StubGenStubId::large_arrays_hashcode_int_id;
 8038       break;
 8039     default:
 8040       stub_id = StubGenStubId::NO_STUBID;
 8041       ShouldNotReachHere();
 8042     };
 8043 
 8044     StubCodeMark mark(this, stub_id);
 8045 
 8046     address entry = __ pc();
 8047     __ enter();
 8048 
 8049     // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in
 8050     // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
 8051     // value shouldn't change throughout both loops.
 8052     __ movw(rscratch1, intpow(31U, 3));
 8053     __ mov(vpow, Assembler::S, 0, rscratch1);
 8054     __ movw(rscratch1, intpow(31U, 2));
 8055     __ mov(vpow, Assembler::S, 1, rscratch1);
 8056     __ movw(rscratch1, intpow(31U, 1));
 8057     __ mov(vpow, Assembler::S, 2, rscratch1);
 8058     __ movw(rscratch1, intpow(31U, 0));
 8059     __ mov(vpow, Assembler::S, 3, rscratch1);
 8060 
 8061     __ mov(vmul0, Assembler::T16B, 0);
 8062     __ mov(vmul0, Assembler::S, 3, result);
 8063 
 8064     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8065     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8066 
 8067     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8068     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8069 
 8070     // SMALL LOOP
 8071     __ bind(SMALL_LOOP);
 8072 
 8073     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8074     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8075     __ subsw(rscratch2, rscratch2, vf);
 8076 
 8077     if (load_arrangement == Assembler::T8B) {
 8078       // Extend 8B to 8H to be able to use vector multiply
 8079       // instructions
 8080       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8081       if (is_signed_subword_type(eltype)) {
 8082         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8083       } else {
 8084         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8085       }
 8086     }
 8087 
 8088     switch (load_arrangement) {
 8089     case Assembler::T4S:
 8090       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8091       break;
 8092     case Assembler::T8B:
 8093     case Assembler::T8H:
 8094       assert(is_subword_type(eltype), "subword type expected");
 8095       if (is_signed_subword_type(eltype)) {
 8096         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8097       } else {
 8098         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8099       }
 8100       break;
 8101     default:
 8102       __ should_not_reach_here();
 8103     }
 8104 
 8105     // Process the upper half of a vector
 8106     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8107       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8108       if (is_signed_subword_type(eltype)) {
 8109         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8110       } else {
 8111         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8112       }
 8113     }
 8114 
 8115     __ br(Assembler::HI, SMALL_LOOP);
 8116 
  8117     // SMALL LOOP'S EPILOGUE
 8118     __ lsr(rscratch2, cnt, exact_log2(evf));
 8119     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8120 
 8121     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8122     __ addv(vmul0, Assembler::T4S, vmul0);
 8123     __ umov(result, vmul0, Assembler::S, 0);
 8124 
 8125     // TAIL
 8126     __ bind(TAIL);
 8127 
 8128     // The andr computes cnt % vf. The subtract, with its operand shifted left by 3, skips past
 8129     // vf - 1 - (cnt % vf) pairs of load + madd instructions, i.e. only cnt % vf pairs are executed.
 8130     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 8131     __ andr(rscratch2, cnt, vf - 1);
 8132     __ bind(TAIL_SHORTCUT);
 8133     __ adr(rscratch1, BR_BASE);
 8134     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
 8135     __ movw(rscratch2, 0x1f);
 8136     __ br(rscratch1);
 8137 
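          // The unrolled sequence below consists of vf - 1 (load; maddw) pairs, each 8 bytes of
          // code, so branching to BR_BASE - 8 * (cnt % vf) executes exactly the last cnt % vf
          // pairs. A rough scalar equivalent of this tail (names illustrative only):
          //
          //   for (int k = 0; k < cnt % vf; k++)
          //     result = 31 * result + *ary++;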
 8138     for (size_t i = 0; i < vf - 1; ++i) {
 8139       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8140                                    eltype);
 8141       __ maddw(result, result, rscratch2, rscratch1);
 8142     }
 8143     __ bind(BR_BASE);
 8144 
 8145     __ leave();
 8146     __ ret(lr);
 8147 
 8148     // LARGE LOOP
 8149     __ bind(LARGE_LOOP_PREHEADER);
 8150 
 8151     __ lsr(rscratch2, cnt, exact_log2(evf));
 8152 
 8153     if (multiply_by_halves) {
 8154       // 31^4 - multiplier between lower and upper parts of a register
 8155       __ movw(rscratch1, intpow(31U, vf / 2));
 8156       __ mov(vpowm, Assembler::S, 1, rscratch1);
 8157       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 8158       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8159       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8160     } else {
 8161       // 31^16
 8162       __ movw(rscratch1, intpow(31U, evf));
 8163       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8164     }
 8165 
 8166     __ mov(vmul3, Assembler::T16B, 0);
 8167     __ mov(vmul2, Assembler::T16B, 0);
 8168     __ mov(vmul1, Assembler::T16B, 0);
 8169 
 8170     __ bind(LARGE_LOOP);
 8171 
 8172     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8173     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8174     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8175     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8176 
 8177     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8178            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8179 
 8180     if (load_arrangement == Assembler::T8B) {
 8181       // Extend 8B to 8H to be able to use vector multiply
 8182       // instructions
 8183       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8184       if (is_signed_subword_type(eltype)) {
 8185         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8186         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8187         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8188         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8189       } else {
 8190         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8191         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8192         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8193         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8194       }
 8195     }
 8196 
 8197     switch (load_arrangement) {
 8198     case Assembler::T4S:
 8199       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8200       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8201       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8202       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8203       break;
 8204     case Assembler::T8B:
 8205     case Assembler::T8H:
 8206       assert(is_subword_type(eltype), "subword type expected");
 8207       if (is_signed_subword_type(eltype)) {
 8208         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8209         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8210         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8211         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8212       } else {
 8213         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8214         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8215         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8216         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8217       }
 8218       break;
 8219     default:
 8220       __ should_not_reach_here();
 8221     }
 8222 
 8223     // Process the upper half of a vector
 8224     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8225       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8226       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8227       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8228       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8229       if (is_signed_subword_type(eltype)) {
 8230         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8231         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8232         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8233         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8234       } else {
 8235         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8236         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8237         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8238         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8239       }
 8240     }
 8241 
 8242     __ subsw(rscratch2, rscratch2, 1);
 8243     __ br(Assembler::HI, LARGE_LOOP);
 8244 
 8245     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8246     __ addv(vmul3, Assembler::T4S, vmul3);
 8247     __ umov(result, vmul3, Assembler::S, 0);
 8248 
 8249     __ mov(rscratch2, intpow(31U, vf));
 8250 
 8251     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8252     __ addv(vmul2, Assembler::T4S, vmul2);
 8253     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8254     __ maddw(result, result, rscratch2, rscratch1);
 8255 
 8256     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8257     __ addv(vmul1, Assembler::T4S, vmul1);
 8258     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8259     __ maddw(result, result, rscratch2, rscratch1);
 8260 
 8261     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8262     __ addv(vmul0, Assembler::T4S, vmul0);
 8263     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8264     __ maddw(result, result, rscratch2, rscratch1);
 8265 
 8266     __ andr(rscratch2, cnt, vf - 1);
 8267     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8268 
 8269     __ leave();
 8270     __ ret(lr);
 8271 
 8272     return entry;
 8273   }
 8274 
 8275   address generate_dsin_dcos(bool isCos) {
 8276     __ align(CodeEntryAlignment);
 8277     StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id);
 8278     StubCodeMark mark(this, stub_id);
 8279     address start = __ pc();
 8280     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8281         (address)StubRoutines::aarch64::_two_over_pi,
 8282         (address)StubRoutines::aarch64::_pio2,
 8283         (address)StubRoutines::aarch64::_dsin_coef,
 8284         (address)StubRoutines::aarch64::_dcos_coef);
 8285     return start;
 8286   }
 8287 
 8288   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
 8289   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8290       Label &DIFF2) {
 8291     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8292     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8293 
 8294     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8295     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8296     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8297     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 8298 
 8299     __ fmovd(tmpL, vtmp3);
 8300     __ eor(rscratch2, tmp3, tmpL);
 8301     __ cbnz(rscratch2, DIFF2);
 8302 
 8303     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8304     __ umov(tmpL, vtmp3, __ D, 1);
 8305     __ eor(rscratch2, tmpU, tmpL);
 8306     __ cbnz(rscratch2, DIFF1);
 8307 
 8308     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8309     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8310     __ fmovd(tmpL, vtmp);
 8311     __ eor(rscratch2, tmp3, tmpL);
 8312     __ cbnz(rscratch2, DIFF2);
 8313 
 8314     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8315     __ umov(tmpL, vtmp, __ D, 1);
 8316     __ eor(rscratch2, tmpU, tmpL);
 8317     __ cbnz(rscratch2, DIFF1);
 8318   }
 8319 
 8320   // r0  = result
 8321   // r1  = str1
 8322   // r2  = cnt1
 8323   // r3  = str2
 8324   // r4  = cnt2
 8325   // r10 = tmp1
 8326   // r11 = tmp2
 8327   address generate_compare_long_string_different_encoding(bool isLU) {
 8328     __ align(CodeEntryAlignment);
 8329     StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id);
 8330     StubCodeMark mark(this, stub_id);
 8331     address entry = __ pc();
 8332     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8333         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8334         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8335     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8336         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8337     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8338     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8339 
 8340     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8341 
 8342     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
 8343     // cnt2 == number of characters left to compare
 8344     // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
 8345     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8346     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8347     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8348     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 8349     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
 8350     __ eor(rscratch2, tmp1, tmp2);
 8351     __ mov(rscratch1, tmp2);
 8352     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8353     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8354              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8355     __ push(spilled_regs, sp);
 8356     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8357     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8358 
 8359     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8360 
 8361     if (SoftwarePrefetchHintDistance >= 0) {
 8362       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8363       __ br(__ LT, NO_PREFETCH);
 8364       __ bind(LARGE_LOOP_PREFETCH);
 8365         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8366         __ mov(tmp4, 2);
 8367         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8368         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8369           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8370           __ subs(tmp4, tmp4, 1);
 8371           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8372           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8373           __ mov(tmp4, 2);
 8374         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8375           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8376           __ subs(tmp4, tmp4, 1);
 8377           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8378           __ sub(cnt2, cnt2, 64);
 8379           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8380           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8381     }
 8382     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8383     __ bind(NO_PREFETCH);
 8384     __ subs(cnt2, cnt2, 16);
 8385     __ br(__ LT, TAIL);
 8386     __ align(OptoLoopAlignment);
 8387     __ bind(SMALL_LOOP); // smaller loop
 8388       __ subs(cnt2, cnt2, 16);
 8389       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8390       __ br(__ GE, SMALL_LOOP);
 8391       __ cmn(cnt2, (u1)16);
 8392       __ br(__ EQ, LOAD_LAST);
 8393     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8394       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8395       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8396       __ ldr(tmp3, Address(cnt1, -8));
 8397       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8398       __ b(LOAD_LAST);
 8399     __ bind(DIFF2);
 8400       __ mov(tmpU, tmp3);
 8401     __ bind(DIFF1);
 8402       __ pop(spilled_regs, sp);
 8403       __ b(CALCULATE_DIFFERENCE);
 8404     __ bind(LOAD_LAST);
 8405       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
 8406       // No need to load them again
 8407       __ mov(tmpU, tmp3);
 8408       __ pop(spilled_regs, sp);
 8409 
 8410       // tmp2 points to the address of the last 4 Latin1 characters right now
 8411       __ ldrs(vtmp, Address(tmp2));
 8412       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8413       __ fmovd(tmpL, vtmp);
 8414 
 8415       __ eor(rscratch2, tmpU, tmpL);
 8416       __ cbz(rscratch2, DONE);
 8417 
 8418     // Find the first pair of differing characters in the longwords and
 8419     // compute their difference.
 8420     __ bind(CALCULATE_DIFFERENCE);
 8421       __ rev(rscratch2, rscratch2);
 8422       __ clz(rscratch2, rscratch2);
 8423       __ andr(rscratch2, rscratch2, -16);
 8424       __ lsrv(tmp1, tmp1, rscratch2);
 8425       __ uxthw(tmp1, tmp1);
 8426       __ lsrv(rscratch1, rscratch1, rscratch2);
 8427       __ uxthw(rscratch1, rscratch1);
 8428       __ subw(result, tmp1, rscratch1);
 8429     __ bind(DONE);
 8430       __ ret(lr);
 8431     return entry;
 8432   }
 8433 
 8434   // r0 = input (float16)
 8435   // v0 = result (float)
 8436   // v1 = temporary float register
 8437   address generate_float16ToFloat() {
 8438     __ align(CodeEntryAlignment);
 8439     StubGenStubId stub_id = StubGenStubId::hf2f_id;
 8440     StubCodeMark mark(this, stub_id);
 8441     address entry = __ pc();
 8442     BLOCK_COMMENT("Entry:");
 8443     __ flt16_to_flt(v0, r0, v1);
 8444     __ ret(lr);
 8445     return entry;
 8446   }
 8447 
 8448   // v0 = input (float)
 8449   // r0 = result (float16)
 8450   // v1 = temporary float register
 8451   address generate_floatToFloat16() {
 8452     __ align(CodeEntryAlignment);
 8453     StubGenStubId stub_id = StubGenStubId::f2hf_id;
 8454     StubCodeMark mark(this, stub_id);
 8455     address entry = __ pc();
 8456     BLOCK_COMMENT("Entry:");
 8457     __ flt_to_flt16(r0, v0, v1);
 8458     __ ret(lr);
 8459     return entry;
 8460   }
 8461 
 8462   address generate_method_entry_barrier() {
 8463     __ align(CodeEntryAlignment);
 8464     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
 8465     StubCodeMark mark(this, stub_id);
 8466 
 8467     Label deoptimize_label;
 8468 
 8469     address start = __ pc();
 8470 
 8471     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 8472 
 8473     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 8474       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 8475     // We can get here despite the nmethod being good, if we have not
 8476     // yet applied our cross-modification fence (or data fence).
 8477       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 8478       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 8479       __ ldrw(rscratch2, rscratch2);
 8480       __ strw(rscratch2, thread_epoch_addr);
 8481       __ isb();
 8482       __ membar(__ LoadLoad);
 8483     }
 8484 
 8485     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 8486 
 8487     __ enter();
 8488     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 8489 
 8490     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 8491 
 8492     __ push_call_clobbered_registers();
 8493 
 8494     __ mov(c_rarg0, rscratch2);
 8495     __ call_VM_leaf
 8496          (CAST_FROM_FN_PTR
 8497           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 8498 
 8499     __ reset_last_Java_frame(true);
 8500 
 8501     __ mov(rscratch1, r0);
 8502 
 8503     __ pop_call_clobbered_registers();
 8504 
 8505     __ cbnz(rscratch1, deoptimize_label);
 8506 
 8507     __ leave();
 8508     __ ret(lr);
 8509 
 8510     __ BIND(deoptimize_label);
 8511 
 8512     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 8513     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 8514 
 8515     __ mov(sp, rscratch1);
 8516     __ br(rscratch2);
 8517 
 8518     return start;
 8519   }
 8520 
 8521   // r0  = result
 8522   // r1  = str1
 8523   // r2  = cnt1
 8524   // r3  = str2
 8525   // r4  = cnt2
 8526   // r10 = tmp1
 8527   // r11 = tmp2
 8528   address generate_compare_long_string_same_encoding(bool isLL) {
 8529     __ align(CodeEntryAlignment);
 8530     StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id);
 8531     StubCodeMark mark(this, stub_id);
 8532     address entry = __ pc();
 8533     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8534         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 8535 
 8536     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 8537 
 8538     // exit the large loop when fewer than 64 bytes are left to read or when the
 8539     // next prefetch would reach past the end of the array
 8540     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 8541 
 8542     // 8 bytes of each string are pre-loaded before jumping to the stub, so compare them directly
 8543     __ eor(rscratch2, tmp1, tmp2);
 8544     __ cbnz(rscratch2, CAL_DIFFERENCE);
 8545 
 8546     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 8547     // update pointers, because of previous read
 8548     __ add(str1, str1, wordSize);
 8549     __ add(str2, str2, wordSize);
 8550     if (SoftwarePrefetchHintDistance >= 0) {
 8551       __ align(OptoLoopAlignment);
 8552       __ bind(LARGE_LOOP_PREFETCH);
 8553         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 8554         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 8555 
 8556         for (int i = 0; i < 4; i++) {
 8557           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 8558           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 8559           __ cmp(tmp1, tmp2);
 8560           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 8561           __ br(Assembler::NE, DIFF);
 8562         }
 8563         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 8564         __ add(str1, str1, 64);
 8565         __ add(str2, str2, 64);
 8566         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 8567         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 8568         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 8569     }
 8570 
 8571     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 8572     __ br(Assembler::LE, LESS16);
 8573     __ align(OptoLoopAlignment);
 8574     __ bind(LOOP_COMPARE16);
 8575       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 8576       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 8577       __ cmp(tmp1, tmp2);
 8578       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 8579       __ br(Assembler::NE, DIFF);
 8580       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 8581       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 8582       __ br(Assembler::LT, LESS16);
 8583 
 8584       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 8585       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 8586       __ cmp(tmp1, tmp2);
 8587       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 8588       __ br(Assembler::NE, DIFF);
 8589       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 8590       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 8591       __ br(Assembler::GE, LOOP_COMPARE16);
 8592       __ cbz(cnt2, LENGTH_DIFF);
 8593 
 8594     __ bind(LESS16);
 8595       // compare 8 bytes (8 or 4 characters) at a time
 8596       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 8597       __ br(Assembler::LE, LESS8);
 8598       __ ldr(tmp1, Address(__ post(str1, 8)));
 8599       __ ldr(tmp2, Address(__ post(str2, 8)));
 8600       __ eor(rscratch2, tmp1, tmp2);
 8601       __ cbnz(rscratch2, CAL_DIFFERENCE);
 8602       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 8603 
 8604     __ bind(LESS8); // directly load last 8 bytes
 8605       if (!isLL) {
 8606         __ add(cnt2, cnt2, cnt2);
 8607       }
 8608       __ ldr(tmp1, Address(str1, cnt2));
 8609       __ ldr(tmp2, Address(str2, cnt2));
 8610       __ eor(rscratch2, tmp1, tmp2);
 8611       __ cbz(rscratch2, LENGTH_DIFF);
 8612       __ b(CAL_DIFFERENCE);
 8613 
 8614     __ bind(DIFF);
 8615       __ cmp(tmp1, tmp2);
 8616       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 8617       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 8618       // reuse rscratch2 register for the result of eor instruction
 8619       __ eor(rscratch2, tmp1, tmp2);
 8620 
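          // rscratch2 holds the XOR of the two mismatching words. The strings are little-endian in
          // memory, so the first differing character sits in the lowest-order non-zero bits: rev +
          // clz yields its bit offset, andr rounds that offset down to an element boundary, and the
          // two characters are then shifted out, zero-extended and subtracted.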
 8621     __ bind(CAL_DIFFERENCE);
 8622       __ rev(rscratch2, rscratch2);
 8623       __ clz(rscratch2, rscratch2);
 8624       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 8625       __ lsrv(tmp1, tmp1, rscratch2);
 8626       __ lsrv(tmp2, tmp2, rscratch2);
 8627       if (isLL) {
 8628         __ uxtbw(tmp1, tmp1);
 8629         __ uxtbw(tmp2, tmp2);
 8630       } else {
 8631         __ uxthw(tmp1, tmp1);
 8632         __ uxthw(tmp2, tmp2);
 8633       }
 8634       __ subw(result, tmp1, tmp2);
 8635 
 8636     __ bind(LENGTH_DIFF);
 8637       __ ret(lr);
 8638     return entry;
 8639   }
 8640 
 8641   enum string_compare_mode {
 8642     LL,
 8643     LU,
 8644     UL,
 8645     UU,
 8646   };
 8647 
 8648   // The following registers are declared in aarch64.ad
 8649   // r0  = result
 8650   // r1  = str1
 8651   // r2  = cnt1
 8652   // r3  = str2
 8653   // r4  = cnt2
 8654   // r10 = tmp1
 8655   // r11 = tmp2
 8656   // z0  = ztmp1
 8657   // z1  = ztmp2
 8658   // p0  = pgtmp1
 8659   // p1  = pgtmp2
 8660   address generate_compare_long_string_sve(string_compare_mode mode) {
 8661     StubGenStubId stub_id;
 8662     switch (mode) {
 8663       case LL: stub_id = StubGenStubId::compare_long_string_LL_id;  break;
 8664       case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break;
 8665       case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break;
 8666       case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break;
 8667       default: ShouldNotReachHere();
 8668     }
 8669 
 8670     __ align(CodeEntryAlignment);
 8671     address entry = __ pc();
 8672     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8673              tmp1 = r10, tmp2 = r11;
 8674 
 8675     Label LOOP, DONE, MISMATCH;
 8676     Register vec_len = tmp1;
 8677     Register idx = tmp2;
 8678     // The minimum of the string lengths has been stored in cnt2.
 8679     Register cnt = cnt2;
 8680     FloatRegister ztmp1 = z0, ztmp2 = z1;
 8681     PRegister pgtmp1 = p0, pgtmp2 = p1;
 8682 
 8683 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 8684     switch (mode) {                                                            \
 8685       case LL:                                                                 \
 8686         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 8687         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 8688         break;                                                                 \
 8689       case LU:                                                                 \
 8690         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 8691         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 8692         break;                                                                 \
 8693       case UL:                                                                 \
 8694         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 8695         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 8696         break;                                                                 \
 8697       case UU:                                                                 \
 8698         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 8699         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 8700         break;                                                                 \
 8701       default:                                                                 \
 8702         ShouldNotReachHere();                                                  \
 8703     }
 8704 
 8705     StubCodeMark mark(this, stub_id);
 8706 
 8707     __ mov(idx, 0);
 8708     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 8709 
 8710     if (mode == LL) {
 8711       __ sve_cntb(vec_len);
 8712     } else {
 8713       __ sve_cnth(vec_len);
 8714     }
 8715 
 8716     __ sub(rscratch1, cnt, vec_len);
 8717 
 8718     __ bind(LOOP);
 8719 
 8720       // main loop
 8721       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 8722       __ add(idx, idx, vec_len);
 8723       // Compare strings.
 8724       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 8725       __ br(__ NE, MISMATCH);
 8726       __ cmp(idx, rscratch1);
 8727       __ br(__ LT, LOOP);
 8728 
 8729     // post loop, last iteration
 8730     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 8731 
 8732     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 8733     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 8734     __ br(__ EQ, DONE);
 8735 
 8736     __ bind(MISMATCH);
 8737 
 8738     // Crop the vector to find its location.
 8739     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 8740     // Extract the first different characters of each string.
 8741     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 8742     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 8743 
 8744     // Compute the difference of the first different characters.
 8745     __ sub(result, rscratch1, rscratch2);
 8746 
 8747     __ bind(DONE);
 8748     __ ret(lr);
 8749 #undef LOAD_PAIR
 8750     return entry;
 8751   }
 8752 
 8753   void generate_compare_long_strings() {
 8754     if (UseSVE == 0) {
 8755       StubRoutines::aarch64::_compare_long_string_LL
 8756           = generate_compare_long_string_same_encoding(true);
 8757       StubRoutines::aarch64::_compare_long_string_UU
 8758           = generate_compare_long_string_same_encoding(false);
 8759       StubRoutines::aarch64::_compare_long_string_LU
 8760           = generate_compare_long_string_different_encoding(true);
 8761       StubRoutines::aarch64::_compare_long_string_UL
 8762           = generate_compare_long_string_different_encoding(false);
 8763     } else {
 8764       StubRoutines::aarch64::_compare_long_string_LL
 8765           = generate_compare_long_string_sve(LL);
 8766       StubRoutines::aarch64::_compare_long_string_UU
 8767           = generate_compare_long_string_sve(UU);
 8768       StubRoutines::aarch64::_compare_long_string_LU
 8769           = generate_compare_long_string_sve(LU);
 8770       StubRoutines::aarch64::_compare_long_string_UL
 8771           = generate_compare_long_string_sve(UL);
 8772     }
 8773   }
 8774 
 8775   // R0 = result
 8776   // R1 = str2
 8777   // R2 = cnt1
 8778   // R3 = str1
 8779   // R4 = cnt2
 8780   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 8781   //
 8782   // This generic linear code uses a few additional ideas that make it faster:
 8783   // 1) we can safely keep at least the first register of the pattern (since length >= 8)
 8784   // in order to skip the initial load (helps on systems with a single load pipeline)
 8785   // 2) we can use a "fast" algorithm for finding the first occurrence of a single
 8786   // character with fewer branches (one branch per loaded register instead of one
 8787   // branch per symbol); this is where constants like
 8788   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
 8789   // 3) after the first register of the source string has been loaded and analyzed, it
 8790   // can be reused to search for every occurrence of the first character, saving a few
 8791   // loads compared with a simpler-but-slower implementation
 8792   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
 8793   // re-initializes and compresses register values, which makes the code larger and a
 8794   // bit less readable; however, most of the extra operations are issued during loads
 8795   // or branches, so the penalty is minimal
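        //
        // A rough scalar sketch of the per-register "fast" first-character search used below
        // (SWAR zero-detection applied to the XOR with a broadcast of the first pattern
        // character; names are illustrative only):
        //
        //   uint64_t x    = load8(str2) ^ broadcast(first_char); // 0x00 element where chars match
        //   uint64_t ones = str2_isL ? 0x0101010101010101 : 0x0001000100010001;
        //   uint64_t his  = str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff;
        //   uint64_t hit  = (x - ones) & ~(x | his);  // candidate matches marked in the top bits
        //
        // Borrow propagation can mark some false candidates, which is fine: every candidate
        // position is re-verified against the full pattern in the L_HAS_ZERO / L_SMALL paths.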
 8796   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 8797     StubGenStubId stub_id;
 8798     if (str1_isL) {
 8799       if (str2_isL) {
 8800         stub_id = StubGenStubId::string_indexof_linear_ll_id;
 8801       } else {
 8802         stub_id = StubGenStubId::string_indexof_linear_ul_id;
 8803       }
 8804     } else {
 8805       if (str2_isL) {
 8806         ShouldNotReachHere();
 8807       } else {
 8808         stub_id = StubGenStubId::string_indexof_linear_uu_id;
 8809       }
 8810     }
 8811     __ align(CodeEntryAlignment);
 8812     StubCodeMark mark(this, stub_id);
 8813     address entry = __ pc();
 8814 
 8815     int str1_chr_size = str1_isL ? 1 : 2;
 8816     int str2_chr_size = str2_isL ? 1 : 2;
 8817     int str1_chr_shift = str1_isL ? 0 : 1;
 8818     int str2_chr_shift = str2_isL ? 0 : 1;
 8819     bool isL = str1_isL && str2_isL;
 8820    // parameters
 8821     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 8822     // temporary registers
 8823     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 8824     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 8825     // redefinitions
 8826     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 8827 
 8828     __ push(spilled_regs, sp);
 8829     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 8830         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 8831         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 8832         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 8833         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 8834         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 8835     // Read whole register from str1. It is safe, because length >=8 here
 8836     __ ldr(ch1, Address(str1));
 8837     // Read whole register from str2. It is safe, because length >=8 here
 8838     __ ldr(ch2, Address(str2));
 8839     __ sub(cnt2, cnt2, cnt1);
 8840     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 8841     if (str1_isL != str2_isL) {
 8842       __ eor(v0, __ T16B, v0, v0);
 8843     }
 8844     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 8845     __ mul(first, first, tmp1);
 8846     // check if we have less than 1 register to check
 8847     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 8848     if (str1_isL != str2_isL) {
 8849       __ fmovd(v1, ch1);
 8850     }
 8851     __ br(__ LE, L_SMALL);
 8852     __ eor(ch2, first, ch2);
 8853     if (str1_isL != str2_isL) {
 8854       __ zip1(v1, __ T16B, v1, v0);
 8855     }
 8856     __ sub(tmp2, ch2, tmp1);
 8857     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 8858     __ bics(tmp2, tmp2, ch2);
 8859     if (str1_isL != str2_isL) {
 8860       __ fmovd(ch1, v1);
 8861     }
 8862     __ br(__ NE, L_HAS_ZERO);
 8863     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 8864     __ add(result, result, wordSize/str2_chr_size);
 8865     __ add(str2, str2, wordSize);
 8866     __ br(__ LT, L_POST_LOOP);
 8867     __ BIND(L_LOOP);
 8868       __ ldr(ch2, Address(str2));
 8869       __ eor(ch2, first, ch2);
 8870       __ sub(tmp2, ch2, tmp1);
 8871       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 8872       __ bics(tmp2, tmp2, ch2);
 8873       __ br(__ NE, L_HAS_ZERO);
 8874     __ BIND(L_LOOP_PROCEED);
 8875       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 8876       __ add(str2, str2, wordSize);
 8877       __ add(result, result, wordSize/str2_chr_size);
 8878       __ br(__ GE, L_LOOP);
 8879     __ BIND(L_POST_LOOP);
 8880       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 8881       __ br(__ LE, NOMATCH);
 8882       __ ldr(ch2, Address(str2));
 8883       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 8884       __ eor(ch2, first, ch2);
 8885       __ sub(tmp2, ch2, tmp1);
 8886       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 8887       __ mov(tmp4, -1); // all bits set
 8888       __ b(L_SMALL_PROCEED);
 8889     __ align(OptoLoopAlignment);
 8890     __ BIND(L_SMALL);
 8891       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 8892       __ eor(ch2, first, ch2);
 8893       if (str1_isL != str2_isL) {
 8894         __ zip1(v1, __ T16B, v1, v0);
 8895       }
 8896       __ sub(tmp2, ch2, tmp1);
 8897       __ mov(tmp4, -1); // all bits set
 8898       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 8899       if (str1_isL != str2_isL) {
 8900         __ fmovd(ch1, v1); // move converted 4 symbols
 8901       }
 8902     __ BIND(L_SMALL_PROCEED);
 8903       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 8904       __ bic(tmp2, tmp2, ch2);
 8905       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 8906       __ rbit(tmp2, tmp2);
 8907       __ br(__ EQ, NOMATCH);
 8908     __ BIND(L_SMALL_HAS_ZERO_LOOP);
 8909       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 8910       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 8911       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 8912       if (str2_isL) { // LL
 8913         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 8914         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 8915         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 8916         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 8917         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 8918       } else {
 8919         __ mov(ch2, 0xE); // all bits in byte set except last one
 8920         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 8921         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 8922         __ lslv(tmp2, tmp2, tmp4);
 8923         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8924         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8925         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 8926         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8927       }
 8928       __ cmp(ch1, ch2);
 8929       __ mov(tmp4, wordSize/str2_chr_size);
 8930       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 8931     __ BIND(L_SMALL_CMP_LOOP);
 8932       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 8933                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 8934       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 8935                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 8936       __ add(tmp4, tmp4, 1);
 8937       __ cmp(tmp4, cnt1);
 8938       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 8939       __ cmp(first, ch2);
 8940       __ br(__ EQ, L_SMALL_CMP_LOOP);
 8941     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 8942       __ cbz(tmp2, NOMATCH); // no more matches. exit
 8943       __ clz(tmp4, tmp2);
 8944       __ add(result, result, 1); // advance index
 8945       __ add(str2, str2, str2_chr_size); // advance pointer
 8946       __ b(L_SMALL_HAS_ZERO_LOOP);
 8947     __ align(OptoLoopAlignment);
 8948     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 8949       __ cmp(first, ch2);
 8950       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 8951       __ b(DONE);
 8952     __ align(OptoLoopAlignment);
 8953     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 8954       if (str2_isL) { // LL
 8955         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 8956         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 8957         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 8958         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 8959         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 8960       } else {
 8961         __ mov(ch2, 0xE); // all bits in byte set except last one
 8962         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 8963         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 8964         __ lslv(tmp2, tmp2, tmp4);
 8965         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8966         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8967         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 8968         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8969       }
 8970       __ cmp(ch1, ch2);
 8971       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 8972       __ b(DONE);
 8973     __ align(OptoLoopAlignment);
 8974     __ BIND(L_HAS_ZERO);
 8975       __ rbit(tmp2, tmp2);
 8976       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 8977       // Now compress the two counters (cnt2 and cnt1) into one register.
 8978       // This is fine because both counters are 32-bit and are not changed in this
 8979       // loop; they are restored on exit. So cnt1 can be re-used in this loop.
 8980       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 8981       __ sub(result, result, 1);
 8982     __ BIND(L_HAS_ZERO_LOOP);
 8983       __ mov(cnt1, wordSize/str2_chr_size);
 8984       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 8985       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 8986       if (str2_isL) {
 8987         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 8988         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 8989         __ lslv(tmp2, tmp2, tmp4);
 8990         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8991         __ add(tmp4, tmp4, 1);
 8992         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8993         __ lsl(tmp2, tmp2, 1);
 8994         __ mov(tmp4, wordSize/str2_chr_size);
 8995       } else {
 8996         __ mov(ch2, 0xE);
 8997         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 8998         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 8999         __ lslv(tmp2, tmp2, tmp4);
 9000         __ add(tmp4, tmp4, 1);
 9001         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9002         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9003         __ lsl(tmp2, tmp2, 1);
 9004         __ mov(tmp4, wordSize/str2_chr_size);
 9005         __ sub(str2, str2, str2_chr_size);
 9006       }
 9007       __ cmp(ch1, ch2);
 9008       __ mov(tmp4, wordSize/str2_chr_size);
 9009       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9010     __ BIND(L_CMP_LOOP);
 9011       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9012                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9013       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9014                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9015       __ add(tmp4, tmp4, 1);
 9016       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9017       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9018       __ cmp(cnt1, ch2);
 9019       __ br(__ EQ, L_CMP_LOOP);
 9020     __ BIND(L_CMP_LOOP_NOMATCH);
 9021       // here we have a mismatch
 9022       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9023       __ clz(tmp4, tmp2);
 9024       __ add(str2, str2, str2_chr_size); // advance pointer
 9025       __ b(L_HAS_ZERO_LOOP);
 9026     __ align(OptoLoopAlignment);
 9027     __ BIND(L_CMP_LOOP_LAST_CMP);
 9028       __ cmp(cnt1, ch2);
 9029       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9030       __ b(DONE);
 9031     __ align(OptoLoopAlignment);
 9032     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9033       if (str2_isL) {
 9034         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9035         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9036         __ lslv(tmp2, tmp2, tmp4);
 9037         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9038         __ add(tmp4, tmp4, 1);
 9039         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9040         __ lsl(tmp2, tmp2, 1);
 9041       } else {
 9042         __ mov(ch2, 0xE);
 9043         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9044         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9045         __ lslv(tmp2, tmp2, tmp4);
 9046         __ add(tmp4, tmp4, 1);
 9047         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9048         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9049         __ lsl(tmp2, tmp2, 1);
 9050         __ sub(str2, str2, str2_chr_size);
 9051       }
 9052       __ cmp(ch1, ch2);
 9053       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9054       __ b(DONE);
 9055     __ align(OptoLoopAlignment);
 9056     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
 9057       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until the
 9058       // L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP, so result was
 9059       // increased by at most wordSize/str2_chr_size - 1 and the corresponding high bits
 9060       // were not changed. L_LOOP_PROCEED will increase result by the number of analyzed
 9061       // characters, so we can simply reset the lower bits of result here: clear the 2
 9062       // lower bits for UU/UL and the 3 lower bits for LL.
 9063       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
 9064       // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3 (UU/UL) is
 9065       // the index of the last analyzed substring inside the current octet, so str2 is at
 9066       // the corresponding start address and needs to be advanced to the next octet.
 9067       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9068       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9069       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9070       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9071       __ movw(cnt2, cnt2);
 9072       __ b(L_LOOP_PROCEED);
 9073     __ align(OptoLoopAlignment);
 9074     __ BIND(NOMATCH);
 9075       __ mov(result, -1);
 9076     __ BIND(DONE);
 9077       __ pop(spilled_regs, sp);
 9078       __ ret(lr);
 9079     return entry;
 9080   }
 9081 
 9082   void generate_string_indexof_stubs() {
 9083     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9084     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9085     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9086   }
 9087 
 9088   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9089       FloatRegister src1, FloatRegister src2) {
 9090     Register dst = r1;
 9091     __ zip1(v1, __ T16B, src1, v0);
 9092     __ zip2(v2, __ T16B, src1, v0);
 9093     if (generatePrfm) {
 9094       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9095     }
 9096     __ zip1(v3, __ T16B, src2, v0);
 9097     __ zip2(v4, __ T16B, src2, v0);
 9098     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9099   }
 9100 
 9101   // R0 = src
 9102   // R1 = dst
 9103   // R2 = len
 9104   // R3 = len >> 3
 9105   // V0 = 0
 9106   // v1 = loaded 8 bytes
 9107   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 9108   address generate_large_byte_array_inflate() {
 9109     __ align(CodeEntryAlignment);
 9110     StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id;
 9111     StubCodeMark mark(this, stub_id);
 9112     address entry = __ pc();
 9113     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9114     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9115     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9116 
 9117     // do one more 8-byte read so that the address is 16-byte aligned in most cases,
 9118     // which also lets us use a single store instruction
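          // zip1/zip2 with the all-zero v0 interleave each Latin-1 byte with a 0x00 byte, which
          // is exactly the little-endian UTF-16 encoding of that character.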
 9119     __ ldrd(v2, __ post(src, 8));
 9120     __ sub(octetCounter, octetCounter, 2);
 9121     __ zip1(v1, __ T16B, v1, v0);
 9122     __ zip1(v2, __ T16B, v2, v0);
 9123     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9124     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9125     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9126     __ br(__ LE, LOOP_START);
 9127     __ b(LOOP_PRFM_START);
 9128     __ bind(LOOP_PRFM);
 9129       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9130     __ bind(LOOP_PRFM_START);
 9131       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9132       __ sub(octetCounter, octetCounter, 8);
 9133       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9134       inflate_and_store_2_fp_registers(true, v3, v4);
 9135       inflate_and_store_2_fp_registers(true, v5, v6);
 9136       __ br(__ GT, LOOP_PRFM);
 9137       __ cmp(octetCounter, (u1)8);
 9138       __ br(__ LT, DONE);
 9139     __ bind(LOOP);
 9140       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9141       __ bind(LOOP_START);
 9142       __ sub(octetCounter, octetCounter, 8);
 9143       __ cmp(octetCounter, (u1)8);
 9144       inflate_and_store_2_fp_registers(false, v3, v4);
 9145       inflate_and_store_2_fp_registers(false, v5, v6);
 9146       __ br(__ GE, LOOP);
 9147     __ bind(DONE);
 9148       __ ret(lr);
 9149     return entry;
 9150   }
 9151 
 9152   /**
 9153    *  Arguments:
 9154    *
 9155    *  Input:
 9156    *  c_rarg0   - current state address
 9157    *  c_rarg1   - H key address
 9158    *  c_rarg2   - data address
 9159    *  c_rarg3   - number of blocks
 9160    *
 9161    *  Output:
 9162    *  Updated state at c_rarg0
 9163    */
 9164   address generate_ghash_processBlocks() {
 9165     // Bafflingly, GCM uses little-endian for the byte order, but
 9166     // big-endian for the bit order.  For example, the polynomial 1 is
 9167     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9168     //
 9169     // So, we must either reverse the bytes in each word and do
 9170     // everything big-endian or reverse the bits in each byte and do
 9171     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9172     // the bits in each byte (we have an instruction, RBIT, to do
 9173     // that) and keep the data in little-endian bit order through the
 9174     // calculation, bit-reversing the inputs and outputs.
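          //
          // For reference, each block is folded into the running state as
          //
          //   state = ((state ^ data[i]) * subkeyH) mod P,  P = x^128 + x^7 + x^2 + x + 1
          //
          // with multiplication in GF(2^128). ghash_multiply computes the 256-bit carry-less
          // product and ghash_reduce folds it back modulo P using the 0x87 constant emitted below.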
 9175 
 9176     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
 9177     StubCodeMark mark(this, stub_id);
 9178     __ align(wordSize * 2);
 9179     address p = __ pc();
 9180     __ emit_int64(0x87);  // The low-order bits of the field
 9181                           // polynomial (i.e. p = z^7+z^2+z+1)
 9182                           // repeated in the low and high parts of a
 9183                           // 128-bit vector
 9184     __ emit_int64(0x87);
 9185 
 9186     __ align(CodeEntryAlignment);
 9187     address start = __ pc();
 9188 
 9189     Register state   = c_rarg0;
 9190     Register subkeyH = c_rarg1;
 9191     Register data    = c_rarg2;
 9192     Register blocks  = c_rarg3;
 9193 
 9194     FloatRegister vzr = v30;
 9195     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9196 
 9197     __ ldrq(v24, p);    // The field polynomial
 9198 
 9199     __ ldrq(v0, Address(state));
 9200     __ ldrq(v1, Address(subkeyH));
 9201 
 9202     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9203     __ rbit(v0, __ T16B, v0);
 9204     __ rev64(v1, __ T16B, v1);
 9205     __ rbit(v1, __ T16B, v1);
 9206 
 9207     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 9208     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 9209 
 9210     {
 9211       Label L_ghash_loop;
 9212       __ bind(L_ghash_loop);
 9213 
 9214       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9215                                                  // reversing each byte
 9216       __ rbit(v2, __ T16B, v2);
 9217       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9218 
 9219       // Multiply state in v2 by subkey in v1
 9220       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9221                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9222                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9223       // Reduce v7:v5 by the field polynomial
 9224       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9225 
 9226       __ sub(blocks, blocks, 1);
 9227       __ cbnz(blocks, L_ghash_loop);
 9228     }
 9229 
 9230     // The bit-reversed result is at this point in v0
 9231     __ rev64(v0, __ T16B, v0);
 9232     __ rbit(v0, __ T16B, v0);
 9233 
 9234     __ st1(v0, __ T16B, state);
 9235     __ ret(lr);
 9236 
 9237     return start;
 9238   }
 9239 
 9240   address generate_ghash_processBlocks_wide() {
 9241     address small = generate_ghash_processBlocks();
 9242 
 9243     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id;
 9244     StubCodeMark mark(this, stub_id);
 9245     __ align(wordSize * 2);
 9246     address p = __ pc();
 9247     __ emit_int64(0x87);  // The low-order bits of the field
 9248                           // polynomial (i.e. p = z^7+z^2+z+1)
 9249                           // repeated in the low and high parts of a
 9250                           // 128-bit vector
 9251     __ emit_int64(0x87);
 9252 
 9253     __ align(CodeEntryAlignment);
 9254     address start = __ pc();
 9255 
 9256     Register state   = c_rarg0;
 9257     Register subkeyH = c_rarg1;
 9258     Register data    = c_rarg2;
 9259     Register blocks  = c_rarg3;
 9260 
 9261     const int unroll = 4;
 9262 
 9263     __ cmp(blocks, (unsigned char)(unroll * 2));
 9264     __ br(__ LT, small);
 9265 
 9266     if (unroll > 1) {
 9267       // Save state before entering routine
 9268       __ sub(sp, sp, 4 * 16);
 9269       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9270       __ sub(sp, sp, 4 * 16);
 9271       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9272     }
 9273 
 9274     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
 9275 
 9276     if (unroll > 1) {
 9277       // And restore state
 9278       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9279       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9280     }
 9281 
 9282     __ cmp(blocks, (unsigned char)0);
 9283     __ br(__ GT, small);
 9284 
 9285     __ ret(lr);
 9286 
 9287     return start;
 9288   }
 9289 
 9290   void generate_base64_encode_simdround(Register src, Register dst,
 9291         FloatRegister codec, u8 size) {
 9292 
 9293     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9294     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9295     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 9296 
 9297     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9298 
 9299     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
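          // ld3 de-interleaves the input: in0/in1/in2 now hold byte 0/1/2 of each 3-byte group.
          // The shifts below build the four 6-bit codec indices per group:
          //   ind0 = b0 >> 2
          //   ind1 = ((b0 & 0x3) << 4) | (b1 >> 4)
          //   ind2 = ((b1 & 0xf) << 2) | (b2 >> 6)
          //   ind3 = b2 & 0x3f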
 9300 
 9301     __ ushr(ind0, arrangement, in0,  2);
 9302 
 9303     __ ushr(ind1, arrangement, in1,  2);
 9304     __ shl(in0,   arrangement, in0,  6);
 9305     __ orr(ind1,  arrangement, ind1, in0);
 9306     __ ushr(ind1, arrangement, ind1, 2);
 9307 
 9308     __ ushr(ind2, arrangement, in2,  4);
 9309     __ shl(in1,   arrangement, in1,  4);
 9310     __ orr(ind2,  arrangement, in1,  ind2);
 9311     __ ushr(ind2, arrangement, ind2, 2);
 9312 
 9313     __ shl(ind3,  arrangement, in2,  2);
 9314     __ ushr(ind3, arrangement, ind3, 2);
 9315 
 9316     __ tbl(out0,  arrangement, codec,  4, ind0);
 9317     __ tbl(out1,  arrangement, codec,  4, ind1);
 9318     __ tbl(out2,  arrangement, codec,  4, ind2);
 9319     __ tbl(out3,  arrangement, codec,  4, ind3);
 9320 
 9321     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9322   }
 9323 
 9324    /**
 9325    *  Arguments:
 9326    *
 9327    *  Input:
 9328    *  c_rarg0   - src_start
 9329    *  c_rarg1   - src_offset
 9330    *  c_rarg2   - src_length
 9331    *  c_rarg3   - dest_start
 9332    *  c_rarg4   - dest_offset
 9333    *  c_rarg5   - isURL
 9334    *
 9335    */
 9336   address generate_base64_encodeBlock() {
 9337 
 9338     static const char toBase64[64] = {
 9339       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9340       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9341       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9342       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9343       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9344     };
 9345 
 9346     static const char toBase64URL[64] = {
 9347       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9348       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9349       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9350       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9351       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9352     };
 9353 
 9354     __ align(CodeEntryAlignment);
 9355     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
 9356     StubCodeMark mark(this, stub_id);
 9357     address start = __ pc();
 9358 
 9359     Register src   = c_rarg0;  // source array
 9360     Register soff  = c_rarg1;  // source start offset
 9361     Register send  = c_rarg2;  // source end offset
 9362     Register dst   = c_rarg3;  // dest array
 9363     Register doff  = c_rarg4;  // position for writing to dest array
 9364     Register isURL = c_rarg5;  // Base64 or URL character set
 9365 
 9366     // c_rarg6 and c_rarg7 are free to use as temps
 9367     Register codec  = c_rarg6;
 9368     Register length = c_rarg7;
 9369 
 9370     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9371 
 9372     __ add(src, src, soff);
 9373     __ add(dst, dst, doff);
 9374     __ sub(length, send, soff);
 9375 
 9376     // load the codec base address
 9377     __ lea(codec, ExternalAddress((address) toBase64));
 9378     __ cbz(isURL, ProcessData);
 9379     __ lea(codec, ExternalAddress((address) toBase64URL));
 9380 
 9381     __ BIND(ProcessData);
 9382 
 9383     // too short to form a SIMD loop; fall back to byte-by-byte processing
 9384     __ cmp(length, (u1)24);
 9385     __ br(Assembler::LT, Process3B);
 9386 
 9387     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9388 
 9389     __ BIND(Process48B);
 9390     __ cmp(length, (u1)48);
 9391     __ br(Assembler::LT, Process24B);
 9392     generate_base64_encode_simdround(src, dst, v0, 16);
 9393     __ sub(length, length, 48);
 9394     __ b(Process48B);
 9395 
 9396     __ BIND(Process24B);
 9397     __ cmp(length, (u1)24);
 9398     __ br(Assembler::LT, SIMDExit);
 9399     generate_base64_encode_simdround(src, dst, v0, 8);
 9400     __ sub(length, length, 24);
 9401 
 9402     __ BIND(SIMDExit);
 9403     __ cbz(length, Exit);
 9404 
 9405     __ BIND(Process3B);
 9406     //  3 src bytes, 24 bits
 9407     __ ldrb(r10, __ post(src, 1));
 9408     __ ldrb(r11, __ post(src, 1));
 9409     __ ldrb(r12, __ post(src, 1));
 9410     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9411     __ orrw(r12, r12, r11, Assembler::LSL, 8);
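          // r12 now holds the 24-bit group (byte0 << 16) | (byte1 << 8) | byte2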
 9412     // codec index
 9413     __ ubfmw(r15, r12, 18, 23);
 9414     __ ubfmw(r14, r12, 12, 17);
 9415     __ ubfmw(r13, r12, 6,  11);
 9416     __ andw(r12,  r12, 63);
 9417     // look up the encoded characters in the codec table
 9418     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9419     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9420     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9421     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9422     __ strb(r15, __ post(dst, 1));
 9423     __ strb(r14, __ post(dst, 1));
 9424     __ strb(r13, __ post(dst, 1));
 9425     __ strb(r12, __ post(dst, 1));
 9426     __ sub(length, length, 3);
 9427     __ cbnz(length, Process3B);
 9428 
 9429     __ BIND(Exit);
 9430     __ ret(lr);
 9431 
 9432     return start;
 9433   }
 9434 
 9435   void generate_base64_decode_simdround(Register src, Register dst,
 9436         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9437 
 9438     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9439     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9440 
 9441     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9442     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9443 
 9444     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9445 
 9446     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9447 
 9448     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9449 
 9450     // We need an unsigned saturating subtract so that every input value in the
 9451     // range [0, 63] yields index 0 for the higher-half lookup
 9452     __ uqsubv(decH0, __ T16B, in0, v27);
 9453     __ uqsubv(decH1, __ T16B, in1, v27);
 9454     __ uqsubv(decH2, __ T16B, in2, v27);
 9455     __ uqsubv(decH3, __ T16B, in3, v27);
 9456 
 9457     // lower half lookup
 9458     __ tbl(decL0, arrangement, codecL, 4, in0);
 9459     __ tbl(decL1, arrangement, codecL, 4, in1);
 9460     __ tbl(decL2, arrangement, codecL, 4, in2);
 9461     __ tbl(decL3, arrangement, codecL, 4, in3);
 9462 
 9463     // higher half lookup
 9464     __ tbx(decH0, arrangement, codecH, 4, decH0);
 9465     __ tbx(decH1, arrangement, codecH, 4, decH1);
 9466     __ tbx(decH2, arrangement, codecH, 4, decH2);
 9467     __ tbx(decH3, arrangement, codecH, 4, decH3);
 9468 
 9469     // combine lower and higher
 9470     __ orr(decL0, arrangement, decL0, decH0);
 9471     __ orr(decL1, arrangement, decL1, decH1);
 9472     __ orr(decL2, arrangement, decL2, decH2);
 9473     __ orr(decL3, arrangement, decL3, decH3);
 9474 
 9475     // check for illegal inputs: any combined value larger than 63 (the maximum of 6 bits)
 9476     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 9477     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 9478     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 9479     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 9480     __ orr(in0, arrangement, decH0, decH1);
 9481     __ orr(in1, arrangement, decH2, decH3);
 9482     __ orr(in2, arrangement, in0,   in1);
 9483     __ umaxv(in3, arrangement, in2);
 9484     __ umov(rscratch2, in3, __ B, 0);
 9485 
 9486     // get the data to output
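          // per byte lane: out0 = (d0 << 2) | (d1 >> 4), out1 = (d1 << 4) | (d2 >> 2),
          // out2 = (d2 << 6) | d3, where d0..d3 are the decoded values in decL0..decL3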
 9487     __ shl(out0,  arrangement, decL0, 2);
 9488     __ ushr(out1, arrangement, decL1, 4);
 9489     __ orr(out0,  arrangement, out0,  out1);
 9490     __ shl(out1,  arrangement, decL1, 4);
 9491     __ ushr(out2, arrangement, decL2, 2);
 9492     __ orr(out1,  arrangement, out1,  out2);
 9493     __ shl(out2,  arrangement, decL2, 6);
 9494     __ orr(out2,  arrangement, out2,  decL3);
 9495 
 9496     __ cbz(rscratch2, NoIllegalData);
 9497 
 9498     // handle illegal input
 9499     __ umov(r10, in2, __ D, 0);
 9500     if (size == 16) {
 9501       __ cbnz(r10, ErrorInLowerHalf);
 9502 
 9503       // the illegal input is in the higher half; store the lower half now.
 9504       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 9505 
 9506       __ umov(r10, in2,  __ D, 1);
 9507       __ umov(r11, out0, __ D, 1);
 9508       __ umov(r12, out1, __ D, 1);
 9509       __ umov(r13, out2, __ D, 1);
 9510       __ b(StoreLegalData);
 9511 
 9512       __ BIND(ErrorInLowerHalf);
 9513     }
 9514     __ umov(r11, out0, __ D, 0);
 9515     __ umov(r12, out1, __ D, 0);
 9516     __ umov(r13, out2, __ D, 0);
 9517 
 9518     __ BIND(StoreLegalData);
 9519     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
 9520     __ strb(r11, __ post(dst, 1));
 9521     __ strb(r12, __ post(dst, 1));
 9522     __ strb(r13, __ post(dst, 1));
 9523     __ lsr(r10, r10, 8);
 9524     __ lsr(r11, r11, 8);
 9525     __ lsr(r12, r12, 8);
 9526     __ lsr(r13, r13, 8);
 9527     __ b(StoreLegalData);
 9528 
 9529     __ BIND(NoIllegalData);
 9530     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
 9531   }
 9532 
 9533 
 9534    /**
 9535    *  Arguments:
 9536    *
 9537    *  Input:
 9538    *  c_rarg0   - src_start
 9539    *  c_rarg1   - src_offset
 9540    *  c_rarg2   - src_length
 9541    *  c_rarg3   - dest_start
 9542    *  c_rarg4   - dest_offset
 9543    *  c_rarg5   - isURL
 9544    *  c_rarg6   - isMIME
 9545    *
 9546    */
 9547   address generate_base64_decodeBlock() {
 9548 
 9549     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
 9550     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the section
 9551     // titled "Base64 decoding".
 9552 
 9553     // The non-SIMD lookup tables are mostly copied from the fromBase64 array used in java.util.Base64,
 9554     // except that the trailing character '=' is also treated as an illegal value in this intrinsic.
 9555     // That is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
 9556     static const uint8_t fromBase64ForNoSIMD[256] = {
 9557       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9558       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9559       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 9560        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9561       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 9562        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
 9563       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 9564        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 9565       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9566       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9567       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9568       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9569       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9570       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9571       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9572       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9573     };
 9574 
 9575     static const uint8_t fromBase64URLForNoSIMD[256] = {
 9576       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9577       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9578       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 9579        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9580       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 9581        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
 9582       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 9583        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 9584       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9585       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9586       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9587       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9588       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9589       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9590       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9591       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9592     };
 9593 
 9594     // A legal Base64 code value is in the range [0, 127].  We need two table
 9595     // lookups with tbl/tbx and combine their results to get the decoded data.
 9596     // The 1st table vector lookup uses tbl: out-of-range indices are set to 0
 9597     // in the destination. The 2nd table vector lookup uses tbx: out-of-range
 9598     // indices leave the destination unchanged. Input [64..126] is mapped to
 9599     // index [65, 127] in the second lookup. The value at index 64 is set to 0,
 9600     // so that we know we already got the decoded data with the 1st lookup.
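          //
          // Illustrative per-byte sketch of this scheme (not generated code):
          //   lo  = tbl(first 64 table entries,  c)             // 0 when c > 63
          //   hi  = tbx(second 64 table entries, usat(c - 63))  // entry 64 is 0, so hi == 0 when c <= 63
          //   dec = lo | hi                                     // any value > 63 is later flagged as illegal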
 9601     static const uint8_t fromBase64ForSIMD[128] = {
 9602       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9603       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9604       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 9605        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9606         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 9607        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 9608       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 9609        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 9610     };
 9611 
 9612     static const uint8_t fromBase64URLForSIMD[128] = {
 9613       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9614       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9615       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 9616        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9617         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 9618        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 9619        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 9620        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 9621     };
 9622 
 9623     __ align(CodeEntryAlignment);
 9624     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
 9625     StubCodeMark mark(this, stub_id);
 9626     address start = __ pc();
 9627 
 9628     Register src    = c_rarg0;  // source array
 9629     Register soff   = c_rarg1;  // source start offset
 9630     Register send   = c_rarg2;  // source end offset
 9631     Register dst    = c_rarg3;  // dest array
 9632     Register doff   = c_rarg4;  // position for writing to dest array
 9633     Register isURL  = c_rarg5;  // Base64 or URL character set
 9634     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
 9635 
 9636     Register length = send;    // reuse send as length of source data to process
 9637 
 9638     Register simd_codec   = c_rarg6;
 9639     Register nosimd_codec = c_rarg7;
 9640 
 9641     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
 9642 
 9643     __ enter();
 9644 
 9645     __ add(src, src, soff);
 9646     __ add(dst, dst, doff);
 9647 
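          // remember the initial dst so the number of bytes written can be returned at Exit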
 9648     __ mov(doff, dst);
 9649 
 9650     __ sub(length, send, soff);
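          // clear the two low bits of length, i.e. round it down to a multiple of 4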
 9651     __ bfm(length, zr, 0, 1);
 9652 
 9653     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
 9654     __ cbz(isURL, ProcessData);
 9655     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
 9656 
 9657     __ BIND(ProcessData);
 9658     __ mov(rscratch1, length);
 9659     __ cmp(length, (u1)144); // 144 = 80 + 64
 9660     __ br(Assembler::LT, Process4B);
 9661 
 9662     // In the MIME case, the line length cannot be more than 76
 9663     // bytes (see RFC 2045). This is too short a block for SIMD
 9664     // to be worthwhile, so we use non-SIMD here.
 9665     __ movw(rscratch1, 79);
 9666 
 9667     __ BIND(Process4B);
 9668     __ ldrw(r14, __ post(src, 4));
 9669     __ ubfxw(r10, r14, 0,  8);
 9670     __ ubfxw(r11, r14, 8,  8);
 9671     __ ubfxw(r12, r14, 16, 8);
 9672     __ ubfxw(r13, r14, 24, 8);
 9673     // look up the decoded values
 9674     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
 9675     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
 9676     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
 9677     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
 9678     // error detection, 255u indicates an illegal input
 9679     __ orrw(r14, r10, r11);
 9680     __ orrw(r15, r12, r13);
 9681     __ orrw(r14, r14, r15);
 9682     __ tbnz(r14, 7, Exit);
 9683     // recover the data
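          // r10..r13 hold the decoded 6-bit values d0..d3; the three output bytes are
          // b0 = (d0 << 2) | (d1 >> 4), b1 = (d1 << 4) | (d2 >> 2), b2 = (d2 << 6) | d3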
 9684     __ lslw(r14, r10, 10);
 9685     __ bfiw(r14, r11, 4, 6);
 9686     __ bfmw(r14, r12, 2, 5);
 9687     __ rev16w(r14, r14);
 9688     __ bfiw(r13, r12, 6, 2);
 9689     __ strh(r14, __ post(dst, 2));
 9690     __ strb(r13, __ post(dst, 1));
 9691     // non-simd loop
 9692     __ subsw(rscratch1, rscratch1, 4);
 9693     __ br(Assembler::GT, Process4B);
 9694 
 9695     // if we took the 80-byte pre-processing path above (rscratch1 started at 79),
 9696     // rscratch1 == -1; otherwise, rscratch1 == 0.
 9697     __ cbzw(rscratch1, Exit);
 9698     __ sub(length, length, 80);
 9699 
 9700     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
 9701     __ cbz(isURL, SIMDEnter);
 9702     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
 9703 
 9704     __ BIND(SIMDEnter);
 9705     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
 9706     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
 9707     __ mov(rscratch1, 63);
 9708     __ dup(v27, __ T16B, rscratch1);
 9709 
 9710     __ BIND(Process64B);
 9711     __ cmp(length, (u1)64);
 9712     __ br(Assembler::LT, Process32B);
 9713     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
 9714     __ sub(length, length, 64);
 9715     __ b(Process64B);
 9716 
 9717     __ BIND(Process32B);
 9718     __ cmp(length, (u1)32);
 9719     __ br(Assembler::LT, SIMDExit);
 9720     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
 9721     __ sub(length, length, 32);
 9722     __ b(Process32B);
 9723 
 9724     __ BIND(SIMDExit);
 9725     __ cbz(length, Exit);
 9726     __ movw(rscratch1, length);
 9727     __ b(Process4B);
 9728 
 9729     __ BIND(Exit);
 9730     __ sub(c_rarg0, dst, doff);
 9731 
 9732     __ leave();
 9733     __ ret(lr);
 9734 
 9735     return start;
 9736   }
 9737 
 9738   // Support for spin waits.
 9739   address generate_spin_wait() {
 9740     __ align(CodeEntryAlignment);
 9741     StubGenStubId stub_id = StubGenStubId::spin_wait_id;
 9742     StubCodeMark mark(this, stub_id);
 9743     address start = __ pc();
 9744 
 9745     __ spin_wait();
 9746     __ ret(lr);
 9747 
 9748     return start;
 9749   }
 9750 
 9751   void generate_lookup_secondary_supers_table_stub() {
 9752     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
 9753     StubCodeMark mark(this, stub_id);
 9754 
 9755     const Register
 9756       r_super_klass  = r0,
 9757       r_array_base   = r1,
 9758       r_array_length = r2,
 9759       r_array_index  = r3,
 9760       r_sub_klass    = r4,
 9761       r_bitmap       = rscratch2,
 9762       result         = r5;
 9763     const FloatRegister
 9764       vtemp          = v0;
 9765 
 9766     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
 9767       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
 9768       Label L_success;
 9769       __ enter();
 9770       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
 9771                                              r_array_base, r_array_length, r_array_index,
 9772                                              vtemp, result, slot,
 9773                                              /*stub_is_near*/true);
 9774       __ leave();
 9775       __ ret(lr);
 9776     }
 9777   }
 9778 
 9779   // Slow path implementation for UseSecondarySupersTable.
 9780   address generate_lookup_secondary_supers_table_slow_path_stub() {
 9781     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
 9782     StubCodeMark mark(this, stub_id);
 9783 
 9784     address start = __ pc();
 9785     const Register
 9786       r_super_klass  = r0,        // argument
 9787       r_array_base   = r1,        // argument
 9788       temp1          = r2,        // temp
 9789       r_array_index  = r3,        // argument
 9790       r_bitmap       = rscratch2, // argument
 9791       result         = r5;        // argument
 9792 
 9793     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
 9794     __ ret(lr);
 9795 
 9796     return start;
 9797   }
 9798 
 9799 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 9800 
 9801   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
 9802   //
 9803   // If LSE is in use, generate LSE versions of all the stubs. The
 9804   // non-LSE versions are in atomic_aarch64.S.
 9805 
 9806   // class AtomicStubMark records the entry point of a stub and the
 9807   // stub pointer which will point to it. The stub pointer is set to
 9808   // the entry point when ~AtomicStubMark() is called, which must be
 9809   // after ICache::invalidate_range. This ensures safe publication of
 9810   // the generated code.
 9811   class AtomicStubMark {
 9812     address _entry_point;
 9813     aarch64_atomic_stub_t *_stub;
 9814     MacroAssembler *_masm;
 9815   public:
 9816     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
 9817       _masm = masm;
 9818       __ align(32);
 9819       _entry_point = __ pc();
 9820       _stub = stub;
 9821     }
 9822     ~AtomicStubMark() {
 9823       *_stub = (aarch64_atomic_stub_t)_entry_point;
 9824     }
 9825   };
 9826 
 9827   // NB: For memory_order_conservative we need a trailing membar after
 9828   // LSE atomic operations but not a leading membar.
 9829   //
 9830   // We don't need a leading membar because a clause in the Arm ARM
 9831   // says:
 9832   //
 9833   //   Barrier-ordered-before
 9834   //
 9835   //   Barrier instructions order prior Memory effects before subsequent
 9836   //   Memory effects generated by the same Observer. A read or a write
 9837   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
 9838   //   Observer if and only if RW1 appears in program order before RW2
 9839   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
 9840   //   instruction with both Acquire and Release semantics.
 9841   //
 9842   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
 9843   // and Release semantics, therefore we don't need a leading
 9844   // barrier. However, there is no corresponding Barrier-ordered-after
 9845   // relationship, therefore we need a trailing membar to prevent a
 9846   // later store or load from being reordered with the store in an
 9847   // atomic instruction.
 9848   //
 9849   // This was checked by using the herd7 consistency model simulator
 9850   // (http://diy.inria.fr/) with this test case:
 9851   //
 9852   // AArch64 LseCas
 9853   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
 9854   // P0 | P1;
 9855   // LDR W4, [X2] | MOV W3, #0;
 9856   // DMB LD       | MOV W4, #1;
 9857   // LDR W3, [X1] | CASAL W3, W4, [X1];
 9858   //              | DMB ISH;
 9859   //              | STR W4, [X2];
 9860   // exists
 9861   // (0:X3=0 /\ 0:X4=1)
 9862   //
 9863   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
 9864   // with the store to x in P1. Without the DMB in P1 this may happen.
 9865   //
 9866   // At the time of writing we don't know of any AArch64 hardware that
 9867   // reorders stores in this way, but the Reference Manual permits it.
 9868 
 9869   void gen_cas_entry(Assembler::operand_size size,
 9870                      atomic_memory_order order) {
 9871     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
 9872       exchange_val = c_rarg2;
 9873     bool acquire, release;
 9874     switch (order) {
 9875       case memory_order_relaxed:
 9876         acquire = false;
 9877         release = false;
 9878         break;
 9879       case memory_order_release:
 9880         acquire = false;
 9881         release = true;
 9882         break;
 9883       default:
 9884         acquire = true;
 9885         release = true;
 9886         break;
 9887     }
 9888     __ mov(prev, compare_val);
 9889     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
 9890     if (order == memory_order_conservative) {
 9891       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 9892     }
 9893     if (size == Assembler::xword) {
 9894       __ mov(r0, prev);
 9895     } else {
 9896       __ movw(r0, prev);
 9897     }
 9898     __ ret(lr);
 9899   }
 9900 
 9901   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
 9902     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 9903     // If not relaxed, then default to conservative.  Relaxed is the only
 9904     // case we use enough to be worth specializing.
 9905     if (order == memory_order_relaxed) {
 9906       __ ldadd(size, incr, prev, addr);
 9907     } else {
 9908       __ ldaddal(size, incr, prev, addr);
 9909       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 9910     }
 9911     if (size == Assembler::xword) {
 9912       __ mov(r0, prev);
 9913     } else {
 9914       __ movw(r0, prev);
 9915     }
 9916     __ ret(lr);
 9917   }
 9918 
 9919   void gen_swpal_entry(Assembler::operand_size size) {
 9920     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 9921     __ swpal(size, incr, prev, addr);
 9922     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 9923     if (size == Assembler::xword) {
 9924       __ mov(r0, prev);
 9925     } else {
 9926       __ movw(r0, prev);
 9927     }
 9928     __ ret(lr);
 9929   }
 9930 
 9931   void generate_atomic_entry_points() {
 9932     if (! UseLSE) {
 9933       return;
 9934     }
 9935     __ align(CodeEntryAlignment);
 9936     StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id;
 9937     StubCodeMark mark(this, stub_id);
 9938     address first_entry = __ pc();
 9939 
 9940     // ADD, memory_order_conservative
 9941     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
 9942     gen_ldadd_entry(Assembler::word, memory_order_conservative);
 9943     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
 9944     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
 9945 
 9946     // ADD, memory_order_relaxed
 9947     AtomicStubMark mark_fetch_add_4_relaxed
 9948       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
 9949     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
 9950     AtomicStubMark mark_fetch_add_8_relaxed
 9951       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
 9952     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
 9953 
 9954     // XCHG, memory_order_conservative
 9955     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
 9956     gen_swpal_entry(Assembler::word);
 9957     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
 9958     gen_swpal_entry(Assembler::xword);
 9959 
 9960     // CAS, memory_order_conservative
 9961     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
 9962     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
 9963     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
 9964     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
 9965     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
 9966     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
 9967 
 9968     // CAS, memory_order_relaxed
 9969     AtomicStubMark mark_cmpxchg_1_relaxed
 9970       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
 9971     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
 9972     AtomicStubMark mark_cmpxchg_4_relaxed
 9973       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
 9974     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
 9975     AtomicStubMark mark_cmpxchg_8_relaxed
 9976       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
 9977     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
 9978 
 9979     AtomicStubMark mark_cmpxchg_4_release
 9980       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
 9981     gen_cas_entry(MacroAssembler::word, memory_order_release);
 9982     AtomicStubMark mark_cmpxchg_8_release
 9983       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
 9984     gen_cas_entry(MacroAssembler::xword, memory_order_release);
 9985 
 9986     AtomicStubMark mark_cmpxchg_4_seq_cst
 9987       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
 9988     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
 9989     AtomicStubMark mark_cmpxchg_8_seq_cst
 9990       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
 9991     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
 9992 
 9993     ICache::invalidate_range(first_entry, __ pc() - first_entry);
 9994   }
 9995 #endif // LINUX
 9996 
 9997   address generate_cont_thaw(Continuation::thaw_kind kind) {
 9998     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
 9999     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10000 
10001     address start = __ pc();
10002 
10003     if (return_barrier) {
10004       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10005       __ mov(sp, rscratch1);
10006     }
10007     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10008 
10009     if (return_barrier) {
10010       // preserve possible return value from a method returning to the return barrier
10011       __ fmovd(rscratch1, v0);
10012       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10013     }
10014 
10015     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10016     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10017     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10018 
10019     if (return_barrier) {
10020       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10021       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10022       __ fmovd(v0, rscratch1);
10023     }
10024     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10025 
10026 
10027     Label thaw_success;
10028     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10029     __ cbnz(rscratch2, thaw_success);
10030     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10031     __ br(rscratch1);
10032     __ bind(thaw_success);
10033 
10034     // make room for the thawed frames
10035     __ sub(rscratch1, sp, rscratch2);
10036     __ andr(rscratch1, rscratch1, -16); // align
10037     __ mov(sp, rscratch1);
10038 
10039     if (return_barrier) {
10040       // save original return value -- again
10041       __ fmovd(rscratch1, v0);
10042       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10043     }
10044 
10045     // If we want, we can templatize thaw by kind, and have three different entries
10046     __ movw(c_rarg1, (uint32_t)kind);
10047 
10048     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10049     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10050 
10051     if (return_barrier) {
10052       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10053       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10054       __ fmovd(v0, rscratch1);
10055     } else {
10056       __ mov(r0, zr); // return 0 (success) from doYield
10057     }
10058 
10059     // we're now on the yield frame (which is at a higher address than us because sp has been pushed down)
10060     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10061     __ mov(rfp, sp);
10062 
10063     if (return_barrier_exception) {
10064       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10065       __ authenticate_return_address(c_rarg1);
10066       __ verify_oop(r0);
10067       // save return value containing the exception oop in callee-saved R19
10068       __ mov(r19, r0);
10069 
10070       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10071 
10072       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10073       // __ reinitialize_ptrue();
10074 
10075       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10076 
10077       __ mov(r1, r0); // the exception handler
10078       __ mov(r0, r19); // restore return value containing the exception oop
10079       __ verify_oop(r0);
10080 
10081       __ leave();
10082       __ mov(r3, lr);
10083       __ br(r1); // the exception handler
10084     } else {
10085       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10086       __ leave();
10087       __ ret(lr);
10088     }
10089 
10090     return start;
10091   }
10092 
10093   address generate_cont_thaw() {
10094     if (!Continuations::enabled()) return nullptr;
10095 
10096     StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
10097     StubCodeMark mark(this, stub_id);
10098     address start = __ pc();
10099     generate_cont_thaw(Continuation::thaw_top);
10100     return start;
10101   }
10102 
10103   address generate_cont_returnBarrier() {
10104     if (!Continuations::enabled()) return nullptr;
10105 
10106     // TODO: will probably need multiple return barriers depending on return type
10107     StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
10108     StubCodeMark mark(this, stub_id);
10109     address start = __ pc();
10110 
10111     generate_cont_thaw(Continuation::thaw_return_barrier);
10112 
10113     return start;
10114   }
10115 
10116   address generate_cont_returnBarrier_exception() {
10117     if (!Continuations::enabled()) return nullptr;
10118 
10119     StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
10120     StubCodeMark mark(this, stub_id);
10121     address start = __ pc();
10122 
10123     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10124 
10125     return start;
10126   }
10127 
10128   address generate_cont_preempt_stub() {
10129     if (!Continuations::enabled()) return nullptr;
10130     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
10131     StubCodeMark mark(this, stub_id);
10132     address start = __ pc();
10133 
10134     __ reset_last_Java_frame(true);
10135 
10136     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10137     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10138     __ mov(sp, rscratch2);
10139 
10140     Label preemption_cancelled;
10141     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10142     __ cbnz(rscratch1, preemption_cancelled);
10143 
10144     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10145     SharedRuntime::continuation_enter_cleanup(_masm);
10146     __ leave();
10147     __ ret(lr);
10148 
10149     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10150     __ bind(preemption_cancelled);
10151     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10152     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10153     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10154     __ ldr(rscratch1, Address(rscratch1));
10155     __ br(rscratch1);
10156 
10157     return start;
10158   }
10159 
10160   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10161   // are represented as long[5], with BITS_PER_LIMB = 26.
10162   // Pack five 26-bit limbs into three 64-bit registers.
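        // Equivalently (illustrative only, not generated code), with limbs l0..l4:
        //   dest0 = l0 | (l1 << 26) | (l2 << 52)          // bits   0..63
        //   dest1 = (l2 >> 12) | (l3 << 14) | (l4 << 40)  // bits  64..127
        //   dest2 = l4 >> 24                              // bits 128..129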
10163   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10164     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10165     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10166     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10167     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10168 
10169     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10170     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10171     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10172     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10173 
10174     if (dest2->is_valid()) {
10175       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10176     } else {
10177 #ifdef ASSERT
10178       Label OK;
10179       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10180       __ br(__ EQ, OK);
10181       __ stop("high bits of Poly1305 integer should be zero");
10182       __ should_not_reach_here();
10183       __ bind(OK);
10184 #endif
10185     }
10186   }
10187 
10188   // As above, but return only a 128-bit integer, packed into two
10189   // 64-bit registers.
10190   void pack_26(Register dest0, Register dest1, Register src) {
10191     pack_26(dest0, dest1, noreg, src);
10192   }
10193 
10194   // Multiply and multiply-accumulate unsigned 64-bit registers.
10195   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10196     __ mul(prod_lo, n, m);
10197     __ umulh(prod_hi, n, m);
10198   }
10199   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10200     wide_mul(rscratch1, rscratch2, n, m);
10201     __ adds(sum_lo, sum_lo, rscratch1);
10202     __ adc(sum_hi, sum_hi, rscratch2);
10203   }
10204 
10205   // Poly1305, RFC 7539
10206 
10207   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10208   // description of the tricks used to simplify and accelerate this
10209   // computation.
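        // In effect, each 16-byte block is absorbed as (illustrative, not the
        // generated code):  U <- ((U + block + 2^128) * R) mod (2^130 - 5),
        // with the accumulator U kept only partially reduced between blocks.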
10210 
10211   address generate_poly1305_processBlocks() {
10212     __ align(CodeEntryAlignment);
10213     StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
10214     StubCodeMark mark(this, stub_id);
10215     address start = __ pc();
10216     Label here;
10217     __ enter();
10218     RegSet callee_saved = RegSet::range(r19, r28);
10219     __ push(callee_saved, sp);
10220 
10221     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10222 
10223     // Arguments
10224     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10225 
10226     // R_n is the 128-bit randomly-generated key, packed into two
10227     // registers.  The caller passes this key to us as long[5], with
10228     // BITS_PER_LIMB = 26.
10229     const Register R_0 = *++regs, R_1 = *++regs;
10230     pack_26(R_0, R_1, r_start);
10231 
10232     // RR_n is (R_n >> 2) * 5
10233     const Register RR_0 = *++regs, RR_1 = *++regs;
10234     __ lsr(RR_0, R_0, 2);
10235     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10236     __ lsr(RR_1, R_1, 2);
10237     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10238 
10239     // U_n is the current checksum
10240     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10241     pack_26(U_0, U_1, U_2, acc_start);
10242 
10243     static constexpr int BLOCK_LENGTH = 16;
10244     Label DONE, LOOP;
10245 
10246     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10247     __ br(Assembler::LT, DONE); {
10248       __ bind(LOOP);
10249 
10250       // S_n is to be the sum of U_n and the next block of data
10251       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10252       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10253       __ adds(S_0, U_0, S_0);
10254       __ adcs(S_1, U_1, S_1);
10255       __ adc(S_2, U_2, zr);
10256       __ add(S_2, S_2, 1);
10257 
10258       const Register U_0HI = *++regs, U_1HI = *++regs;
10259 
10260       // NB: this logic depends on some of the special properties of
10261       // Poly1305 keys. In particular, because we know that the top
10262       // four bits of R_0 and R_1 are zero, we can add together
10263       // partial products without any risk of needing to propagate a
10264       // carry out.
10265       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10266       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10267       __ andr(U_2, R_0, 3);
10268       __ mul(U_2, S_2, U_2);
10269 
10270       // Recycle registers S_0, S_1, S_2
10271       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10272 
10273       // Partial reduction mod 2**130 - 5
10274       __ adds(U_1, U_0HI, U_1);
10275       __ adc(U_2, U_1HI, U_2);
10276       // Sum now in U_2:U_1:U_0.
10277       // Dead: U_0HI, U_1HI.
10278       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10279 
10280       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10281 
10282       // First, U_2:U_1:U_0 += (U_2 >> 2)
10283       __ lsr(rscratch1, U_2, 2);
10284       __ andr(U_2, U_2, (u8)3);
10285       __ adds(U_0, U_0, rscratch1);
10286       __ adcs(U_1, U_1, zr);
10287       __ adc(U_2, U_2, zr);
10288       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10289       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10290       __ adcs(U_1, U_1, zr);
10291       __ adc(U_2, U_2, zr);
10292 
10293       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10294       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10295       __ br(~ Assembler::LT, LOOP);
10296     }
10297 
10298     // Further reduce modulo 2^130 - 5
10299     __ lsr(rscratch1, U_2, 2);
10300     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10301     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10302     __ adcs(U_1, U_1, zr);
10303     __ andr(U_2, U_2, (u1)3);
10304     __ adc(U_2, U_2, zr);
10305 
10306     // Unpack the sum into five 26-bit limbs and write to memory.
10307     __ ubfiz(rscratch1, U_0, 0, 26);
10308     __ ubfx(rscratch2, U_0, 26, 26);
10309     __ stp(rscratch1, rscratch2, Address(acc_start));
10310     __ ubfx(rscratch1, U_0, 52, 12);
10311     __ bfi(rscratch1, U_1, 12, 14);
10312     __ ubfx(rscratch2, U_1, 14, 26);
10313     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10314     __ ubfx(rscratch1, U_1, 40, 24);
10315     __ bfi(rscratch1, U_2, 24, 3);
10316     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10317 
10318     __ bind(DONE);
10319     __ pop(callee_saved, sp);
10320     __ leave();
10321     __ ret(lr);
10322 
10323     return start;
10324   }
10325 
10326   // exception handler for upcall stubs
10327   address generate_upcall_stub_exception_handler() {
10328     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
10329     StubCodeMark mark(this, stub_id);
10330     address start = __ pc();
10331 
10332     // Native caller has no idea how to handle exceptions,
10333     // so we just crash here. Up to callee to catch exceptions.
10334     __ verify_oop(r0);
10335     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10336     __ blr(rscratch1);
10337     __ should_not_reach_here();
10338 
10339     return start;
10340   }
10341 
10342   // load Method* target of MethodHandle
10343   // j_rarg0 = jobject receiver
10344   // rmethod = result
10345   address generate_upcall_stub_load_target() {
10346     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
10347     StubCodeMark mark(this, stub_id);
10348     address start = __ pc();
10349 
10350     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10351     // Load target method from receiver
10352     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10353     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10354     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10355     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10356                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10357                       noreg, noreg);
10358     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10359 
10360     __ ret(lr);
10361 
10362     return start;
10363   }
10364 
10365 #undef __
10366 #define __ masm->
10367 
10368   class MontgomeryMultiplyGenerator : public MacroAssembler {
10369 
10370     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10371       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10372 
10373     RegSet _toSave;
10374     bool _squaring;
10375 
10376   public:
10377     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10378       : MacroAssembler(as->code()), _squaring(squaring) {
10379 
10380       // Register allocation
10381 
10382       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10383       Pa_base = *regs;       // Argument registers
10384       if (squaring)
10385         Pb_base = Pa_base;
10386       else
10387         Pb_base = *++regs;
10388       Pn_base = *++regs;
10389       Rlen= *++regs;
10390       inv = *++regs;
10391       Pm_base = *++regs;
10392 
10393                           // Working registers:
10394       Ra =  *++regs;        // The current digit of a, b, n, and m.
10395       Rb =  *++regs;
10396       Rm =  *++regs;
10397       Rn =  *++regs;
10398 
10399       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10400       Pb =  *++regs;
10401       Pm =  *++regs;
10402       Pn =  *++regs;
10403 
10404       t0 =  *++regs;        // Three registers which form a
10405       t1 =  *++regs;        // triple-precision accumulator.
10406       t2 =  *++regs;
10407 
10408       Ri =  *++regs;        // Inner and outer loop indexes.
10409       Rj =  *++regs;
10410 
10411       Rhi_ab = *++regs;     // Product registers: low and high parts
10412       Rlo_ab = *++regs;     // of a*b and m*n.
10413       Rhi_mn = *++regs;
10414       Rlo_mn = *++regs;
10415 
10416       // r19 and up are callee-saved.
10417       _toSave = RegSet::range(r19, *regs) + Pm_base;
10418     }
10419 
10420   private:
10421     void save_regs() {
10422       push(_toSave, sp);
10423     }
10424 
10425     void restore_regs() {
10426       pop(_toSave, sp);
10427     }
10428 
10429     template <typename T>
10430     void unroll_2(Register count, T block) {
10431       Label loop, end, odd;
10432       tbnz(count, 0, odd);
10433       cbz(count, end);
10434       align(16);
10435       bind(loop);
10436       (this->*block)();
10437       bind(odd);
10438       (this->*block)();
10439       subs(count, count, 2);
10440       br(Assembler::GT, loop);
10441       bind(end);
10442     }
10443 
10444     template <typename T>
10445     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10446       Label loop, end, odd;
10447       tbnz(count, 0, odd);
10448       cbz(count, end);
10449       align(16);
10450       bind(loop);
10451       (this->*block)(d, s, tmp);
10452       bind(odd);
10453       (this->*block)(d, s, tmp);
10454       subs(count, count, 2);
10455       br(Assembler::GT, loop);
10456       bind(end);
10457     }
10458 
10459     void pre1(RegisterOrConstant i) {
10460       block_comment("pre1");
10461       // Pa = Pa_base;
10462       // Pb = Pb_base + i;
10463       // Pm = Pm_base;
10464       // Pn = Pn_base + i;
10465       // Ra = *Pa;
10466       // Rb = *Pb;
10467       // Rm = *Pm;
10468       // Rn = *Pn;
10469       ldr(Ra, Address(Pa_base));
10470       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10471       ldr(Rm, Address(Pm_base));
10472       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10473       lea(Pa, Address(Pa_base));
10474       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10475       lea(Pm, Address(Pm_base));
10476       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10477 
10478       // Zero the m*n result.
10479       mov(Rhi_mn, zr);
10480       mov(Rlo_mn, zr);
10481     }
10482 
10483     // The core multiply-accumulate step of a Montgomery
10484     // multiplication.  The idea is to schedule operations as a
10485     // pipeline so that instructions with long latencies (loads and
10486     // multiplies) have time to complete before their results are
10487     // used.  This most benefits in-order implementations of the
10488     // architecture but out-of-order ones also benefit.
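          // Each step accumulates the current Ra*Rb product and the Rm*Rn product
          // left pending from the previous iteration into the triple-precision
          // accumulator t2:t1:t0, computes the next pending Rm*Rn, and loads the
          // next digits of a, b, m and n.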
10489     void step() {
10490       block_comment("step");
10491       // MACC(Ra, Rb, t0, t1, t2);
10492       // Ra = *++Pa;
10493       // Rb = *--Pb;
10494       umulh(Rhi_ab, Ra, Rb);
10495       mul(Rlo_ab, Ra, Rb);
10496       ldr(Ra, pre(Pa, wordSize));
10497       ldr(Rb, pre(Pb, -wordSize));
10498       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
10499                                        // previous iteration.
10500       // MACC(Rm, Rn, t0, t1, t2);
10501       // Rm = *++Pm;
10502       // Rn = *--Pn;
10503       umulh(Rhi_mn, Rm, Rn);
10504       mul(Rlo_mn, Rm, Rn);
10505       ldr(Rm, pre(Pm, wordSize));
10506       ldr(Rn, pre(Pn, -wordSize));
10507       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10508     }
10509 
10510     void post1() {
10511       block_comment("post1");
10512 
10513       // MACC(Ra, Rb, t0, t1, t2);
10514       // Ra = *++Pa;
10515       // Rb = *--Pb;
10516       umulh(Rhi_ab, Ra, Rb);
10517       mul(Rlo_ab, Ra, Rb);
10518       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10519       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10520 
10521       // *Pm = Rm = t0 * inv;
10522       mul(Rm, t0, inv);
10523       str(Rm, Address(Pm));
10524 
10525       // MACC(Rm, Rn, t0, t1, t2);
10526       // t0 = t1; t1 = t2; t2 = 0;
10527       umulh(Rhi_mn, Rm, Rn);
10528 
10529 #ifndef PRODUCT
10530       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
10531       {
10532         mul(Rlo_mn, Rm, Rn);
10533         add(Rlo_mn, t0, Rlo_mn);
10534         Label ok;
10535         cbz(Rlo_mn, ok); {
10536           stop("broken Montgomery multiply");
10537         } bind(ok);
10538       }
10539 #endif
10540       // We have very carefully set things up so that
10541       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
10542       // the lower half of Rm * Rn because we know the result already:
10543       // it must be -t0.  t0 + (-t0) must generate a carry iff
10544       // t0 != 0.  So, rather than do a mul and an adds we just set
10545       // the carry flag iff t0 is nonzero.
10546       //
10547       // mul(Rlo_mn, Rm, Rn);
10548       // adds(zr, t0, Rlo_mn);
10549       subs(zr, t0, 1); // Set carry iff t0 is nonzero
10550       adcs(t0, t1, Rhi_mn);
10551       adc(t1, t2, zr);
10552       mov(t2, zr);
10553     }
10554 
10555     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
10556       block_comment("pre2");
10557       // Pa = Pa_base + i-len;
10558       // Pb = Pb_base + len;
10559       // Pm = Pm_base + i-len;
10560       // Pn = Pn_base + len;
10561 
10562       if (i.is_register()) {
10563         sub(Rj, i.as_register(), len);
10564       } else {
10565         mov(Rj, i.as_constant());
10566         sub(Rj, Rj, len);
10567       }
10568       // Rj == i-len
10569 
10570       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
10571       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
10572       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
10573       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
10574 
10575       // Ra = *++Pa;
10576       // Rb = *--Pb;
10577       // Rm = *++Pm;
10578       // Rn = *--Pn;
10579       ldr(Ra, pre(Pa, wordSize));
10580       ldr(Rb, pre(Pb, -wordSize));
10581       ldr(Rm, pre(Pm, wordSize));
10582       ldr(Rn, pre(Pn, -wordSize));
10583 
10584       mov(Rhi_mn, zr);
10585       mov(Rlo_mn, zr);
10586     }
10587 
10588     void post2(RegisterOrConstant i, RegisterOrConstant len) {
10589       block_comment("post2");
10590       if (i.is_constant()) {
10591         mov(Rj, i.as_constant()-len.as_constant());
10592       } else {
10593         sub(Rj, i.as_register(), len);
10594       }
10595 
10596       adds(t0, t0, Rlo_mn); // The pending m*n, low part
10597 
10598       // As soon as we know the least significant digit of our result,
10599       // store it.
10600       // Pm_base[i-len] = t0;
10601       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
10602 
10603       // t0 = t1; t1 = t2; t2 = 0;
10604       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
10605       adc(t1, t2, zr);
10606       mov(t2, zr);
10607     }
10608 
10609     // A carry in t0 after Montgomery multiplication means that we
10610     // should subtract multiples of n from our result in m.  We'll
10611     // keep doing that until there is no carry.
10612     void normalize(RegisterOrConstant len) {
10613       block_comment("normalize");
10614       // while (t0)
10615       //   t0 = sub(Pm_base, Pn_base, t0, len);
10616       Label loop, post, again;
10617       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
10618       cbz(t0, post); {
10619         bind(again); {
10620           mov(i, zr);
10621           mov(cnt, len);
10622           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
10623           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10624           subs(zr, zr, zr); // set carry flag, i.e. no borrow
10625           align(16);
10626           bind(loop); {
10627             sbcs(Rm, Rm, Rn);
10628             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
10629             add(i, i, 1);
10630             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
10631             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10632             sub(cnt, cnt, 1);
10633           } cbnz(cnt, loop);
10634           sbc(t0, t0, zr);
10635         } cbnz(t0, again);
10636       } bind(post);
10637     }
10638 
10639     // Move memory at s to d, reversing words.
10640     //    Increments d to end of copied memory
10641     //    Destroys tmp1, tmp2
10642     //    Preserves len
10643     //    Leaves s pointing to the address which was in d at start
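          //    Note: each 64-bit word is rotated by 32 bits as it is copied, so the
          //    net effect is to reverse the order of the 32-bit (jint) elements.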
10644     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
10645       assert(tmp1->encoding() < r19->encoding(), "register corruption");
10646       assert(tmp2->encoding() < r19->encoding(), "register corruption");
10647 
10648       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
10649       mov(tmp1, len);
10650       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
10651       sub(s, d, len, ext::uxtw, LogBytesPerWord);
10652     }
10653     // where
10654     void reverse1(Register d, Register s, Register tmp) {
10655       ldr(tmp, pre(s, -wordSize));
10656       ror(tmp, tmp, 32);
10657       str(tmp, post(d, wordSize));
10658     }
10659 
10660     void step_squaring() {
10661       // An extra ACC
10662       step();
10663       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10664     }
10665 
10666     void last_squaring(RegisterOrConstant i) {
10667       Label dont;
10668       // if ((i & 1) == 0) {
10669       tbnz(i.as_register(), 0, dont); {
10670         // MACC(Ra, Rb, t0, t1, t2);
10671         // Ra = *++Pa;
10672         // Rb = *--Pb;
10673         umulh(Rhi_ab, Ra, Rb);
10674         mul(Rlo_ab, Ra, Rb);
10675         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10676       } bind(dont);
10677     }
10678 
10679     void extra_step_squaring() {
10680       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10681 
10682       // MACC(Rm, Rn, t0, t1, t2);
10683       // Rm = *++Pm;
10684       // Rn = *--Pn;
10685       umulh(Rhi_mn, Rm, Rn);
10686       mul(Rlo_mn, Rm, Rn);
10687       ldr(Rm, pre(Pm, wordSize));
10688       ldr(Rn, pre(Pn, -wordSize));
10689     }
10690 
10691     void post1_squaring() {
10692       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10693 
10694       // *Pm = Rm = t0 * inv;
10695       mul(Rm, t0, inv);
10696       str(Rm, Address(Pm));
10697 
10698       // MACC(Rm, Rn, t0, t1, t2);
10699       // t0 = t1; t1 = t2; t2 = 0;
10700       umulh(Rhi_mn, Rm, Rn);
10701 
10702 #ifndef PRODUCT
10703       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
10704       {
10705         mul(Rlo_mn, Rm, Rn);
10706         add(Rlo_mn, t0, Rlo_mn);
10707         Label ok;
10708         cbz(Rlo_mn, ok); {
10709           stop("broken Montgomery multiply");
10710         } bind(ok);
10711       }
10712 #endif
10713       // We have very carefully set things up so that
10714       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
10715       // the lower half of Rm * Rn because we know the result already:
10716       // it must be -t0.  t0 + (-t0) must generate a carry iff
10717       // t0 != 0.  So, rather than do a mul and an adds we just set
10718       // the carry flag iff t0 is nonzero.
10719       //
10720       // mul(Rlo_mn, Rm, Rn);
10721       // adds(zr, t0, Rlo_mn);
10722       subs(zr, t0, 1); // Set carry iff t0 is nonzero
10723       adcs(t0, t1, Rhi_mn);
10724       adc(t1, t2, zr);
10725       mov(t2, zr);
10726     }
10727 
10728     void acc(Register Rhi, Register Rlo,
10729              Register t0, Register t1, Register t2) {
10730       adds(t0, t0, Rlo);
10731       adcs(t1, t1, Rhi);
10732       adc(t2, t2, zr);
10733     }
10734 
10735   public:
10736     /**
10737      * Fast Montgomery multiplication.  The derivation of the
10738      * algorithm is in A Cryptographic Library for the Motorola
10739      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
10740      *
10741      * Arguments:
10742      *
10743      * Inputs for multiplication:
10744      *   c_rarg0   - int array elements a
10745      *   c_rarg1   - int array elements b
10746      *   c_rarg2   - int array elements n (the modulus)
10747      *   c_rarg3   - int length
10748      *   c_rarg4   - int inv
10749      *   c_rarg5   - int array elements m (the result)
10750      *
10751      * Inputs for squaring:
10752      *   c_rarg0   - int array elements a
10753      *   c_rarg1   - int array elements n (the modulus)
10754      *   c_rarg2   - int length
10755      *   c_rarg3   - int inv
10756      *   c_rarg4   - int array elements m (the result)
10757      *
10758      */
10759     address generate_multiply() {
10760       Label argh, nothing;
10761       bind(argh);
10762       stop("MontgomeryMultiply total_allocation must be <= 8192");
10763 
10764       align(CodeEntryAlignment);
10765       address entry = pc();
10766 
10767       cbzw(Rlen, nothing);
10768 
10769       enter();
10770 
10771       // Make room.
10772       cmpw(Rlen, 512);
10773       br(Assembler::HI, argh);
10774       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
10775       andr(sp, Ra, -2 * wordSize);
10776 
10777       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
10778 
10779       {
10780         // Copy input args, reversing as we go.  We use Ra as a
10781         // temporary variable.
10782         reverse(Ra, Pa_base, Rlen, t0, t1);
10783         if (!_squaring)
10784           reverse(Ra, Pb_base, Rlen, t0, t1);
10785         reverse(Ra, Pn_base, Rlen, t0, t1);
10786       }
10787 
10788       // Push all call-saved registers and also Pm_base which we'll need
10789       // at the end.
10790       save_regs();
10791 
10792 #ifndef PRODUCT
10793       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
10794       {
10795         ldr(Rn, Address(Pn_base, 0));
10796         mul(Rlo_mn, Rn, inv);
10797         subs(zr, Rlo_mn, -1);
10798         Label ok;
10799         br(EQ, ok); {
10800           stop("broken inverse in Montgomery multiply");
10801         } bind(ok);
10802       }
10803 #endif
10804 
10805       mov(Pm_base, Ra);
10806 
10807       mov(t0, zr);
10808       mov(t1, zr);
10809       mov(t2, zr);
10810 
10811       block_comment("for (int i = 0; i < len; i++) {");
10812       mov(Ri, zr); {
10813         Label loop, end;
10814         cmpw(Ri, Rlen);
10815         br(Assembler::GE, end);
10816 
10817         bind(loop);
10818         pre1(Ri);
10819 
10820         block_comment("  for (j = i; j; j--) {"); {
10821           movw(Rj, Ri);
10822           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
10823         } block_comment("  } // j");
10824 
10825         post1();
10826         addw(Ri, Ri, 1);
10827         cmpw(Ri, Rlen);
10828         br(Assembler::LT, loop);
10829         bind(end);
10830         block_comment("} // i");
10831       }
10832 
10833       block_comment("for (int i = len; i < 2*len; i++) {");
10834       mov(Ri, Rlen); {
10835         Label loop, end;
10836         cmpw(Ri, Rlen, Assembler::LSL, 1);
10837         br(Assembler::GE, end);
10838 
10839         bind(loop);
10840         pre2(Ri, Rlen);
10841 
10842         block_comment("  for (j = len*2-i-1; j; j--) {"); {
10843           lslw(Rj, Rlen, 1);
10844           subw(Rj, Rj, Ri);
10845           subw(Rj, Rj, 1);
10846           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
10847         } block_comment("  } // j");
10848 
10849         post2(Ri, Rlen);
10850         addw(Ri, Ri, 1);
10851         cmpw(Ri, Rlen, Assembler::LSL, 1);
10852         br(Assembler::LT, loop);
10853         bind(end);
10854       }
10855       block_comment("} // i");
10856 
10857       normalize(Rlen);
10858 
10859       mov(Ra, Pm_base);  // Save Pm_base in Ra
10860       restore_regs();  // Restore caller's Pm_base
10861 
10862       // Copy our result into caller's Pm_base
10863       reverse(Pm_base, Ra, Rlen, t0, t1);
10864 
10865       leave();
10866       bind(nothing);
10867       ret(lr);
10868 
10869       return entry;
10870     }
10871     // In C, approximately:
10872 
10873     // void
10874     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
10875     //                     julong Pn_base[], julong Pm_base[],
10876     //                     julong inv, int len) {
10877     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
10878     //   julong *Pa, *Pb, *Pn, *Pm;
10879     //   julong Ra, Rb, Rn, Rm;
10880 
10881     //   int i;
10882 
10883     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
10884 
10885     //   for (i = 0; i < len; i++) {
10886     //     int j;
10887 
10888     //     Pa = Pa_base;
10889     //     Pb = Pb_base + i;
10890     //     Pm = Pm_base;
10891     //     Pn = Pn_base + i;
10892 
10893     //     Ra = *Pa;
10894     //     Rb = *Pb;
10895     //     Rm = *Pm;
10896     //     Rn = *Pn;
10897 
10898     //     int iters = i;
10899     //     for (j = 0; iters--; j++) {
10900     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
10901     //       MACC(Ra, Rb, t0, t1, t2);
10902     //       Ra = *++Pa;
10903     //       Rb = *--Pb;
10904     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
10905     //       MACC(Rm, Rn, t0, t1, t2);
10906     //       Rm = *++Pm;
10907     //       Rn = *--Pn;
10908     //     }
10909 
10910     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
10911     //     MACC(Ra, Rb, t0, t1, t2);
10912     //     *Pm = Rm = t0 * inv;
10913     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
10914     //     MACC(Rm, Rn, t0, t1, t2);
10915 
10916     //     assert(t0 == 0, "broken Montgomery multiply");
10917 
10918     //     t0 = t1; t1 = t2; t2 = 0;
10919     //   }
10920 
10921     //   for (i = len; i < 2*len; i++) {
10922     //     int j;
10923 
10924     //     Pa = Pa_base + i-len;
10925     //     Pb = Pb_base + len;
10926     //     Pm = Pm_base + i-len;
10927     //     Pn = Pn_base + len;
10928 
10929     //     Ra = *++Pa;
10930     //     Rb = *--Pb;
10931     //     Rm = *++Pm;
10932     //     Rn = *--Pn;
10933 
10934     //     int iters = len*2-i-1;
10935     //     for (j = i-len+1; iters--; j++) {
10936     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
10937     //       MACC(Ra, Rb, t0, t1, t2);
10938     //       Ra = *++Pa;
10939     //       Rb = *--Pb;
10940     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
10941     //       MACC(Rm, Rn, t0, t1, t2);
10942     //       Rm = *++Pm;
10943     //       Rn = *--Pn;
10944     //     }
10945 
10946     //     Pm_base[i-len] = t0;
10947     //     t0 = t1; t1 = t2; t2 = 0;
10948     //   }
10949 
10950     //   while (t0)
10951     //     t0 = sub(Pm_base, Pn_base, t0, len);
10952     // }
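
          // where MACC, which is not spelled out in this file, is
          // approximately (a sketch):
          //
          //   MACC(A, B, t0, t1, t2):  (t2:t1:t0) += (julong)A * B
          //
          // i.e. the full 128-bit product of A and B is added into the
          // triple-precision accumulator with carry propagation (the acc()
          // helper above).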
10953 
10954     /**
10955      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
10956      * multiplies than Montgomery multiplication so it should be up to
10957      * 25% faster.  However, its loop control is more complex and it
10958      * may actually run slower on some machines.
10959      *
10960      * Arguments:
10961      *
10962      * Inputs:
10963      *   c_rarg0   - int array elements a
10964      *   c_rarg1   - int array elements n (the modulus)
10965      *   c_rarg2   - int length
10966      *   c_rarg3   - int inv
10967      *   c_rarg4   - int array elements m (the result)
10968      *
10969      */
10970     address generate_square() {
10971       Label argh;
10972       bind(argh);
10973       stop("MontgomeryMultiply total_allocation must be <= 8192");
10974 
10975       align(CodeEntryAlignment);
10976       address entry = pc();
10977 
10978       enter();
10979 
10980       // Make room.
10981       cmpw(Rlen, 512);
10982       br(Assembler::HI, argh);
10983       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
10984       andr(sp, Ra, -2 * wordSize);
10985 
10986       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
10987 
10988       {
10989         // Copy input args, reversing as we go.  We use Ra as a
10990         // temporary variable.
10991         reverse(Ra, Pa_base, Rlen, t0, t1);
10992         reverse(Ra, Pn_base, Rlen, t0, t1);
10993       }
10994 
10995       // Push all call-saved registers and also Pm_base which we'll need
10996       // at the end.
10997       save_regs();
10998 
10999       mov(Pm_base, Ra);
11000 
11001       mov(t0, zr);
11002       mov(t1, zr);
11003       mov(t2, zr);
11004 
11005       block_comment("for (int i = 0; i < len; i++) {");
11006       mov(Ri, zr); {
11007         Label loop, end;
11008         bind(loop);
11009         cmp(Ri, Rlen);
11010         br(Assembler::GE, end);
11011 
11012         pre1(Ri);
11013 
11014         block_comment("for (j = (i+1)/2; j; j--) {"); {
11015           add(Rj, Ri, 1);
11016           lsr(Rj, Rj, 1);
11017           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11018         } block_comment("  } // j");
11019 
11020         last_squaring(Ri);
11021 
11022         block_comment("  for (j = i/2; j; j--) {"); {
11023           lsr(Rj, Ri, 1);
11024           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11025         } block_comment("  } // j");
11026 
11027         post1_squaring();
11028         add(Ri, Ri, 1);
11029         cmp(Ri, Rlen);
11030         br(Assembler::LT, loop);
11031 
11032         bind(end);
11033         block_comment("} // i");
11034       }
11035 
11036       block_comment("for (int i = len; i < 2*len; i++) {");
11037       mov(Ri, Rlen); {
11038         Label loop, end;
11039         bind(loop);
11040         cmp(Ri, Rlen, Assembler::LSL, 1);
11041         br(Assembler::GE, end);
11042 
11043         pre2(Ri, Rlen);
11044 
11045         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11046           lsl(Rj, Rlen, 1);
11047           sub(Rj, Rj, Ri);
11048           sub(Rj, Rj, 1);
11049           lsr(Rj, Rj, 1);
11050           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11051         } block_comment("  } // j");
11052 
11053         last_squaring(Ri);
11054 
11055         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11056           lsl(Rj, Rlen, 1);
11057           sub(Rj, Rj, Ri);
11058           lsr(Rj, Rj, 1);
11059           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11060         } block_comment("  } // j");
11061 
11062         post2(Ri, Rlen);
11063         add(Ri, Ri, 1);
11064         cmp(Ri, Rlen, Assembler::LSL, 1);
11065 
11066         br(Assembler::LT, loop);
11067         bind(end);
11068         block_comment("} // i");
11069       }
11070 
11071       normalize(Rlen);
11072 
11073       mov(Ra, Pm_base);  // Save Pm_base in Ra
11074       restore_regs();  // Restore caller's Pm_base
11075 
11076       // Copy our result into caller's Pm_base
11077       reverse(Pm_base, Ra, Rlen, t0, t1);
11078 
11079       leave();
11080       ret(lr);
11081 
11082       return entry;
11083     }
11084     // In C, approximately:
11085 
11086     // void
11087     // montgomery_square(julong Pa_base[], julong Pn_base[],
11088     //                   julong Pm_base[], julong inv, int len) {
11089     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11090     //   julong *Pa, *Pb, *Pn, *Pm;
11091     //   julong Ra, Rb, Rn, Rm;
11092 
11093     //   int i;
11094 
11095     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11096 
11097     //   for (i = 0; i < len; i++) {
11098     //     int j;
11099 
11100     //     Pa = Pa_base;
11101     //     Pb = Pa_base + i;
11102     //     Pm = Pm_base;
11103     //     Pn = Pn_base + i;
11104 
11105     //     Ra = *Pa;
11106     //     Rb = *Pb;
11107     //     Rm = *Pm;
11108     //     Rn = *Pn;
11109 
11110     //     int iters = (i+1)/2;
11111     //     for (j = 0; iters--; j++) {
11112     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11113     //       MACC2(Ra, Rb, t0, t1, t2);
11114     //       Ra = *++Pa;
11115     //       Rb = *--Pb;
11116     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11117     //       MACC(Rm, Rn, t0, t1, t2);
11118     //       Rm = *++Pm;
11119     //       Rn = *--Pn;
11120     //     }
11121     //     if ((i & 1) == 0) {
11122     //       assert(Ra == Pa_base[j], "must be");
11123     //       MACC(Ra, Ra, t0, t1, t2);
11124     //     }
11125     //     iters = i/2;
11126     //     assert(iters == i-j, "must be");
11127     //     for (; iters--; j++) {
11128     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11129     //       MACC(Rm, Rn, t0, t1, t2);
11130     //       Rm = *++Pm;
11131     //       Rn = *--Pn;
11132     //     }
11133 
11134     //     *Pm = Rm = t0 * inv;
11135     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11136     //     MACC(Rm, Rn, t0, t1, t2);
11137 
11138     //     assert(t0 == 0, "broken Montgomery multiply");
11139 
11140     //     t0 = t1; t1 = t2; t2 = 0;
11141     //   }
11142 
11143     //   for (i = len; i < 2*len; i++) {
11144     //     int start = i-len+1;
11145     //     int end = start + (len - start)/2;
11146     //     int j;
11147 
11148     //     Pa = Pa_base + i-len;
11149     //     Pb = Pa_base + len;
11150     //     Pm = Pm_base + i-len;
11151     //     Pn = Pn_base + len;
11152 
11153     //     Ra = *++Pa;
11154     //     Rb = *--Pb;
11155     //     Rm = *++Pm;
11156     //     Rn = *--Pn;
11157 
11158     //     int iters = (2*len-i-1)/2;
11159     //     assert(iters == end-start, "must be");
11160     //     for (j = start; iters--; j++) {
11161     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11162     //       MACC2(Ra, Rb, t0, t1, t2);
11163     //       Ra = *++Pa;
11164     //       Rb = *--Pb;
11165     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11166     //       MACC(Rm, Rn, t0, t1, t2);
11167     //       Rm = *++Pm;
11168     //       Rn = *--Pn;
11169     //     }
11170     //     if ((i & 1) == 0) {
11171     //       assert(Ra == Pa_base[j], "must be");
11172     //       MACC(Ra, Ra, t0, t1, t2);
11173     //     }
11174     //     iters =  (2*len-i)/2;
11175     //     assert(iters == len-j, "must be");
11176     //     for (; iters--; j++) {
11177     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11178     //       MACC(Rm, Rn, t0, t1, t2);
11179     //       Rm = *++Pm;
11180     //       Rn = *--Pn;
11181     //     }
11182     //     Pm_base[i-len] = t0;
11183     //     t0 = t1; t1 = t2; t2 = 0;
11184     //   }
11185 
11186     //   while (t0)
11187     //     t0 = sub(Pm_base, Pn_base, t0, len);
11188     // }
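
          // where MACC2, also not spelled out in this file, is approximately
          // (a sketch):
          //
          //   MACC2(A, B, t0, t1, t2):  (t2:t1:t0) += 2 * ((julong)A * B)
          //
          // i.e. the product is accumulated twice, because in a square each
          // cross term a[j]*a[i-j] (j != i-j) occurs twice (see
          // step_squaring() above).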
11189   };
11190 
11191   void generate_vector_math_stubs() {
11192     // Get native vector math stub routine addresses
11193     void* libsleef = nullptr;
11194     char ebuf[1024] = {0}; // zero-initialize: the log below may print ebuf even if dll_load() was never called
11195     char dll_name[JVM_MAXPATHLEN];
11196     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
11197       libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
11198     }
11199     if (libsleef == nullptr) {
11200       log_info(library)("Failed to load native vector math library, %s!", ebuf);
11201       return;
11202     }
11203     // Method naming convention
11204     //   All the methods are named as <OP><T><N>_<U><suffix>
11205     //   Where:
11206     //     <OP>     is the operation name, e.g. sin
11207     //     <T>      is optional to indicate float/double
11208     //              "f/d" for vector float/double operation
11209     //     <N>      is the number of elements in the vector
11210     //              "2/4" for neon, and "x" for sve
11211     //     <U>      is the precision level
11212     //              "u10/u05" represents 1.0/0.5 ULP error bounds
11213     //               We use "u10" for all operations by default,
11214     //               but fall back to "u05" for functions that do not provide u10 support
11215     //     <suffix> indicates neon/sve
11216     //              "sve/advsimd" for sve/neon implementations
11217     //     e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions
11218     //          cosd2_u10advsimd is the method for computing 2 elements vector double cos using NEON instructions
11219     //
11220     log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
11221 
11222     // Math vector stubs implemented with SVE for scalable vector size.
11223     if (UseSVE > 0) {
11224       for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
11225         int vop = VectorSupport::VECTOR_OP_MATH_START + op;
11226         // Skip "tanh" because there is a performance regression
11227         if (vop == VectorSupport::VECTOR_OP_TANH) {
11228           continue;
11229         }
11230 
11231         // The native library does not provide a u10 variant of "hypot".
11232         const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
11233 
11234         snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
11235         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
11236 
11237         snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
11238         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
11239       }
11240     }
11241 
11242     // Math vector stubs implemented with NEON for 64/128 bits vector size.
11243     for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
11244       int vop = VectorSupport::VECTOR_OP_MATH_START + op;
11245       // Skip "tanh" because there is a performance regression
11246       if (vop == VectorSupport::VECTOR_OP_TANH) {
11247         continue;
11248       }
11249 
11250       // The native library does not provide a u10 variant of "hypot".
11251       const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
11252 
11253       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
11254       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
11255 
11256       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
11257       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
11258 
11259       snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
11260       StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
11261     }
11262   }
11263 
11264   // Call here from the interpreter or compiled code to either load
11265   // multiple returned values from the inline type instance being
11266   // returned to registers or to store returned values to a newly
11267   // allocated inline type instance.
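        // In pseudo-code, approximately (a hedged sketch of the stub
        // generated below):
        //
        //   save the Java argument registers j_rarg0..7 and j_farg0..7;
        //   set_last_Java_frame(...);
        //   destination(current_thread, r0);   // runtime call
        //   reset_last_Java_frame(...);
        //   restore the saved argument registers;
        //   if (pending exception)  forward the exception;
        //   else if (has_res)       r0 = vm_result_oop;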
11268   address generate_return_value_stub(address destination, const char* name, bool has_res) {
11269     // We need to save all registers the calling convention may use so
11270     // that the runtime call can read or update those registers. This needs
11271     // to be in sync with SharedRuntime::java_return_convention().
11272     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
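          // In the layout below, each (X_off, X_2) pair is two 4-byte stack
          // slots, i.e. one saved 64-bit register.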
11273     enum layout {
11274       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
11275       j_rarg6_off, j_rarg6_2,
11276       j_rarg5_off, j_rarg5_2,
11277       j_rarg4_off, j_rarg4_2,
11278       j_rarg3_off, j_rarg3_2,
11279       j_rarg2_off, j_rarg2_2,
11280       j_rarg1_off, j_rarg1_2,
11281       j_rarg0_off, j_rarg0_2,
11282 
11283       j_farg7_off, j_farg7_2,
11284       j_farg6_off, j_farg6_2,
11285       j_farg5_off, j_farg5_2,
11286       j_farg4_off, j_farg4_2,
11287       j_farg3_off, j_farg3_2,
11288       j_farg2_off, j_farg2_2,
11289       j_farg1_off, j_farg1_2,
11290       j_farg0_off, j_farg0_2,
11291 
11292       rfp_off, rfp_off2,
11293       return_off, return_off2,
11294 
11295       framesize // inclusive of return address
11296     };
11297 
11298     CodeBuffer code(name, 512, 64);
11299     MacroAssembler* masm = new MacroAssembler(&code);
11300 
11301     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
11302     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
11303     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
11304     int frame_size_in_words = frame_size_in_bytes / wordSize;
11305 
11306     OopMapSet* oop_maps = new OopMapSet();
11307     OopMap* map = new OopMap(frame_size_in_slots, 0);
11308 
11309     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
11310     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
11311     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
11312     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
11313     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
11314     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
11315     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
11316     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
11317 
11318     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
11319     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
11320     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
11321     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
11322     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
11323     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
11324     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
11325     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
11326 
11327     address start = __ pc();
11328 
11329     __ enter(); // Save FP and LR before call
11330 
11331     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
11332     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
11333     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
11334     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
11335 
11336     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
11337     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
11338     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
11339     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
11340 
11341     int frame_complete = __ offset();
11342 
11343     // Set up last_Java_sp and last_Java_fp
11344     address the_pc = __ pc();
11345     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
11346 
11347     // Call runtime
11348     __ mov(c_rarg1, r0);
11349     __ mov(c_rarg0, rthread);
11350 
11351     __ mov(rscratch1, destination);
11352     __ blr(rscratch1);
11353 
11354     oop_maps->add_gc_map(the_pc - start, map);
11355 
11356     __ reset_last_Java_frame(false);
11357 
11358     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
11359     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
11360     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
11361     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
11362 
11363     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
11364     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
11365     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
11366     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
11367 
11368     __ leave();
11369 
11370     // check for pending exceptions
11371     Label pending;
11372     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
11373     __ cbnz(rscratch1, pending);
11374 
11375     if (has_res) {
11376       __ get_vm_result_oop(r0, rthread);
11377     }
11378 
11379     __ ret(lr);
11380 
11381     __ bind(pending);
11382     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
11383 
11384     // -------------
11385     // make sure all code is generated
11386     masm->flush();
11387 
11388     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
11389     return stub->entry_point();
11390   }
11391 
11392   // Initialization
11393   void generate_initial_stubs() {
11394     // Generate initial stubs and initialize the entry points
11395 
11396     // Entry points that exist on all platforms. Note: this is code
11397     // that could be shared among different platforms - however the
11398     // benefit seems to be smaller than the disadvantage of having a
11399     // much more complicated generator structure. See also comment in
11400     // stubRoutines.hpp.
11401 
11402     StubRoutines::_forward_exception_entry = generate_forward_exception();
11403 
11404     StubRoutines::_call_stub_entry =
11405       generate_call_stub(StubRoutines::_call_stub_return_address);
11406 
11407     // is referenced by megamorphic call
11408     StubRoutines::_catch_exception_entry = generate_catch_exception();
11409 
11410     // Initialize table for copy memory (arraycopy) check.
11411     if (UnsafeMemoryAccess::_table == nullptr) {
11412       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11413     }
11414 
11415     if (UseCRC32Intrinsics) {
11416       // Set the table address before generating the stubs that use it
11417       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
11418       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11419     }
11420 
11421     if (UseCRC32CIntrinsics) {
11422       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11423     }
11424 
11425     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11426       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11427     }
11428 
11429     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11430       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11431     }
11432 
11433     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11434         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11435       StubRoutines::_hf2f = generate_float16ToFloat();
11436       StubRoutines::_f2hf = generate_floatToFloat16();
11437     }
11438 
11439     if (InlineTypeReturnedAsFields) {
11440       StubRoutines::_load_inline_type_fields_in_regs =
11441          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
11442       StubRoutines::_store_inline_type_fields_to_buf =
11443          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
11444     }
11445 
11446   }
11447 
11448   void generate_continuation_stubs() {
11449     // Continuation stubs:
11450     StubRoutines::_cont_thaw          = generate_cont_thaw();
11451     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11452     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11453     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11454   }
11455 
11456   void generate_final_stubs() {
11457     // support for verify_oop (must happen after universe_init)
11458     if (VerifyOops) {
11459       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11460     }
11461 
11462     // arraycopy stubs used by compilers
11463     generate_arraycopy_stubs();
11464 
11465     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11466 
11467     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11468 
11469     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11470     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11471 
11472 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11473 
11474     generate_atomic_entry_points();
11475 
11476 #endif // LINUX && !__ARM_FEATURE_ATOMICS
11477 
11478 #ifdef COMPILER2
11479     if (UseSecondarySupersTable) {
11480       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11481       if (! InlineSecondarySupersTest) {
11482         generate_lookup_secondary_supers_table_stub();
11483       }
11484     }
11485 #endif
11486 
11487     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11488   }
11489 
11490   void generate_compiler_stubs() {
11491 #if COMPILER2_OR_JVMCI
11492 
11493     if (UseSVE == 0) {
11494       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
11495     }
11496 
11497     // array equals stub for large arrays.
11498     if (!UseSimpleArrayEquals) {
11499       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11500     }
11501 
11502     // arrays_hashcode stubs for large arrays.
11503     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11504     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11505     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11506     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11507     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11508 
11509     // byte_array_inflate stub for large arrays.
11510     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11511 
11512     // countPositives stub for large arrays.
11513     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11514 
11515     generate_compare_long_strings();
11516 
11517     generate_string_indexof_stubs();
11518 
11519 #ifdef COMPILER2
11520     if (UseMultiplyToLenIntrinsic) {
11521       StubRoutines::_multiplyToLen = generate_multiplyToLen();
11522     }
11523 
11524     if (UseSquareToLenIntrinsic) {
11525       StubRoutines::_squareToLen = generate_squareToLen();
11526     }
11527 
11528     if (UseMulAddIntrinsic) {
11529       StubRoutines::_mulAdd = generate_mulAdd();
11530     }
11531 
11532     if (UseSIMDForBigIntegerShiftIntrinsics) {
11533       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11534       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
11535     }
11536 
11537     if (UseMontgomeryMultiplyIntrinsic) {
11538       StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
11539       StubCodeMark mark(this, stub_id);
11540       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11541       StubRoutines::_montgomeryMultiply = g.generate_multiply();
11542     }
11543 
11544     if (UseMontgomerySquareIntrinsic) {
11545       StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
11546       StubCodeMark mark(this, stub_id);
11547       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11548       // We use generate_multiply() rather than generate_square()
11549       // because it's faster for the sizes of modulus we care about.
11550       StubRoutines::_montgomerySquare = g.generate_multiply();
11551     }
11552 
11553     generate_vector_math_stubs();
11554 
11555 #endif // COMPILER2
11556 
11557     if (UseChaCha20Intrinsics) {
11558       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11559     }
11560 
11561     if (UseKyberIntrinsics) {
11562       StubRoutines::_kyberNtt = generate_kyberNtt();
11563       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11564       StubRoutines::_kyberNttMult = generate_kyberNttMult();
11565       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11566       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11567       StubRoutines::_kyber12To16 = generate_kyber12To16();
11568       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11569     }
11570 
11571     if (UseDilithiumIntrinsics) {
11572       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11573       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11574       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11575       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11576       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11577     }
11578 
11579     if (UseBASE64Intrinsics) {
11580       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11581       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11582     }
11583 
11584     // data cache line writeback
11585     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11586     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11587 
11588     if (UseAESIntrinsics) {
11589       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11590       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11591       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11592       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11593       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11594     }
11595     if (UseGHASHIntrinsics) {
11596       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11597       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11598     }
11599     if (UseAESIntrinsics && UseGHASHIntrinsics) {
11600       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11601     }
11602 
11603     if (UseMD5Intrinsics) {
11604       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
11605       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
11606     }
11607     if (UseSHA1Intrinsics) {
11608       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
11609       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
11610     }
11611     if (UseSHA256Intrinsics) {
11612       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
11613       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
11614     }
11615     if (UseSHA512Intrinsics) {
11616       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
11617       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
11618     }
11619     if (UseSHA3Intrinsics) {
11620       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
11621       StubRoutines::_double_keccak         = generate_double_keccak();
11622       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
11623     }
11624 
11625     if (UsePoly1305Intrinsics) {
11626       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11627     }
11628 
11629     // generate Adler32 intrinsics code
11630     if (UseAdler32Intrinsics) {
11631       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11632     }
11633 
11634 #endif // COMPILER2_OR_JVMCI
11635   }
11636 
11637  public:
11638   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
11639     switch(blob_id) {
11640     case initial_id:
11641       generate_initial_stubs();
11642       break;
11643     case continuation_id:
11644       generate_continuation_stubs();
11645       break;
11646     case compiler_id:
11647       generate_compiler_stubs();
11648       break;
11649     case final_id:
11650       generate_final_stubs();
11651       break;
11652     default:
11653       fatal("unexpected blob id: %d", blob_id);
11654       break;
11655     };
11656   }
11657 }; // end class declaration
11658 
11659 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
11660   StubGenerator g(code, blob_id);
11661 }
11662 
11663 
11664 #if defined (LINUX)
11665 
11666 // Define pointers to atomic stubs and initialize them to point to the
11667 // code in atomic_aarch64.S.
11668 
11669 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
11670   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
11671     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
11672   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
11673     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
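      // For example (illustrative expansion), DEFAULT_ATOMIC_OP(fetch_add, 4, )
      // expands approximately to:
      //
      //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
      //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
      //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
      //     = aarch64_atomic_fetch_add_4_default_impl;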
11674 
11675 DEFAULT_ATOMIC_OP(fetch_add, 4, )
11676 DEFAULT_ATOMIC_OP(fetch_add, 8, )
11677 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
11678 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
11679 DEFAULT_ATOMIC_OP(xchg, 4, )
11680 DEFAULT_ATOMIC_OP(xchg, 8, )
11681 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
11682 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
11683 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
11684 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
11685 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
11686 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
11687 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
11688 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
11689 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
11690 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
11691 
11692 #undef DEFAULT_ATOMIC_OP
11693 
11694 #endif // LINUX