1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomicAccess.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
   87 #ifdef PRODUCT
   88 #define inc_counter_np(counter) ((void)0)
   89 #else
   90   void inc_counter_np_(uint& counter) {
   91     __ incrementw(ExternalAddress((address)&counter));
   92   }
   93 #define inc_counter_np(counter) \
   94   BLOCK_COMMENT("inc_counter " #counter); \
   95   inc_counter_np_(counter);
   96 #endif
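        // Usage sketch (illustrative counter): inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr)
        // emits code that increments that counter each time the stub runs; in PRODUCT
        // builds the macro expands to nothing.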
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
  112   //
  113   // we save r30 (lr) as the return PC at the base of the frame and
  114   // link r29 (fp) below it as the frame pointer installing sp (r31)
  115   // into fp.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  124   // we don't need to save r8 which C uses as an indirect result location
  125   // return register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
  130   // we don't need to save r16-r18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
  144   // -28 [ saved Floating-point Control Register ]
  145   // -26 [ saved v15            ] <--- sp_after_call
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
  173 
  174   // Call stub stack layout word offsets from fp
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
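        // The register save slots are filled in pairs with stp/stpd, so only every other
        // slot needs a named offset here (e.g. r20_off at -10 implies r19 at -9, and
        // d9_off at -20 implies v8 at -19); all offsets are in words relative to rfp.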
  199 
  200   address generate_call_stub(address& return_address) {
  201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  203            "adjust this code");
  204 
  205     StubId stub_id = StubId::stubgen_call_stub_id;
  206     StubCodeMark mark(this, stub_id);
  207     address start = __ pc();
  208 
  209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  210 
  211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  213     const Address result        (rfp, result_off         * wordSize);
  214     const Address result_type   (rfp, result_type_off    * wordSize);
  215     const Address method        (rfp, method_off         * wordSize);
  216     const Address entry_point   (rfp, entry_point_off    * wordSize);
  217     const Address parameter_size(rfp, parameter_size_off * wordSize);
  218 
  219     const Address thread        (rfp, thread_off         * wordSize);
  220 
  221     const Address d15_save      (rfp, d15_off * wordSize);
  222     const Address d13_save      (rfp, d13_off * wordSize);
  223     const Address d11_save      (rfp, d11_off * wordSize);
  224     const Address d9_save       (rfp, d9_off * wordSize);
  225 
  226     const Address r28_save      (rfp, r28_off * wordSize);
  227     const Address r26_save      (rfp, r26_off * wordSize);
  228     const Address r24_save      (rfp, r24_off * wordSize);
  229     const Address r22_save      (rfp, r22_off * wordSize);
  230     const Address r20_save      (rfp, r20_off * wordSize);
  231 
  232     // stub code
  233 
  234     address aarch64_entry = __ pc();
  235 
  236     // set up frame and move sp to end of save area
  237     __ enter();
  238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  239 
  240     // save register parameters and Java scratch/global registers
  241     // n.b. we save thread even though it gets installed in
  242     // rthread because we want to sanity check rthread later
  243     __ str(c_rarg7,  thread);
  244     __ strw(c_rarg6, parameter_size);
  245     __ stp(c_rarg4, c_rarg5,  entry_point);
  246     __ stp(c_rarg2, c_rarg3,  result_type);
  247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  248 
  249     __ stp(r20, r19,   r20_save);
  250     __ stp(r22, r21,   r22_save);
  251     __ stp(r24, r23,   r24_save);
  252     __ stp(r26, r25,   r26_save);
  253     __ stp(r28, r27,   r28_save);
  254 
  255     __ stpd(v9,  v8,   d9_save);
  256     __ stpd(v11, v10,  d11_save);
  257     __ stpd(v13, v12,  d13_save);
  258     __ stpd(v15, v14,  d15_save);
  259 
  260     __ get_fpcr(rscratch1);
  261     __ str(rscratch1, fpcr_save);
  262     // Set FPCR to the state we need. We do want Round to Nearest. We
  263     // don't want non-IEEE rounding modes or floating-point traps.
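          // FPCR field layout (AArch64): RMode = bits 23:22, FZ = bit 24, DN = bit 25;
          // the trap-enable bits IOE/DZE/OFE/UFE/IXE occupy bits 8..12.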
  264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  266     __ set_fpcr(rscratch1);
  267 
  268     // install Java thread in global register now we have saved
  269     // whatever value it held
  270     __ mov(rthread, c_rarg7);
  271     // And method
  272     __ mov(rmethod, c_rarg3);
  273 
  274     // set up the heapbase register
  275     __ reinit_heapbase();
  276 
  277 #ifdef ASSERT
  278     // make sure we have no pending exceptions
  279     {
  280       Label L;
  281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  282       __ cmp(rscratch1, (u1)NULL_WORD);
  283       __ br(Assembler::EQ, L);
  284       __ stop("StubRoutines::call_stub: entered with pending exception");
  285       __ BIND(L);
  286     }
  287 #endif
  288     // pass parameters if any
  289     __ mov(esp, sp);
  290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  291     __ andr(sp, rscratch1, -2 * wordSize);
  292 
  293     BLOCK_COMMENT("pass parameters if any");
  294     Label parameters_done;
  295     // parameter count is still in c_rarg6
  296     // and parameter pointer identifying param 1 is in c_rarg5
  297     __ cbzw(c_rarg6, parameters_done);
  298 
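          // copy the incoming arguments one word at a time from the caller-supplied
          // array (c_rarg5) onto the Java expression stack carved out above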
  299     address loop = __ pc();
  300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  301     __ subsw(c_rarg6, c_rarg6, 1);
  302     __ push(rscratch1);
  303     __ br(Assembler::GT, loop);
  304 
  305     __ BIND(parameters_done);
  306 
  307     // call Java entry -- passing the Method* and current sp
  308     //      rmethod: Method*
  309     //      r19_sender_sp: sender sp
  310     BLOCK_COMMENT("call Java function");
  311     __ mov(r19_sender_sp, sp);
  312     __ blr(c_rarg4);
  313 
  314     // we do this here because the notify will already have been done
  315     // if we get to the next instruction via an exception
  316     //
  317     // n.b. adding this instruction here affects the calculation of
  318     // whether or not a routine returns to the call stub (used when
  319     // doing stack walks) since the normal test is to check the return
  320     // pc against the address saved below. so we may need to allow for
  321     // this extra instruction in the check.
  322 
  323     // save current address for use by exception handling code
  324 
  325     return_address = __ pc();
  326 
  327     // store result depending on type (everything that is not
  328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  329     // n.b. this assumes Java returns an integral result in r0
  330     // and a floating result in j_farg0
  331     // All of j_rargN may be used to return inline type fields so be careful
  332     // not to clobber those.
  333     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
  334     // assignment of Rresult below.
  335     Register Rresult = r14, Rresult_type = r15;
  336     __ ldr(Rresult, result);
  337     Label is_long, is_float, is_double, check_prim, exit;
  338     __ ldr(Rresult_type, result_type);
  339     __ cmp(Rresult_type, (u1)T_OBJECT);
  340     __ br(Assembler::EQ, check_prim);
  341     __ cmp(Rresult_type, (u1)T_LONG);
  342     __ br(Assembler::EQ, is_long);
  343     __ cmp(Rresult_type, (u1)T_FLOAT);
  344     __ br(Assembler::EQ, is_float);
  345     __ cmp(Rresult_type, (u1)T_DOUBLE);
  346     __ br(Assembler::EQ, is_double);
  347 
  348     // handle T_INT case
  349     __ strw(r0, Address(Rresult));
  350 
  351     __ BIND(exit);
  352 
  353     // pop parameters
  354     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  355 
  356 #ifdef ASSERT
  357     // verify that threads correspond
  358     {
  359       Label L, S;
  360       __ ldr(rscratch1, thread);
  361       __ cmp(rthread, rscratch1);
  362       __ br(Assembler::NE, S);
  363       __ get_thread(rscratch1);
  364       __ cmp(rthread, rscratch1);
  365       __ br(Assembler::EQ, L);
  366       __ BIND(S);
  367       __ stop("StubRoutines::call_stub: threads must correspond");
  368       __ BIND(L);
  369     }
  370 #endif
  371 
  372     __ pop_cont_fastpath(rthread);
  373 
  374     // restore callee-save registers
  375     __ ldpd(v15, v14,  d15_save);
  376     __ ldpd(v13, v12,  d13_save);
  377     __ ldpd(v11, v10,  d11_save);
  378     __ ldpd(v9,  v8,   d9_save);
  379 
  380     __ ldp(r28, r27,   r28_save);
  381     __ ldp(r26, r25,   r26_save);
  382     __ ldp(r24, r23,   r24_save);
  383     __ ldp(r22, r21,   r22_save);
  384     __ ldp(r20, r19,   r20_save);
  385 
  386     // restore fpcr
  387     __ ldr(rscratch1,  fpcr_save);
  388     __ set_fpcr(rscratch1);
  389 
  390     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  391     __ ldrw(c_rarg2, result_type);
  392     __ ldr(c_rarg3,  method);
  393     __ ldp(c_rarg4, c_rarg5,  entry_point);
  394     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  395 
  396     // leave frame and return to caller
  397     __ leave();
  398     __ ret(lr);
  399 
  400     // handle return types different from T_INT
  401     __ BIND(check_prim);
  402     if (InlineTypeReturnedAsFields) {
  403       // Check for scalarized return value
  404       __ tbz(r0, 0, is_long);
  405       // Load pack handler address
  406       __ andr(rscratch1, r0, -2);
  407       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
  408       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
  409       __ blr(rscratch1);
  410       __ b(exit);
  411     }
  412 
  413     __ BIND(is_long);
  414     __ str(r0, Address(Rresult, 0));
  415     __ br(Assembler::AL, exit);
  416 
  417     __ BIND(is_float);
  418     __ strs(j_farg0, Address(Rresult, 0));
  419     __ br(Assembler::AL, exit);
  420 
  421     __ BIND(is_double);
  422     __ strd(j_farg0, Address(Rresult, 0));
  423     __ br(Assembler::AL, exit);
  424 
  425     return start;
  426   }
  427 
  428   // Return point for a Java call if there's an exception thrown in
  429   // Java code.  The exception is caught and transformed into a
  430   // pending exception stored in JavaThread that can be tested from
  431   // within the VM.
  432   //
  433   // Note: Usually the parameters are removed by the callee. In case
  434   // of an exception crossing an activation frame boundary, that is
  435   // not the case if the callee is compiled code => need to set up the
  436   // sp.
  437   //
  438   // r0: exception oop
  439 
  440   address generate_catch_exception() {
  441     StubId stub_id = StubId::stubgen_catch_exception_id;
  442     StubCodeMark mark(this, stub_id);
  443     address start = __ pc();
  444 
  445     // same as in generate_call_stub():
  446     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  447     const Address thread        (rfp, thread_off         * wordSize);
  448 
  449 #ifdef ASSERT
  450     // verify that threads correspond
  451     {
  452       Label L, S;
  453       __ ldr(rscratch1, thread);
  454       __ cmp(rthread, rscratch1);
  455       __ br(Assembler::NE, S);
  456       __ get_thread(rscratch1);
  457       __ cmp(rthread, rscratch1);
  458       __ br(Assembler::EQ, L);
  459       __ bind(S);
  460       __ stop("StubRoutines::catch_exception: threads must correspond");
  461       __ bind(L);
  462     }
  463 #endif
  464 
  465     // set pending exception
  466     __ verify_oop(r0);
  467 
  468     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  469     __ mov(rscratch1, (address)__FILE__);
  470     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  471     __ movw(rscratch1, (int)__LINE__);
  472     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  473 
  474     // complete return to VM
  475     assert(StubRoutines::_call_stub_return_address != nullptr,
  476            "_call_stub_return_address must have been generated before");
  477     __ b(StubRoutines::_call_stub_return_address);
  478 
  479     return start;
  480   }
  481 
  482   // Continuation point for runtime calls returning with a pending
  483   // exception.  The pending exception check happened in the runtime
  484   // or native call stub.  The pending exception in Thread is
  485   // converted into a Java-level exception.
  486   //
  487   // Contract with Java-level exception handlers:
  488   // r0: exception
  489   // r3: throwing pc
  490   //
  491   // NOTE: At entry of this stub, exception-pc must be in LR !!
  492 
  493   // NOTE: this is always used as a jump target within generated code
  494   // so it just needs to be generated code with no prolog
  495 
  496   address generate_forward_exception() {
  497     StubId stub_id = StubId::stubgen_forward_exception_id;
  498     StubCodeMark mark(this, stub_id);
  499     address start = __ pc();
  500 
  501     // Upon entry, LR points to the return address returning into
  502     // Java (interpreted or compiled) code; i.e., the return address
  503     // becomes the throwing pc.
  504     //
  505     // Arguments pushed before the runtime call are still on the stack
  506     // but the exception handler will reset the stack pointer ->
  507     // ignore them.  A potential result in registers can be ignored as
  508     // well.
  509 
  510 #ifdef ASSERT
  511     // make sure this code is only executed if there is a pending exception
  512     {
  513       Label L;
  514       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  515       __ cbnz(rscratch1, L);
  516       __ stop("StubRoutines::forward exception: no pending exception (1)");
  517       __ bind(L);
  518     }
  519 #endif
  520 
  521     // compute exception handler into r19
  522 
  523     // call the VM to find the handler address associated with the
  524     // caller address. pass thread in r0 and caller pc (ret address)
  525     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  526     // the stack.
  527     __ mov(c_rarg1, lr);
  528     // lr will be trashed by the VM call so we move it to R19
  529     // (callee-saved) because we also need to pass it to the handler
  530     // returned by this call.
  531     __ mov(r19, lr);
  532     BLOCK_COMMENT("call exception_handler_for_return_address");
  533     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  534                          SharedRuntime::exception_handler_for_return_address),
  535                     rthread, c_rarg1);
  536     // Reinitialize the ptrue predicate register, in case the external runtime
  537     // call clobbers ptrue reg, as we may return to SVE compiled code.
  538     __ reinitialize_ptrue();
  539 
  540     // we should not really care that lr is no longer the callee
  541     // address. we saved the value the handler needs in r19 so we can
  542   // just copy it to r3. however, the C2 handler pushes its own
  543   // frame and then calls into the VM and the VM code asserts that
  544     // the PC for the frame above the handler belongs to a compiled
  545     // Java method. So, we restore lr here to satisfy that assert.
  546     __ mov(lr, r19);
  547     // setup r0 & r3 & clear pending exception
  548     __ mov(r3, r19);
  549     __ mov(r19, r0);
  550     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  551     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  552 
  553 #ifdef ASSERT
  554     // make sure exception is set
  555     {
  556       Label L;
  557       __ cbnz(r0, L);
  558       __ stop("StubRoutines::forward exception: no pending exception (2)");
  559       __ bind(L);
  560     }
  561 #endif
  562 
  563     // continue at exception handler
  564     // r0: exception
  565     // r3: throwing pc
  566     // r19: exception handler
  567     __ verify_oop(r0);
  568     __ br(r19);
  569 
  570     return start;
  571   }
  572 
  573   // Non-destructive plausibility checks for oops
  574   //
  575   // Arguments:
  576   //    r0: oop to verify
  577   //    rscratch1: error message
  578   //
  579   // Stack after saving c_rarg3:
  580   //    [tos + 0]: saved c_rarg3
  581   //    [tos + 1]: saved c_rarg2
  582   //    [tos + 2]: saved lr
  583   //    [tos + 3]: saved rscratch2
  584   //    [tos + 4]: saved r0
  585   //    [tos + 5]: saved rscratch1
  586   address generate_verify_oop() {
  587     StubId stub_id = StubId::stubgen_verify_oop_id;
  588     StubCodeMark mark(this, stub_id);
  589     address start = __ pc();
  590 
  591     Label exit, error;
  592 
  593     // save c_rarg2 and c_rarg3
  594     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  595 
  596     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  597     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  598     __ ldr(c_rarg3, Address(c_rarg2));
  599     __ add(c_rarg3, c_rarg3, 1);
  600     __ str(c_rarg3, Address(c_rarg2));
  601 
  602     // object is in r0
  603     // make sure object is 'reasonable'
  604     __ cbz(r0, exit); // if obj is null it is OK
  605 
  606     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  607     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  608 
  609     // return if everything seems ok
  610     __ bind(exit);
  611 
  612     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  613     __ ret(lr);
  614 
  615     // handle errors
  616     __ bind(error);
  617     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  618 
  619     __ push(RegSet::range(r0, r29), sp);
  620     // debug(char* msg, int64_t pc, int64_t regs[])
  621     __ mov(c_rarg0, rscratch1);      // pass address of error message
  622     __ mov(c_rarg1, lr);             // pass return address
  623     __ mov(c_rarg2, sp);             // pass address of regs on stack
  624 #ifndef PRODUCT
  625     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  626 #endif
  627     BLOCK_COMMENT("call MacroAssembler::debug");
  628     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  629     __ blr(rscratch1);
  630     __ hlt(0);
  631 
  632     return start;
  633   }
  634 
  635   // Generate indices for iota vector.
  636   address generate_iota_indices(StubId stub_id) {
  637     __ align(CodeEntryAlignment);
  638     StubCodeMark mark(this, stub_id);
  639     address start = __ pc();
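          // Each pair of emit_data64 values below forms one 128-bit constant whose lanes
          // hold ascending indices (0, 1, 2, ...) for the element size named in the comment;
          // the final two rows encode the indices as float (0.0f..3.0f) and double (0.0, 1.0).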
  640     // B
  641     __ emit_data64(0x0706050403020100, relocInfo::none);
  642     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  643     // H
  644     __ emit_data64(0x0003000200010000, relocInfo::none);
  645     __ emit_data64(0x0007000600050004, relocInfo::none);
  646     // S
  647     __ emit_data64(0x0000000100000000, relocInfo::none);
  648     __ emit_data64(0x0000000300000002, relocInfo::none);
  649     // D
  650     __ emit_data64(0x0000000000000000, relocInfo::none);
  651     __ emit_data64(0x0000000000000001, relocInfo::none);
  652     // S - FP
  653     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  654     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  655     // D - FP
  656     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  657     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  658     return start;
  659   }
  660 
  661   // The inner part of zero_words().  This is the bulk operation,
  662   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  663   // caller is responsible for zeroing the last few words.
  664   //
  665   // Inputs:
  666   // r10: the HeapWord-aligned base address of an array to zero.
  667   // r11: the count in HeapWords, r11 > 0.
  668   //
  669   // Returns r10 and r11, adjusted for the caller to clear.
  670   // r10: the base address of the tail of words left to clear.
  671   // r11: the number of words in the tail.
  672   //      r11 < MacroAssembler::zero_words_block_size.
  673 
  674   address generate_zero_blocks() {
  675     Label done;
  676     Label base_aligned;
  677 
  678     Register base = r10, cnt = r11;
  679 
  680     __ align(CodeEntryAlignment);
  681     StubId stub_id = StubId::stubgen_zero_blocks_id;
  682     StubCodeMark mark(this, stub_id);
  683     address start = __ pc();
  684 
  685     if (UseBlockZeroing) {
  686       int zva_length = VM_Version::zva_length();
  687 
  688       // Ensure ZVA length can be divided by 16. This is required by
  689       // the subsequent operations.
  690       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  691 
  692       __ tbz(base, 3, base_aligned);
  693       __ str(zr, Address(__ post(base, 8)));
  694       __ sub(cnt, cnt, 1);
  695       __ bind(base_aligned);
  696 
  697       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  698       // alignment.
  699       Label small;
  700       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
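            // cnt is in words while low_limit is in bytes, hence the >> 3 (bytes to words)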
  701       __ subs(rscratch1, cnt, low_limit >> 3);
  702       __ br(Assembler::LT, small);
  703       __ zero_dcache_blocks(base, cnt);
  704       __ bind(small);
  705     }
  706 
  707     {
  708       // Number of stp instructions we'll unroll
  709       const int unroll =
  710         MacroAssembler::zero_words_block_size / 2;
  711       // Clear the remaining blocks.
  712       Label loop;
  713       __ subs(cnt, cnt, unroll * 2);
  714       __ br(Assembler::LT, done);
  715       __ bind(loop);
  716       for (int i = 0; i < unroll; i++)
  717         __ stp(zr, zr, __ post(base, 16));
  718       __ subs(cnt, cnt, unroll * 2);
  719       __ br(Assembler::GE, loop);
  720       __ bind(done);
  721       __ add(cnt, cnt, unroll * 2);
  722     }
  723 
  724     __ ret(lr);
  725 
  726     return start;
  727   }
  728 
  729 
  730   typedef enum {
  731     copy_forwards = 1,
  732     copy_backwards = -1
  733   } copy_direction;
  734 
  735   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  736   // for arraycopy stubs.
  737   class ArrayCopyBarrierSetHelper : StackObj {
  738     BarrierSetAssembler* _bs_asm;
  739     MacroAssembler* _masm;
  740     DecoratorSet _decorators;
  741     BasicType _type;
  742     Register _gct1;
  743     Register _gct2;
  744     Register _gct3;
  745     FloatRegister _gcvt1;
  746     FloatRegister _gcvt2;
  747     FloatRegister _gcvt3;
  748 
  749   public:
  750     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  751                               DecoratorSet decorators,
  752                               BasicType type,
  753                               Register gct1,
  754                               Register gct2,
  755                               Register gct3,
  756                               FloatRegister gcvt1,
  757                               FloatRegister gcvt2,
  758                               FloatRegister gcvt3)
  759       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  760         _masm(masm),
  761         _decorators(decorators),
  762         _type(type),
  763         _gct1(gct1),
  764         _gct2(gct2),
  765         _gct3(gct3),
  766         _gcvt1(gcvt1),
  767         _gcvt2(gcvt2),
  768         _gcvt3(gcvt3) {
  769     }
  770 
  771     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  772       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  773                             dst1, dst2, src,
  774                             _gct1, _gct2, _gcvt1);
  775     }
  776 
  777     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  778       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  779                              dst, src1, src2,
  780                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  781     }
  782 
  783     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  784       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  785                             dst1, dst2, src,
  786                             _gct1);
  787     }
  788 
  789     void copy_store_at_16(Address dst, Register src1, Register src2) {
  790       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  791                              dst, src1, src2,
  792                              _gct1, _gct2, _gct3);
  793     }
  794 
  795     void copy_load_at_8(Register dst, Address src) {
  796       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  797                             dst, noreg, src,
  798                             _gct1);
  799     }
  800 
  801     void copy_store_at_8(Address dst, Register src) {
  802       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  803                              dst, src, noreg,
  804                              _gct1, _gct2, _gct3);
  805     }
  806   };
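        // A typical use appears in generate_copy_longs below: a stub constructs one helper
        // with the temporary registers the GC barrier may clobber and then issues all of
        // its loads and stores through copy_load_at_*/copy_store_at_* instead of raw
        // ldp/stp, so the active barrier set can interpose (e.g. for oop copies).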
  807 
  808   // Bulk copy of blocks of 8 words.
  809   //
  810   // count is a count of words.
  811   //
  812   // Precondition: count >= 8
  813   //
  814   // Postconditions:
  815   //
  816   // The least significant bit of count contains the remaining count
  817   // of words to copy.  The rest of count is trash.
  818   //
  819   // s and d are adjusted to point to the remaining words to copy
  820   //
  821   address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
  822     BasicType type;
  823     copy_direction direction;
  824 
  825     switch (stub_id) {
  826     case StubId::stubgen_copy_byte_f_id:
  827       direction = copy_forwards;
  828       type = T_BYTE;
  829       break;
  830     case StubId::stubgen_copy_byte_b_id:
  831       direction = copy_backwards;
  832       type = T_BYTE;
  833       break;
  834     case StubId::stubgen_copy_oop_f_id:
  835       direction = copy_forwards;
  836       type = T_OBJECT;
  837       break;
  838     case StubId::stubgen_copy_oop_b_id:
  839       direction = copy_backwards;
  840       type = T_OBJECT;
  841       break;
  842     case StubId::stubgen_copy_oop_uninit_f_id:
  843       direction = copy_forwards;
  844       type = T_OBJECT;
  845       break;
  846     case StubId::stubgen_copy_oop_uninit_b_id:
  847       direction = copy_backwards;
  848       type = T_OBJECT;
  849       break;
  850     default:
  851       ShouldNotReachHere();
  852     }
  853 
  854     int unit = wordSize * direction;
  855     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
  856 
  857     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  858       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  859     const Register stride = r14;
  860     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  861     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  862     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  863 
  864     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  865     assert_different_registers(s, d, count, rscratch1, rscratch2);
  866 
  867     Label again, drain;
  868 
  869     __ align(CodeEntryAlignment);
  870 
  871     StubCodeMark mark(this, stub_id);
  872 
  873     address start = __ pc();
  874 
  875     Label unaligned_copy_long;
  876     if (AvoidUnalignedAccesses) {
  877       __ tbnz(d, 3, unaligned_copy_long);
  878     }
  879 
  880     if (direction == copy_forwards) {
  881       __ sub(s, s, bias);
  882       __ sub(d, d, bias);
  883     }
  884 
  885 #ifdef ASSERT
  886     // Make sure we are never given < 8 words
  887     {
  888       Label L;
  889       __ cmp(count, (u1)8);
  890       __ br(Assembler::GE, L);
  891       __ stop("generate_copy_longs called with < 8 words");
  892       __ bind(L);
  893     }
  894 #endif
  895 
  896     // Fill 8 registers
  897     if (UseSIMDForMemoryOps) {
  898       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  899       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  900     } else {
  901       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  902       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  903       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  904       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  905     }
  906 
  907     __ subs(count, count, 16);
  908     __ br(Assembler::LO, drain);
  909 
  910     int prefetch = PrefetchCopyIntervalInBytes;
  911     bool use_stride = false;
  912     if (direction == copy_backwards) {
  913       use_stride = prefetch > 256;
  914       prefetch = -prefetch;
  915       if (use_stride) __ mov(stride, prefetch);
  916     }
  917 
  918     __ bind(again);
  919 
  920     if (PrefetchCopyIntervalInBytes > 0)
  921       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  922 
  923     if (UseSIMDForMemoryOps) {
  924       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  925       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  926       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  927       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  928     } else {
  929       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  930       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  931       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  932       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  933       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  934       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  937     }
  938 
  939     __ subs(count, count, 8);
  940     __ br(Assembler::HS, again);
  941 
  942     // Drain
  943     __ bind(drain);
  944     if (UseSIMDForMemoryOps) {
  945       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  946       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  947     } else {
  948       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  949       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  950       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  951       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  952     }
  953 
  954     {
  955       Label L1, L2;
  956       __ tbz(count, exact_log2(4), L1);
  957       if (UseSIMDForMemoryOps) {
  958         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  959         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  960       } else {
  961         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  962         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  963         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  964         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  965       }
  966       __ bind(L1);
  967 
  968       if (direction == copy_forwards) {
  969         __ add(s, s, bias);
  970         __ add(d, d, bias);
  971       }
  972 
  973       __ tbz(count, 1, L2);
  974       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  975       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  976       __ bind(L2);
  977     }
  978 
  979     __ ret(lr);
  980 
  981     if (AvoidUnalignedAccesses) {
  982       Label drain, again;
  983       // Register order for storing. Order is different for backward copy.
  984 
  985       __ bind(unaligned_copy_long);
  986 
  987       // source address is even aligned, target odd aligned
  988       //
  989       // when forward copying word pairs we read long pairs at offsets
  990       // {0, 2, 4, 6} (in long words). when backwards copying we read
  991       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  992       // address by -2 in the forwards case so we can compute the
  993       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  994       // or -1.
  995       //
  996       // when forward copying we need to store 1 word, 3 pairs and
  997       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
  998       // zero offset we adjust the destination by -1, which means we
  999       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 1000       //
 1001       // When backwards copying we need to store 1 word, 3 pairs and
 1002       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 1003       // offsets {1, 3, 5, 7, 8} * unit.
 1004 
 1005       if (direction == copy_forwards) {
 1006         __ sub(s, s, 16);
 1007         __ sub(d, d, 8);
 1008       }
 1009 
 1010       // Fill 8 registers
 1011       //
 1012       // for forwards copy s was offset by -16 from the original input
 1013       // value of s so the register contents are at these offsets
 1014       // relative to the 64 bit block addressed by that original input
 1015       // and so on for each successive 64 byte block when s is updated
 1016       //
 1017       // t0 at offset 0,  t1 at offset 8
 1018       // t2 at offset 16, t3 at offset 24
 1019       // t4 at offset 32, t5 at offset 40
 1020       // t6 at offset 48, t7 at offset 56
 1021 
 1022       // for backwards copy s was not offset so the register contents
 1023       // are at these offsets into the preceding 64 byte block
 1024       // relative to that original input and so on for each successive
 1025       // preceding 64 byte block when s is updated. this explains the
 1026       // slightly counter-intuitive looking pattern of register usage
 1027       // in the stp instructions for backwards copy.
 1028       //
 1029       // t0 at offset -16, t1 at offset -8
 1030       // t2 at offset -32, t3 at offset -24
 1031       // t4 at offset -48, t5 at offset -40
 1032       // t6 at offset -64, t7 at offset -56
 1033 
 1034       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1035       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1036       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1037       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1038 
 1039       __ subs(count, count, 16);
 1040       __ br(Assembler::LO, drain);
 1041 
 1042       int prefetch = PrefetchCopyIntervalInBytes;
 1043       bool use_stride = false;
 1044       if (direction == copy_backwards) {
 1045         use_stride = prefetch > 256;
 1046         prefetch = -prefetch;
 1047         if (use_stride) __ mov(stride, prefetch);
 1048       }
 1049 
 1050       __ bind(again);
 1051 
 1052       if (PrefetchCopyIntervalInBytes > 0)
 1053         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1054 
 1055       if (direction == copy_forwards) {
 1056         // allowing for the offset of -8 the store instructions place
 1057         // registers into the target 64 bit block at the following
 1058         // offsets
 1059         //
 1060         // t0 at offset 0
 1061         // t1 at offset 8,  t2 at offset 16
 1062         // t3 at offset 24, t4 at offset 32
 1063         // t5 at offset 40, t6 at offset 48
 1064         // t7 at offset 56
 1065 
 1066         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1067         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1068         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1069         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1070         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1071         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1072         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1073         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1074         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1075       } else {
 1076         // d was not offset when we started so the registers are
 1077         // written into the 64 bit block preceding d with the following
 1078         // offsets
 1079         //
 1080         // t1 at offset -8
 1081         // t3 at offset -24, t0 at offset -16
 1082         // t5 at offset -40, t2 at offset -32
 1083         // t7 at offset -56, t4 at offset -48
 1084         //                   t6 at offset -64
 1085         //
 1086         // note that this matches the offsets previously noted for the
 1087         // loads
 1088 
 1089         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1090         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1091         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1092         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1093         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1094         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1095         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1096         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1097         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1098       }
 1099 
 1100       __ subs(count, count, 8);
 1101       __ br(Assembler::HS, again);
 1102 
 1103       // Drain
 1104       //
 1105       // this uses the same pattern of offsets and register arguments
 1106       // as above
 1107       __ bind(drain);
 1108       if (direction == copy_forwards) {
 1109         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1110         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1111         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1112         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1113         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1114       } else {
 1115         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1116         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1117         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1118         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1119         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1120       }
 1121       // now we need to copy any remaining part block which may
 1122       // include a 4 word subblock and/or a 2 word subblock.
 1123       // bits 2 and 1 in the count are the tell-tale for whether we
 1124       // have each such subblock
 1125       {
 1126         Label L1, L2;
 1127         __ tbz(count, exact_log2(4), L1);
 1128         // this is the same as above but copying only 4 longs hence
 1129         // with only one intervening stp between the str instructions
 1130         // but note that the offsets and registers still follow the
 1131         // same pattern
 1132         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1133         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1134         if (direction == copy_forwards) {
 1135           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1136           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1137           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1141           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1142         }
 1143         __ bind(L1);
 1144 
 1145         __ tbz(count, 1, L2);
 1146         // this is the same as above but copying only 2 longs hence
 1147         // there is no intervening stp between the str instructions
 1148         // but note that the offset and register patterns are still
 1149         // the same
 1150         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1151         if (direction == copy_forwards) {
 1152           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1153           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1154         } else {
 1155           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1156           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1157         }
 1158         __ bind(L2);
 1159 
 1160         // for forwards copy we need to re-adjust the offsets we
 1161         // applied so that s and d follow the last words written
 1162 
 1163         if (direction == copy_forwards) {
 1164           __ add(s, s, 16);
 1165           __ add(d, d, 8);
 1166         }
 1167 
 1168       }
 1169 
 1170       __ ret(lr);
 1171     }
 1172 
 1173     return start;
 1174   }
 1175 
 1176   // Small copy: less than 16 bytes.
 1177   //
 1178   // NB: Ignores all of the bits of count which represent more than 15
 1179   // bytes, so a caller doesn't have to mask them.
 1180 
 1181   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1182     bool is_backwards = step < 0;
 1183     size_t granularity = g_uabs(step);
 1184     int direction = is_backwards ? -1 : 1;
 1185 
 1186     Label Lword, Lint, Lshort, Lbyte;
 1187 
 1188     assert(granularity
 1189            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1190 
 1191     const Register t0 = r3;
 1192     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1193     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1194 
 1195     // ??? I don't know if this bit-test-and-branch is the right thing
 1196     // to do.  It does a lot of jumping, resulting in several
 1197     // mispredicted branches.  It might make more sense to do this
 1198     // with something like Duff's device with a single computed branch.
 1199 
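          // each tbz below tests one bit of the (granularity-scaled) count, so at most
          // one word, one int, one short and one byte remain to be copied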
 1200     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1201     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1202     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1203     __ bind(Lword);
 1204 
 1205     if (granularity <= sizeof (jint)) {
 1206       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1207       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1208       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1209       __ bind(Lint);
 1210     }
 1211 
 1212     if (granularity <= sizeof (jshort)) {
 1213       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1214       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1215       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1216       __ bind(Lshort);
 1217     }
 1218 
 1219     if (granularity <= sizeof (jbyte)) {
 1220       __ tbz(count, 0, Lbyte);
 1221       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1222       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1223       __ bind(Lbyte);
 1224     }
 1225   }
 1226 
 1227   // All-singing all-dancing memory copy.
 1228   //
 1229   // Copy count units of memory from s to d.  The size of a unit is
 1230   // step, which can be positive or negative depending on the direction
 1231   // of copy.  If is_aligned is false, we align the source address.
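        // step is a signed byte count: its magnitude is the element size (the copy
        // granularity) and its sign gives the direction, e.g. -4 would request a
        // backwards jint copy.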
 1232   //
 1233 
 1234   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1235                    Register s, Register d, Register count, int step) {
 1236     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1237     bool is_backwards = step < 0;
 1238     unsigned int granularity = g_uabs(step);
 1239     const Register t0 = r3, t1 = r4;
 1240 
 1241     // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
 1242     // load all the data before writing anything
 1243     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1244     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1245     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1246     const Register send = r17, dend = r16;
 1247     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1248     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1249     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1250 
 1251     if (PrefetchCopyIntervalInBytes > 0)
 1252       __ prfm(Address(s, 0), PLDL1KEEP);
 1253     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1254     __ br(Assembler::HI, copy_big);
 1255 
 1256     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1257     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1258 
 1259     __ cmp(count, u1(16/granularity));
 1260     __ br(Assembler::LS, copy16);
 1261 
 1262     __ cmp(count, u1(64/granularity));
 1263     __ br(Assembler::HI, copy80);
 1264 
 1265     __ cmp(count, u1(32/granularity));
 1266     __ br(Assembler::LS, copy32);
 1267 
 1268     // 33..64 bytes
 1269     if (UseSIMDForMemoryOps) {
 1270       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1271       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1272       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1273       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1274     } else {
 1275       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1276       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1277       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1278       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1279 
 1280       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1281       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1282       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1283       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1284     }
 1285     __ b(finish);
 1286 
 1287     // 17..32 bytes
 1288     __ bind(copy32);
 1289     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1290     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1291 
 1292     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1293     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1294     __ b(finish);
 1295 
 1296     // 65..80/96 bytes
 1297     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1298     __ bind(copy80);
 1299     if (UseSIMDForMemoryOps) {
 1300       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1301       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1302       // Unaligned pointers can be an issue for copying.
 1303       // The issue has more chances to happen when granularity of data is
 1304       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
 1305       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1306       // The most performance drop has been seen for the range 65-80 bytes.
 1307       // For such cases using the pair of ldp/stp instead of the third pair of
 1308       // ldpq/stpq fixes the performance issue.
 1309       if (granularity < sizeof (jint)) {
 1310         Label copy96;
 1311         __ cmp(count, u1(80/granularity));
 1312         __ br(Assembler::HI, copy96);
 1313         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1314 
 1315         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1316         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1317 
 1318         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1319         __ b(finish);
 1320 
 1321         __ bind(copy96);
 1322       }
 1323       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1324 
 1325       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1326       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1327 
 1328       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1329     } else {
 1330       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1331       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1332       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1333       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1334       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1335 
 1336       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1337       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1338       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1339       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1340       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1341     }
 1342     __ b(finish);
 1343 
 1344     // 0..16 bytes
 1345     __ bind(copy16);
 1346     __ cmp(count, u1(8/granularity));
 1347     __ br(Assembler::LO, copy8);
 1348 
 1349     // 8..16 bytes
 1350     bs.copy_load_at_8(t0, Address(s, 0));
 1351     bs.copy_load_at_8(t1, Address(send, -8));
 1352     bs.copy_store_at_8(Address(d, 0), t0);
 1353     bs.copy_store_at_8(Address(dend, -8), t1);
 1354     __ b(finish);
 1355 
 1356     if (granularity < 8) {
 1357       // 4..7 bytes
 1358       __ bind(copy8);
 1359       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1360       __ ldrw(t0, Address(s, 0));
 1361       __ ldrw(t1, Address(send, -4));
 1362       __ strw(t0, Address(d, 0));
 1363       __ strw(t1, Address(dend, -4));
 1364       __ b(finish);
 1365       if (granularity < 4) {
 1366         // 0..3 bytes
 1367         __ bind(copy4);
 1368         __ cbz(count, finish); // get rid of 0 case
 1369         if (granularity == 2) {
 1370           __ ldrh(t0, Address(s, 0));
 1371           __ strh(t0, Address(d, 0));
 1372         } else { // granularity == 1
 1373           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1374           // the first and last byte.
 1375           // Handle the 3 byte case by loading and storing base + count/2
 1376           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1377           // This does mean in the 1 byte case we load/store the same
 1378           // byte 3 times.
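                // e.g. count == 3: bytes s+0, s+2 (== send-1) and s+1 (== s + count/2)
                // between them cover all three positions.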
 1379           __ lsr(count, count, 1);
 1380           __ ldrb(t0, Address(s, 0));
 1381           __ ldrb(t1, Address(send, -1));
 1382           __ ldrb(t2, Address(s, count));
 1383           __ strb(t0, Address(d, 0));
 1384           __ strb(t1, Address(dend, -1));
 1385           __ strb(t2, Address(d, count));
 1386         }
 1387         __ b(finish);
 1388       }
 1389     }
 1390 
 1391     __ bind(copy_big);
 1392     if (is_backwards) {
 1393       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1394       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1395     }
 1396 
 1397     // Now we've got the small case out of the way we can align the
 1398     // source address on a 2-word boundary.
 1399 
 1400     // Here we materialize a count in r15, which is used by copy_memory_small
 1401     // and the various generate_copy_longs stubs that we use for 2-word-aligned copies.
 1402     // Up until here we have used t9, which aliases r15, but from here on that
 1403     // register cannot be used as a temp register, as it contains the count.
 1404 
 1405     Label aligned;
 1406 
 1407     if (is_aligned) {
 1408       // We may have to adjust by 1 word to get s 2-word-aligned.
 1409       __ tbz(s, exact_log2(wordSize), aligned);
 1410       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1411       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1412       __ sub(count, count, wordSize/granularity);
 1413     } else {
 1414       if (is_backwards) {
 1415         __ andr(r15, s, 2 * wordSize - 1);
 1416       } else {
 1417         __ neg(r15, s);
 1418         __ andr(r15, r15, 2 * wordSize - 1);
 1419       }
 1420       // r15 is the byte adjustment needed to align s.
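            // e.g. copying forwards with s == 0x1009: neg(s) & 15 == 7, so 7 bytes'
            // worth of elements are copied by copy_memory_small below, leaving s
            // 16-byte aligned at 0x1010.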
 1421       __ cbz(r15, aligned);
 1422       int shift = exact_log2(granularity);
 1423       if (shift > 0) {
 1424         __ lsr(r15, r15, shift);
 1425       }
 1426       __ sub(count, count, r15);
 1427 
 1428 #if 0
 1429       // ?? This code is only correct for a disjoint copy.  It may or
 1430       // may not make sense to use it in that case.
 1431 
 1432       // Copy the first pair; s and d may not be aligned.
 1433       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1434       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1435 
 1436       // Align s and d, adjust count
 1437       if (is_backwards) {
 1438         __ sub(s, s, r15);
 1439         __ sub(d, d, r15);
 1440       } else {
 1441         __ add(s, s, r15);
 1442         __ add(d, d, r15);
 1443       }
 1444 #else
 1445       copy_memory_small(decorators, type, s, d, r15, step);
 1446 #endif
 1447     }
 1448 
 1449     __ bind(aligned);
 1450 
 1451     // s is now 2-word-aligned.
 1452 
 1453     // We have a count of units and some trailing bytes. Adjust the
 1454     // count and do a bulk copy of words. If the shift is zero
 1455     // perform a move instead to benefit from zero latency moves.
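          // e.g. for a jshort copy (granularity == 2) shift == log2(8/2) == 2, so
          // r15 == count / 4 is the number of 8-byte words given to the bulk copy stub.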
 1456     int shift = exact_log2(wordSize/granularity);
 1457     if (shift > 0) {
 1458       __ lsr(r15, count, shift);
 1459     } else {
 1460       __ mov(r15, count);
 1461     }
 1462     if (direction == copy_forwards) {
 1463       if (type != T_OBJECT) {
 1464         __ bl(StubRoutines::aarch64::copy_byte_f());
 1465       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1466         __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
 1467       } else {
 1468         __ bl(StubRoutines::aarch64::copy_oop_f());
 1469       }
 1470     } else {
 1471       if (type != T_OBJECT) {
 1472         __ bl(StubRoutines::aarch64::copy_byte_b());
 1473       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1474         __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
 1475       } else {
 1476         __ bl(StubRoutines::aarch64::copy_oop_b());
 1477       }
 1478     }
 1479 
 1480     // And the tail.
 1481     copy_memory_small(decorators, type, s, d, count, step);
 1482 
 1483     if (granularity >= 8) __ bind(copy8);
 1484     if (granularity >= 4) __ bind(copy4);
 1485     __ bind(finish);
 1486   }
 1487 
 1488 
 1489   void clobber_registers() {
 1490 #ifdef ASSERT
 1491     RegSet clobbered
 1492       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1493     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1494     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1495     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1496       __ mov(*it, rscratch1);
 1497     }
 1498 #endif
 1499 
 1500   }
 1501 
 1502   // Scan over array at a for count oops, verifying each one.
 1503   // Preserves a and count; clobbers temp, rscratch1 and rscratch2.
 1504   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1505     Label loop, end;
 1506     __ mov(rscratch1, a);
 1507     __ mov(rscratch2, zr);
 1508     __ bind(loop);
 1509     __ cmp(rscratch2, count);
 1510     __ br(Assembler::HS, end);
 1511     if (size == wordSize) {
 1512       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1513       __ verify_oop(temp);
 1514     } else {
 1515       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1516       __ decode_heap_oop(temp); // calls verify_oop
 1517     }
 1518     __ add(rscratch2, rscratch2, 1);
 1519     __ b(loop);
 1520     __ bind(end);
 1521   }
 1522 
 1523   // Arguments:
 1524   //   stub_id - is used to name the stub and identify all details of
 1525   //             how to perform the copy.
 1526   //
 1527   //   nopush_entry - is assigned to the stub's post push entry point
 1528   //                  unless it is null
 1529   //
 1530   // Inputs:
 1531   //   c_rarg0   - source array address
 1532   //   c_rarg1   - destination array address
 1533   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1534   //
 1535   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1536   // the hardware handle it.  The two dwords within qwords that span
 1537   // cache line boundaries will still be loaded and stored atomically.
 1538   //
 1539   // Side Effects: nopush_entry is set to the (post push) entry point
 1540   //               so it can be used by the corresponding conjoint
 1541   //               copy method
 1542   //
 1543   address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
 1544     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1545     RegSet saved_reg = RegSet::of(s, d, count);
 1546     int size;
 1547     bool aligned;
 1548     bool is_oop;
 1549     bool dest_uninitialized;
 1550     switch (stub_id) {
 1551     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1552       size = sizeof(jbyte);
 1553       aligned = false;
 1554       is_oop = false;
 1555       dest_uninitialized = false;
 1556       break;
 1557     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1558       size = sizeof(jbyte);
 1559       aligned = true;
 1560       is_oop = false;
 1561       dest_uninitialized = false;
 1562       break;
 1563     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1564       size = sizeof(jshort);
 1565       aligned = false;
 1566       is_oop = false;
 1567       dest_uninitialized = false;
 1568       break;
 1569     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1570       size = sizeof(jshort);
 1571       aligned = true;
 1572       is_oop = false;
 1573       dest_uninitialized = false;
 1574       break;
 1575     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1576       size = sizeof(jint);
 1577       aligned = false;
 1578       is_oop = false;
 1579       dest_uninitialized = false;
 1580       break;
 1581     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1582       size = sizeof(jint);
 1583       aligned = true;
 1584       is_oop = false;
 1585       dest_uninitialized = false;
 1586       break;
 1587     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1588       // since this is always aligned we can (should!) use the same
 1589       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1590       ShouldNotReachHere();
 1591       break;
 1592     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1593       size = sizeof(jlong);
 1594       aligned = true;
 1595       is_oop = false;
 1596       dest_uninitialized = false;
 1597       break;
 1598     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1599       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1600       aligned = !UseCompressedOops;
 1601       is_oop = true;
 1602       dest_uninitialized = false;
 1603       break;
 1604     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1605       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1606       aligned = !UseCompressedOops;
 1607       is_oop = true;
 1608       dest_uninitialized = false;
 1609       break;
 1610     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1611       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1612       aligned = !UseCompressedOops;
 1613       is_oop = true;
 1614       dest_uninitialized = true;
 1615       break;
 1616     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1617       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1618       aligned = !UseCompressedOops;
 1619       is_oop = true;
 1620       dest_uninitialized = true;
 1621       break;
 1622     default:
 1623       ShouldNotReachHere();
 1624       break;
 1625     }
 1626 
 1627     __ align(CodeEntryAlignment);
 1628     StubCodeMark mark(this, stub_id);
 1629     address start = __ pc();
 1630     __ enter();
 1631 
 1632     if (nopush_entry != nullptr) {
 1633       *nopush_entry = __ pc();
 1634       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1635       BLOCK_COMMENT("Entry:");
 1636     }
 1637 
 1638     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1639     if (dest_uninitialized) {
 1640       decorators |= IS_DEST_UNINITIALIZED;
 1641     }
 1642     if (aligned) {
 1643       decorators |= ARRAYCOPY_ALIGNED;
 1644     }
 1645 
 1646     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1647     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1648 
 1649     if (is_oop) {
 1650       // save regs before copy_memory
 1651       __ push(RegSet::of(d, count), sp);
 1652     }
 1653     {
 1654       // UnsafeMemoryAccess page error: continue after unsafe access
 1655       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1656       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1657       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1658     }
 1659 
 1660     if (is_oop) {
 1661       __ pop(RegSet::of(d, count), sp);
 1662       if (VerifyOops)
 1663         verify_oop_array(size, d, count, r16);
 1664     }
 1665 
 1666     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1667 
 1668     __ leave();
 1669     __ mov(r0, zr); // return 0
 1670     __ ret(lr);
 1671     return start;
 1672   }
 1673 
 1674   // Arguments:
 1675   //   stub_id - is used to name the stub and identify all details of
 1676   //             how to perform the copy.
 1677   //
 1678   //   nooverlap_target - identifies the (post push) entry for the
 1679   //             corresponding disjoint copy routine which can be
 1680   //             jumped to if the ranges do not actually overlap
 1681   //
 1682   //   nopush_entry - is assigned to the stub's post push entry point
 1683   //                  unless it is null
 1684   //
 1685   //
 1686   // Inputs:
 1687   //   c_rarg0   - source array address
 1688   //   c_rarg1   - destination array address
 1689   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1690   //
 1691   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1692   // the hardware handle it.  The two dwords within qwords that span
 1693   // cache line boundaries will still be loaded and stored atomically.
 1694   //
 1695   // Side Effects:
 1696   //   nopush_entry is set to the no-overlap entry point so it can be
 1697   //   used by some other conjoint copy method
 1698   //
 1699   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
 1700     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1701     RegSet saved_regs = RegSet::of(s, d, count);
 1702     int size;
 1703     bool aligned;
 1704     bool is_oop;
 1705     bool dest_uninitialized;
 1706     switch (stub_id) {
 1707     case StubId::stubgen_jbyte_arraycopy_id:
 1708       size = sizeof(jbyte);
 1709       aligned = false;
 1710       is_oop = false;
 1711       dest_uninitialized = false;
 1712       break;
 1713     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 1714       size = sizeof(jbyte);
 1715       aligned = true;
 1716       is_oop = false;
 1717       dest_uninitialized = false;
 1718       break;
 1719     case StubId::stubgen_jshort_arraycopy_id:
 1720       size = sizeof(jshort);
 1721       aligned = false;
 1722       is_oop = false;
 1723       dest_uninitialized = false;
 1724       break;
 1725     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 1726       size = sizeof(jshort);
 1727       aligned = true;
 1728       is_oop = false;
 1729       dest_uninitialized = false;
 1730       break;
 1731     case StubId::stubgen_jint_arraycopy_id:
 1732       size = sizeof(jint);
 1733       aligned = false;
 1734       is_oop = false;
 1735       dest_uninitialized = false;
 1736       break;
 1737     case StubId::stubgen_arrayof_jint_arraycopy_id:
 1738       size = sizeof(jint);
 1739       aligned = true;
 1740       is_oop = false;
 1741       dest_uninitialized = false;
 1742       break;
 1743     case StubId::stubgen_jlong_arraycopy_id:
 1744       // since this is always aligned we can (should!) use the same
 1745       // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
 1746       ShouldNotReachHere();
 1747       break;
 1748     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 1749       size = sizeof(jlong);
 1750       aligned = true;
 1751       is_oop = false;
 1752       dest_uninitialized = false;
 1753       break;
 1754     case StubId::stubgen_oop_arraycopy_id:
 1755       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1756       aligned = !UseCompressedOops;
 1757       is_oop = true;
 1758       dest_uninitialized = false;
 1759       break;
 1760     case StubId::stubgen_arrayof_oop_arraycopy_id:
 1761       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1762       aligned = !UseCompressedOops;
 1763       is_oop = true;
 1764       dest_uninitialized = false;
 1765       break;
 1766     case StubId::stubgen_oop_arraycopy_uninit_id:
 1767       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1768       aligned = !UseCompressedOops;
 1769       is_oop = true;
 1770       dest_uninitialized = true;
 1771       break;
 1772     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 1773       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1774       aligned = !UseCompressedOops;
 1775       is_oop = true;
 1776       dest_uninitialized = true;
 1777       break;
 1778     default:
 1779       ShouldNotReachHere();
 1780     }
 1781 
 1782     StubCodeMark mark(this, stub_id);
 1783     address start = __ pc();
 1784     __ enter();
 1785 
 1786     if (nopush_entry != nullptr) {
 1787       *nopush_entry = __ pc();
 1788       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1789       BLOCK_COMMENT("Entry:");
 1790     }
 1791 
 1792     // use fwd copy when (d-s) above_equal (count*size)
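          // e.g. s == 0x1000, d == 0x1010, count*size == 0x20: the unsigned
          // difference d - s == 0x10 is below 0x20, so the regions overlap and the
          // backward copy below is used. When d - s >= count*size (which includes
          // d < s, where the difference wraps to a huge unsigned value) we branch
          // to the disjoint (forward) stub instead.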
 1793     Label L_overlapping;
 1794     __ sub(rscratch1, d, s);
 1795     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1796     __ br(Assembler::LO, L_overlapping);
 1797     __ b(RuntimeAddress(nooverlap_target));
 1798     __ bind(L_overlapping);
 1799 
 1800     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1801     if (dest_uninitialized) {
 1802       decorators |= IS_DEST_UNINITIALIZED;
 1803     }
 1804     if (aligned) {
 1805       decorators |= ARRAYCOPY_ALIGNED;
 1806     }
 1807 
 1808     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1809     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1810 
 1811     if (is_oop) {
 1812       // save regs before copy_memory
 1813       __ push(RegSet::of(d, count), sp);
 1814     }
 1815     {
 1816       // UnsafeMemoryAccess page error: continue after unsafe access
 1817       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1818       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1819       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1820     }
 1821     if (is_oop) {
 1822       __ pop(RegSet::of(d, count), sp);
 1823       if (VerifyOops)
 1824         verify_oop_array(size, d, count, r16);
 1825     }
 1826     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1827     __ leave();
 1828     __ mov(r0, zr); // return 0
 1829     __ ret(lr);
 1830     return start;
 1831   }
 1832 
 1833   // Helper for generating a dynamic type check.
 1834   // Smashes rscratch1, rscratch2.
 1835   void generate_type_check(Register sub_klass,
 1836                            Register super_check_offset,
 1837                            Register super_klass,
 1838                            Register temp1,
 1839                            Register temp2,
 1840                            Register result,
 1841                            Label& L_success) {
 1842     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1843 
 1844     BLOCK_COMMENT("type_check:");
 1845 
 1846     Label L_miss;
 1847 
 1848     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1849                                      super_check_offset);
 1850     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1851 
 1852     // Fall through on failure!
 1853     __ BIND(L_miss);
 1854   }
 1855 
 1856   //
 1857   //  Generate checkcasting array copy stub
 1858   //
 1859   //  Input:
 1860   //    c_rarg0   - source array address
 1861   //    c_rarg1   - destination array address
 1862   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1863   //    c_rarg3   - size_t ckoff (super_check_offset)
 1864   //    c_rarg4   - oop ckval (super_klass)
 1865   //
 1866   //  Output:
 1867   //    r0 ==  0  -  success
 1868   //    r0 == -1^K - failure, where K is partial transfer count
 1869   //
 1870   address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
 1871     bool dest_uninitialized;
 1872     switch (stub_id) {
 1873     case StubId::stubgen_checkcast_arraycopy_id:
 1874       dest_uninitialized = false;
 1875       break;
 1876     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 1877       dest_uninitialized = true;
 1878       break;
 1879     default:
 1880       ShouldNotReachHere();
 1881     }
 1882 
 1883     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1884 
 1885     // Input registers (after setup_arg_regs)
 1886     const Register from        = c_rarg0;   // source array address
 1887     const Register to          = c_rarg1;   // destination array address
 1888     const Register count       = c_rarg2;   // elements count
 1889     const Register ckoff       = c_rarg3;   // super_check_offset
 1890     const Register ckval       = c_rarg4;   // super_klass
 1891 
 1892     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1893 
 1894     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1895     const Register copied_oop  = r22;       // actual oop copied
 1896     const Register count_save  = r21;       // orig elements count
 1897     const Register start_to    = r20;       // destination array start address
 1898     const Register r19_klass   = r19;       // oop._klass
 1899 
 1900     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1901     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1902 
 1903     //---------------------------------------------------------------
 1904     // Assembler stub will be used for this call to arraycopy
 1905     // if the two arrays are subtypes of Object[] but the
 1906     // destination array type is not equal to or a supertype
 1907     // of the source type.  Each element must be separately
 1908     // checked.
 1909 
 1910     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1911                                copied_oop, r19_klass, count_save);
 1912 
 1913     __ align(CodeEntryAlignment);
 1914     StubCodeMark mark(this, stub_id);
 1915     address start = __ pc();
 1916 
 1917     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1918 
 1919 #ifdef ASSERT
 1920     // caller guarantees that the arrays really are different
 1921     // otherwise, we would have to make conjoint checks
 1922     { Label L;
 1923       __ b(L);                  // conjoint check not yet implemented
 1924       __ stop("checkcast_copy within a single array");
 1925       __ bind(L);
 1926     }
 1927 #endif //ASSERT
 1928 
 1929     // Caller of this entry point must set up the argument registers.
 1930     if (nopush_entry != nullptr) {
 1931       *nopush_entry = __ pc();
 1932       BLOCK_COMMENT("Entry:");
 1933     }
 1934 
 1935      // Empty array:  Nothing to do.
 1936     __ cbz(count, L_done);
 1937     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1938 
 1939 #ifdef ASSERT
 1940     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1941     // The ckoff and ckval must be mutually consistent,
 1942     // even though caller generates both.
 1943     { Label L;
 1944       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1945       __ ldrw(start_to, Address(ckval, sco_offset));
 1946       __ cmpw(ckoff, start_to);
 1947       __ br(Assembler::EQ, L);
 1948       __ stop("super_check_offset inconsistent");
 1949       __ bind(L);
 1950     }
 1951 #endif //ASSERT
 1952 
 1953     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1954     bool is_oop = true;
 1955     int element_size = UseCompressedOops ? 4 : 8;
 1956     if (dest_uninitialized) {
 1957       decorators |= IS_DEST_UNINITIALIZED;
 1958     }
 1959 
 1960     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1961     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1962 
 1963     // save the original count
 1964     __ mov(count_save, count);
 1965 
 1966     // Copy from low to high addresses
 1967     __ mov(start_to, to);              // Save destination array start address
 1968     __ b(L_load_element);
 1969 
 1970     // ======== begin loop ========
 1971     // (Loop is rotated; its entry is L_load_element.)
 1972     // Loop control:
 1973     //   for (; count != 0; count--) {
 1974     //     copied_oop = load_heap_oop(from++);
 1975     //     ... generate_type_check ...;
 1976     //     store_heap_oop(to++, copied_oop);
 1977     //   }
 1978     __ align(OptoLoopAlignment);
 1979 
 1980     __ BIND(L_store_element);
 1981     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1982                       __ post(to, element_size), copied_oop, noreg,
 1983                       gct1, gct2, gct3);
 1984     __ sub(count, count, 1);
 1985     __ cbz(count, L_do_card_marks);
 1986 
 1987     // ======== loop entry is here ========
 1988     __ BIND(L_load_element);
 1989     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1990                      copied_oop, noreg, __ post(from, element_size),
 1991                      gct1);
 1992     __ cbz(copied_oop, L_store_element);
 1993 
 1994     __ load_klass(r19_klass, copied_oop);// query the object klass
 1995 
 1996     BLOCK_COMMENT("type_check:");
 1997     generate_type_check(/*sub_klass*/r19_klass,
 1998                         /*super_check_offset*/ckoff,
 1999                         /*super_klass*/ckval,
 2000                         /*r_array_base*/gct1,
 2001                         /*temp2*/gct2,
 2002                         /*result*/r10, L_store_element);
 2003 
 2004     // Fall through on failure!
 2005 
 2006     // ======== end loop ========
 2007 
 2008     // It was a real error; we must depend on the caller to finish the job.
 2009     // Register count = remaining oops, count_orig = total oops.
 2010     // Emit GC store barriers for the oops we have copied and report
 2011     // their number to the caller.
 2012 
 2013     __ subs(count, count_save, count);     // K = partially copied oop count
 2014     __ eon(count, count, zr);              // report (-1^K) to caller
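          // e.g. if K == 3 oops were copied before the failing element, count is
          // now ~3 == -4 and the caller recovers K by inverting the value. The EQ
          // branch below (using the flags from the subs above) skips the card
          // marks when nothing was copied.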
 2015     __ br(Assembler::EQ, L_done_pop);
 2016 
 2017     __ BIND(L_do_card_marks);
 2018     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
 2019 
 2020     __ bind(L_done_pop);
 2021     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2022     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2023 
 2024     __ bind(L_done);
 2025     __ mov(r0, count);
 2026     __ leave();
 2027     __ ret(lr);
 2028 
 2029     return start;
 2030   }
 2031 
 2032   // Perform range checks on the proposed arraycopy.
 2033   // Kills temp, but nothing else.
 2034   // Also, clean the sign bits of src_pos and dst_pos.
 2035   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2036                               Register src_pos, // source position (c_rarg1)
 2037                               Register dst,     // destination array oop (c_rarg2)
 2038                               Register dst_pos, // destination position (c_rarg3)
 2039                               Register length,
 2040                               Register temp,
 2041                               Label& L_failed) {
 2042     BLOCK_COMMENT("arraycopy_range_checks:");
 2043 
 2044     assert_different_registers(rscratch1, temp);
 2045 
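          // Note: in this file every caller has already rejected negative
          // src_pos, dst_pos and length (see the sign-bit tbnz checks in
          // generate_generic_copy), so the 32-bit adds below cannot wrap and the
          // unsigned (HI) comparisons are safe.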
 2046     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2047     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2048     __ addw(temp, length, src_pos);
 2049     __ cmpw(temp, rscratch1);
 2050     __ br(Assembler::HI, L_failed);
 2051 
 2052     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2053     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2054     __ addw(temp, length, dst_pos);
 2055     __ cmpw(temp, rscratch1);
 2056     __ br(Assembler::HI, L_failed);
 2057 
 2058     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2059     __ movw(src_pos, src_pos);
 2060     __ movw(dst_pos, dst_pos);
 2061 
 2062     BLOCK_COMMENT("arraycopy_range_checks done");
 2063   }
 2064 
 2065   // These stubs get called from some dumb test routine.
 2066   // I'll write them properly when they're called from
 2067   // something that's actually doing something.
 2068   static void fake_arraycopy_stub(address src, address dst, int count) {
 2069     assert(count == 0, "huh?");
 2070   }
 2071 
 2072 
 2073   //
 2074   //  Generate 'unsafe' array copy stub
 2075   //  Though just as safe as the other stubs, it takes an unscaled
 2076   //  size_t argument instead of an element count.
 2077   //
 2078   //  Input:
 2079   //    c_rarg0   - source array address
 2080   //    c_rarg1   - destination array address
 2081   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2082   //
 2083   // Examines the alignment of the operands and dispatches
 2084   // to a long, int, short, or byte copy loop.
 2085   //
 2086   address generate_unsafe_copy(address byte_copy_entry,
 2087                                address short_copy_entry,
 2088                                address int_copy_entry,
 2089                                address long_copy_entry) {
 2090     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2091 
 2092     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2093     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2094 
 2095     __ align(CodeEntryAlignment);
 2096     StubCodeMark mark(this, stub_id);
 2097     address start = __ pc();
 2098     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2099 
 2100     // bump this on entry, not on exit:
 2101     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2102 
 2103     __ orr(rscratch1, s, d);
 2104     __ orr(rscratch1, rscratch1, count);
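          // rscratch1 now holds s | d | count, so its low bits give the alignment
          // common to all three operands.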
 2105 
 2106     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2107     __ cbz(rscratch1, L_long_aligned);
 2108     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2109     __ cbz(rscratch1, L_int_aligned);
 2110     __ tbz(rscratch1, 0, L_short_aligned);
 2111     __ b(RuntimeAddress(byte_copy_entry));
 2112 
 2113     __ BIND(L_short_aligned);
 2114     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2115     __ b(RuntimeAddress(short_copy_entry));
 2116     __ BIND(L_int_aligned);
 2117     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2118     __ b(RuntimeAddress(int_copy_entry));
 2119     __ BIND(L_long_aligned);
 2120     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2121     __ b(RuntimeAddress(long_copy_entry));
 2122 
 2123     return start;
 2124   }
 2125 
 2126   //
 2127   //  Generate generic array copy stubs
 2128   //
 2129   //  Input:
 2130   //    c_rarg0    -  src oop
 2131   //    c_rarg1    -  src_pos (32-bits)
 2132   //    c_rarg2    -  dst oop
 2133   //    c_rarg3    -  dst_pos (32-bits)
 2134   //    c_rarg4    -  element count (32-bits)
 2135   //
 2136   //  Output:
 2137   //    r0 ==  0  -  success
 2138   //    r0 == -1^K - failure, where K is partial transfer count
 2139   //
 2140   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2141                                 address int_copy_entry, address oop_copy_entry,
 2142                                 address long_copy_entry, address checkcast_copy_entry) {
 2143     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2144 
 2145     Label L_failed, L_objArray;
 2146     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2147 
 2148     // Input registers
 2149     const Register src        = c_rarg0;  // source array oop
 2150     const Register src_pos    = c_rarg1;  // source position
 2151     const Register dst        = c_rarg2;  // destination array oop
 2152     const Register dst_pos    = c_rarg3;  // destination position
 2153     const Register length     = c_rarg4;  // element count (32-bits)
 2154 
 2155 
 2156     // Registers used as temps
 2157     const Register dst_klass  = c_rarg5;
 2158 
 2159     __ align(CodeEntryAlignment);
 2160 
 2161     StubCodeMark mark(this, stub_id);
 2162 
 2163     address start = __ pc();
 2164 
 2165     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2166 
 2167     // bump this on entry, not on exit:
 2168     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2169 
 2170     //-----------------------------------------------------------------------
 2171     // Assembler stub will be used for this call to arraycopy
 2172     // if the following conditions are met:
 2173     //
 2174     // (1) src and dst must not be null.
 2175     // (2) src_pos must not be negative.
 2176     // (3) dst_pos must not be negative.
 2177     // (4) length  must not be negative.
 2178     // (5) src klass and dst klass should be the same and not null.
 2179     // (6) src and dst should be arrays.
 2180     // (7) src_pos + length must not exceed length of src.
 2181     // (8) dst_pos + length must not exceed length of dst.
 2182     //
 2183 
 2184     //  if (src == nullptr) return -1;
 2185     __ cbz(src, L_failed);
 2186 
 2187     //  if (src_pos < 0) return -1;
 2188     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2189 
 2190     //  if (dst == nullptr) return -1;
 2191     __ cbz(dst, L_failed);
 2192 
 2193     //  if (dst_pos < 0) return -1;
 2194     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2195 
 2196     // registers used as temp
 2197     const Register scratch_length    = r16; // elements count to copy
 2198     const Register scratch_src_klass = r17; // array klass
 2199     const Register lh                = r15; // layout helper
 2200 
 2201     //  if (length < 0) return -1;
 2202     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2203     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2204 
 2205     __ load_klass(scratch_src_klass, src);
 2206 #ifdef ASSERT
 2207     //  assert(src->klass() != nullptr);
 2208     {
 2209       BLOCK_COMMENT("assert klasses not null {");
 2210       Label L1, L2;
 2211       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2212       __ bind(L1);
 2213       __ stop("broken null klass");
 2214       __ bind(L2);
 2215       __ load_klass(rscratch1, dst);
 2216       __ cbz(rscratch1, L1);     // this would be broken also
 2217       BLOCK_COMMENT("} assert klasses not null done");
 2218     }
 2219 #endif
 2220 
 2221     // Load layout helper (32-bits)
 2222     //
 2223     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2224     // 32        30    24            16              8     2                 0
 2225     //
 2226     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2227     //
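          // For example, a jint[] has array_tag 0x3 (which sets bit 31, so lh is
          // negative for any array - the is_Array test below relies on this) and
          // log2_element_size 2, while every objArray shares the single
          // objArray_lh value tested just below.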
 2228 
 2229     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2230 
 2231     // Handle objArrays completely differently...
 2232     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2233     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2234     __ movw(rscratch1, objArray_lh);
 2235     __ eorw(rscratch2, lh, rscratch1);
 2236     __ cbzw(rscratch2, L_objArray);
 2237 
 2238     //  if (src->klass() != dst->klass()) return -1;
 2239     __ load_klass(rscratch2, dst);
 2240     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2241     __ cbnz(rscratch2, L_failed);
 2242 
 2243     // Check for flat inline type array -> return -1
 2244     __ test_flat_array_oop(src, rscratch2, L_failed);
 2245 
 2246     // Check for null-free (non-flat) inline type array -> handle as object array
 2247     __ test_null_free_array_oop(src, rscratch2, L_objArray);
 2248 
 2249     //  if (!src->is_Array()) return -1;
 2250     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2251 
 2252     // At this point, it is known to be a typeArray (array_tag 0x3).
 2253 #ifdef ASSERT
 2254     {
 2255       BLOCK_COMMENT("assert primitive array {");
 2256       Label L;
 2257       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2258       __ cmpw(lh, rscratch2);
 2259       __ br(Assembler::GE, L);
 2260       __ stop("must be a primitive array");
 2261       __ bind(L);
 2262       BLOCK_COMMENT("} assert primitive array done");
 2263     }
 2264 #endif
 2265 
 2266     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2267                            rscratch2, L_failed);
 2268 
 2269     // TypeArrayKlass
 2270     //
 2271     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2272     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2273     //
 2274 
 2275     const Register rscratch1_offset = rscratch1;    // array offset
 2276     const Register r15_elsize = lh; // element size
 2277 
 2278     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2279            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2280     __ add(src, src, rscratch1_offset);           // src array offset
 2281     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2282     BLOCK_COMMENT("choose copy loop based on element size");
 2283 
 2284     // next registers should be set before the jump to corresponding stub
 2285     const Register from     = c_rarg0;  // source array address
 2286     const Register to       = c_rarg1;  // destination array address
 2287     const Register count    = c_rarg2;  // elements count
 2288 
 2289     // The 'from', 'to' and 'count' registers must be set in this order
 2290     // since they alias 'src', 'src_pos' and 'dst' respectively.
 2291 
 2292     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2293 
 2294     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2295     // size in bytes).  We do a simple bitwise binary search.
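          // bit 1 of elsize separates {byte, short} from {int, long}; bit 0 then
          // picks within each pair.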
 2296   __ BIND(L_copy_bytes);
 2297     __ tbnz(r15_elsize, 1, L_copy_ints);
 2298     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2299     __ lea(from, Address(src, src_pos));// src_addr
 2300     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2301     __ movw(count, scratch_length); // length
 2302     __ b(RuntimeAddress(byte_copy_entry));
 2303 
 2304   __ BIND(L_copy_shorts);
 2305     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2306     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2307     __ movw(count, scratch_length); // length
 2308     __ b(RuntimeAddress(short_copy_entry));
 2309 
 2310   __ BIND(L_copy_ints);
 2311     __ tbnz(r15_elsize, 0, L_copy_longs);
 2312     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2313     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2314     __ movw(count, scratch_length); // length
 2315     __ b(RuntimeAddress(int_copy_entry));
 2316 
 2317   __ BIND(L_copy_longs);
 2318 #ifdef ASSERT
 2319     {
 2320       BLOCK_COMMENT("assert long copy {");
 2321       Label L;
 2322       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2323       __ cmpw(r15_elsize, LogBytesPerLong);
 2324       __ br(Assembler::EQ, L);
 2325       __ stop("must be long copy, but elsize is wrong");
 2326       __ bind(L);
 2327       BLOCK_COMMENT("} assert long copy done");
 2328     }
 2329 #endif
 2330     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2331     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2332     __ movw(count, scratch_length); // length
 2333     __ b(RuntimeAddress(long_copy_entry));
 2334 
 2335     // ObjArrayKlass
 2336   __ BIND(L_objArray);
 2337     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2338 
 2339     Label L_plain_copy, L_checkcast_copy;
 2340     //  test array classes for subtyping
 2341     __ load_klass(r15, dst);
 2342     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2343     __ br(Assembler::NE, L_checkcast_copy);
 2344 
 2345     // Identically typed arrays can be copied without element-wise checks.
 2346     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2347                            rscratch2, L_failed);
 2348 
 2349     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2350     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2351     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2352     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2353     __ movw(count, scratch_length); // length
 2354   __ BIND(L_plain_copy);
 2355     __ b(RuntimeAddress(oop_copy_entry));
 2356 
 2357   __ BIND(L_checkcast_copy);
 2358     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2359     {
 2360       // Before looking at dst.length, make sure dst is also an objArray.
 2361       __ ldrw(rscratch1, Address(r15, lh_offset));
 2362       __ movw(rscratch2, objArray_lh);
 2363       __ eorw(rscratch1, rscratch1, rscratch2);
 2364       __ cbnzw(rscratch1, L_failed);
 2365 
 2366       // It is safe to examine both src.length and dst.length.
 2367       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2368                              r15, L_failed);
 2369 
 2370       __ load_klass(dst_klass, dst); // reload
 2371 
 2372       // Marshal the base address arguments now, freeing registers.
 2373       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2374       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2375       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2376       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2377       __ movw(count, length);           // length (reloaded)
 2378       Register sco_temp = c_rarg3;      // this register is free now
 2379       assert_different_registers(from, to, count, sco_temp,
 2380                                  dst_klass, scratch_src_klass);
 2381       // assert_clean_int(count, sco_temp);
 2382 
 2383       // Generate the type check.
 2384       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2385       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2386 
 2387       // Smashes rscratch1, rscratch2
 2388       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2389                           L_plain_copy);
 2390 
 2391       // Fetch destination element klass from the ObjArrayKlass header.
 2392       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2393       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2394       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2395 
 2396       // the checkcast_copy loop needs two extra arguments:
 2397       assert(c_rarg3 == sco_temp, "#3 already in place");
 2398       // Set up arguments for checkcast_copy_entry.
 2399       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2400       __ b(RuntimeAddress(checkcast_copy_entry));
 2401     }
 2402 
 2403   __ BIND(L_failed);
 2404     __ mov(r0, -1);
 2405     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2406     __ ret(lr);
 2407 
 2408     return start;
 2409   }
 2410 
 2411   //
 2412   // Generate stub for array fill. If "aligned" is true, the
 2413   // "to" address is assumed to be heapword aligned.
 2414   //
 2415   // Arguments for generated stub:
 2416   //   to:    c_rarg0
 2417   //   value: c_rarg1
 2418   //   count: c_rarg2 treated as signed
 2419   //
 2420   address generate_fill(StubId stub_id) {
 2421     BasicType t;
 2422     bool aligned;
 2423 
 2424     switch (stub_id) {
 2425     case StubId::stubgen_jbyte_fill_id:
 2426       t = T_BYTE;
 2427       aligned = false;
 2428       break;
 2429     case StubId::stubgen_jshort_fill_id:
 2430       t = T_SHORT;
 2431       aligned = false;
 2432       break;
 2433     case StubId::stubgen_jint_fill_id:
 2434       t = T_INT;
 2435       aligned = false;
 2436       break;
 2437     case StubId::stubgen_arrayof_jbyte_fill_id:
 2438       t = T_BYTE;
 2439       aligned = true;
 2440       break;
 2441     case StubId::stubgen_arrayof_jshort_fill_id:
 2442       t = T_SHORT;
 2443       aligned = true;
 2444       break;
 2445     case StubId::stubgen_arrayof_jint_fill_id:
 2446       t = T_INT;
 2447       aligned = true;
 2448       break;
 2449     default:
 2450       ShouldNotReachHere();
 2451     };
 2452 
 2453     __ align(CodeEntryAlignment);
 2454     StubCodeMark mark(this, stub_id);
 2455     address start = __ pc();
 2456 
 2457     BLOCK_COMMENT("Entry:");
 2458 
 2459     const Register to        = c_rarg0;  // destination array address
 2460     const Register value     = c_rarg1;  // value
 2461     const Register count     = c_rarg2;  // elements count
 2462 
 2463     const Register bz_base = r10;        // base for block_zero routine
 2464     const Register cnt_words = r11;      // temp register
 2465 
 2466     __ enter();
 2467 
 2468     Label L_fill_elements, L_exit1;
 2469 
 2470     int shift = -1;
 2471     switch (t) {
 2472       case T_BYTE:
 2473         shift = 0;
 2474         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2475         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2476         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
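              // e.g. a fill byte of 0x5A has now been widened to 0x5A5A5A5A; the
              // bfi into bits 32..63 in the large-chunk path below completes the
              // 64-bit pattern.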
 2477         __ br(Assembler::LO, L_fill_elements);
 2478         break;
 2479       case T_SHORT:
 2480         shift = 1;
 2481         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2482         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2483         __ br(Assembler::LO, L_fill_elements);
 2484         break;
 2485       case T_INT:
 2486         shift = 2;
 2487         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2488         __ br(Assembler::LO, L_fill_elements);
 2489         break;
 2490       default: ShouldNotReachHere();
 2491     }
 2492 
 2493     // Align source address at 8 bytes address boundary.
 2494     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2495     if (!aligned) {
 2496       switch (t) {
 2497         case T_BYTE:
 2498           // One byte misalignment happens only for byte arrays.
 2499           __ tbz(to, 0, L_skip_align1);
 2500           __ strb(value, Address(__ post(to, 1)));
 2501           __ subw(count, count, 1);
 2502           __ bind(L_skip_align1);
 2503           // Fallthrough
 2504         case T_SHORT:
 2505           // Two bytes misalignment happens only for byte and short (char) arrays.
 2506           __ tbz(to, 1, L_skip_align2);
 2507           __ strh(value, Address(__ post(to, 2)));
 2508           __ subw(count, count, 2 >> shift);
 2509           __ bind(L_skip_align2);
 2510           // Fallthrough
 2511         case T_INT:
 2512           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2513           __ tbz(to, 2, L_skip_align4);
 2514           __ strw(value, Address(__ post(to, 4)));
 2515           __ subw(count, count, 4 >> shift);
 2516           __ bind(L_skip_align4);
 2517           break;
 2518         default: ShouldNotReachHere();
 2519       }
 2520     }
 2521 
 2522     //
 2523     //  Fill large chunks
 2524     //
 2525     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2526     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2527     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2528     if (UseBlockZeroing) {
 2529       Label non_block_zeroing, rest;
 2530       // If the fill value is zero we can use the fast zero_words().
 2531       __ cbnz(value, non_block_zeroing);
 2532       __ mov(bz_base, to);
 2533       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2534       address tpc = __ zero_words(bz_base, cnt_words);
 2535       if (tpc == nullptr) {
 2536         fatal("CodeCache is full at generate_fill");
 2537       }
 2538       __ b(rest);
 2539       __ bind(non_block_zeroing);
 2540       __ fill_words(to, cnt_words, value);
 2541       __ bind(rest);
 2542     } else {
 2543       __ fill_words(to, cnt_words, value);
 2544     }
 2545 
 2546     // Remaining count is less than 8 bytes. Fill it by a single store.
 2547     // Note that the total length is no less than 8 bytes.
 2548     if (t == T_BYTE || t == T_SHORT) {
 2549       Label L_exit1;
 2550       __ cbzw(count, L_exit1);
 2551       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2552       __ str(value, Address(to, -8));    // overwrite some elements
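            // The store is placed so that it ends exactly at the end of the region;
            // it may re-write up to 7 bytes that were already filled above, which is
            // harmless because the same value is stored.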
 2553       __ bind(L_exit1);
 2554       __ leave();
 2555       __ ret(lr);
 2556     }
 2557 
 2558     // Handle fills of less than 8 bytes.
 2559     Label L_fill_2, L_fill_4, L_exit2;
 2560     __ bind(L_fill_elements);
 2561     switch (t) {
 2562       case T_BYTE:
 2563         __ tbz(count, 0, L_fill_2);
 2564         __ strb(value, Address(__ post(to, 1)));
 2565         __ bind(L_fill_2);
 2566         __ tbz(count, 1, L_fill_4);
 2567         __ strh(value, Address(__ post(to, 2)));
 2568         __ bind(L_fill_4);
 2569         __ tbz(count, 2, L_exit2);
 2570         __ strw(value, Address(to));
 2571         break;
 2572       case T_SHORT:
 2573         __ tbz(count, 0, L_fill_4);
 2574         __ strh(value, Address(__ post(to, 2)));
 2575         __ bind(L_fill_4);
 2576         __ tbz(count, 1, L_exit2);
 2577         __ strw(value, Address(to));
 2578         break;
 2579       case T_INT:
 2580         __ cbzw(count, L_exit2);
 2581         __ strw(value, Address(to));
 2582         break;
 2583       default: ShouldNotReachHere();
 2584     }
 2585     __ bind(L_exit2);
 2586     __ leave();
 2587     __ ret(lr);
 2588     return start;
 2589   }
 2590 
 2591   address generate_unsafecopy_common_error_exit() {
 2592     address start_pc = __ pc();
 2593     __ leave();
 2594     __ mov(r0, 0);
 2595     __ ret(lr);
 2596     return start_pc;
 2597   }
 2598 
 2599   //
 2600   //  Generate 'unsafe' set memory stub
 2601   //  Though just as safe as the other stubs, it takes an unscaled
 2602   //  size_t (# bytes) argument instead of an element count.
 2603   //
 2604   //  This fill operation is atomicity preserving: as long as the
 2605   //  address supplied is sufficiently aligned, all writes of up to 64
 2606   //  bits in size are single-copy atomic.
 2607   //
 2608   //  Input:
 2609   //    c_rarg0   - destination array address
 2610   //    c_rarg1   - byte count (size_t)
 2611   //    c_rarg2   - byte value
 2612   //
 2613   address generate_unsafe_setmemory() {
 2614     __ align(CodeEntryAlignment);
 2615     StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
 2616     address start = __ pc();
 2617 
 2618     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 2619     Label tail;
 2620 
 2621     UnsafeMemoryAccessMark umam(this, true, false);
 2622 
 2623     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2624 
 2625     __ dup(v0, __ T16B, value);
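          // splat the fill byte into all 16 byte lanes of v0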
 2626 
 2627     if (AvoidUnalignedAccesses) {
 2628       __ cmp(count, (u1)16);
 2629       __ br(__ LO, tail);
 2630 
 2631       __ mov(rscratch1, 16);
 2632       __ andr(rscratch2, dest, 15);
 2633       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 2634       __ strq(v0, Address(dest));
 2635       __ sub(count, count, rscratch1);
 2636       __ add(dest, dest, rscratch1);
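            // dest is now 16-byte aligned (or advanced by a full 16 bytes if it was
            // already aligned); any bytes between the new dest and the end of the
            // strq above are simply re-written with the same value.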
 2637     }
 2638 
 2639     __ subs(count, count, (u1)64);
 2640     __ br(__ LO, tail);
 2641     {
 2642       Label again;
 2643       __ bind(again);
 2644       __ stpq(v0, v0, Address(dest));
 2645       __ stpq(v0, v0, Address(dest, 32));
 2646 
 2647       __ subs(count, count, 64);
 2648       __ add(dest, dest, 64);
 2649       __ br(__ HS, again);
 2650     }
 2651 
 2652     __ bind(tail);
 2653     // The count of bytes is off by 64, but we don't need to correct
 2654     // it because we're only going to use the least-significant few
 2655     // count bits from here on.
 2656     // __ add(count, count, 64);
 2657 
 2658     {
 2659       Label dont;
 2660       __ tbz(count, exact_log2(32), dont);
 2661       __ stpq(v0, v0, __ post(dest, 32));
 2662       __ bind(dont);
 2663     }
 2664     {
 2665       Label dont;
 2666       __ tbz(count, exact_log2(16), dont);
 2667       __ strq(v0, __ post(dest, 16));
 2668       __ bind(dont);
 2669     }
 2670     {
 2671       Label dont;
 2672       __ tbz(count, exact_log2(8), dont);
 2673       __ strd(v0, __ post(dest, 8));
 2674       __ bind(dont);
 2675     }
 2676 
 2677     Label finished;
 2678     __ tst(count, 7);
 2679     __ br(__ EQ, finished);
 2680 
 2681     {
 2682       Label dont;
 2683       __ tbz(count, exact_log2(4), dont);
 2684       __ strs(v0, __ post(dest, 4));
 2685       __ bind(dont);
 2686     }
 2687     {
 2688       Label dont;
 2689       __ tbz(count, exact_log2(2), dont);
 2690       __ bfi(value, value, 8, 8);
 2691       __ strh(value, __ post(dest, 2));
 2692       __ bind(dont);
 2693     }
 2694     {
 2695       Label dont;
 2696       __ tbz(count, exact_log2(1), dont);
 2697       __ strb(value, Address(dest));
 2698       __ bind(dont);
 2699     }
 2700 
 2701     __ bind(finished);
 2702     __ leave();
 2703     __ ret(lr);
 2704 
 2705     return start;
 2706   }
 2707 
 2708   address generate_data_cache_writeback() {
 2709     const Register line        = c_rarg0;  // address of line to write back
 2710 
 2711     __ align(CodeEntryAlignment);
 2712 
 2713     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 2714     StubCodeMark mark(this, stub_id);
 2715 
 2716     address start = __ pc();
 2717     __ enter();
 2718     __ cache_wb(Address(line, 0));
 2719     __ leave();
 2720     __ ret(lr);
 2721 
 2722     return start;
 2723   }
 2724 
 2725   address generate_data_cache_writeback_sync() {
 2726     const Register is_pre     = c_rarg0;  // pre or post sync
 2727 
 2728     __ align(CodeEntryAlignment);
 2729 
 2730     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 2731     StubCodeMark mark(this, stub_id);
 2732 
 2733     // pre wbsync is a no-op
 2734     // post wbsync emits a memory barrier
 2735 
 2736     Label skip;
 2737     address start = __ pc();
 2738     __ enter();
 2739     __ cbnz(is_pre, skip);
 2740     __ cache_wbsync(false);
 2741     __ bind(skip);
 2742     __ leave();
 2743     __ ret(lr);
 2744 
 2745     return start;
 2746   }
 2747 
 2748   void generate_arraycopy_stubs() {
 2749     // Some copy stubs publish a normal entry and then a 2nd 'fallback'
 2750     // entry immediately following their stack push. This can be used
 2751     // as a post-push branch target for compatible stubs when they
 2752     // identify a special case that can be handled by the fallback
 2753     // stub, e.g. a disjoint copy stub may be used as a special-case
 2754     // fallback for its compatible conjoint copy stub.
 2755     //
 2756     // A no-push entry is always returned in the following local and
 2757     // then published by assigning to the appropriate entry field in
 2758     // class StubRoutines. The entry value is then passed to the
 2759     // generator for the compatible stub. That means the entry must be
 2760     // listed when saving to/restoring from the AOT cache, ensuring
 2761     // that the inter-stub jumps are noted at AOT-cache save and
 2762     // relocated at AOT cache load.
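          // e.g. for the jbyte copies below, _jbyte_disjoint_arraycopy_nopush is
          // captured via this local and then passed as the nooverlap_target when
          // generating _jbyte_arraycopy.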
 2763     address nopush_entry;
 2764 
 2765     // generate the common exit first so later stubs can rely on it if
 2766     // they want an UnsafeMemoryAccess exit non-local to the stub
 2767     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 2768     // register the stub as the default exit with class UnsafeMemoryAccess
 2769     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 2770 
 2771     // generate and publish aarch64-specific bulk copy routines first
 2772     // so we can call them from other copy stubs
 2773     StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2774     StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2775 
 2776     StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2777     StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2778 
 2779     StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2780     StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2781 
 2782     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2783 
 2784     //*** jbyte
 2785     // Always need aligned and unaligned versions
 2786     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2787     // disjoint nopush entry is needed by conjoint copy
 2788     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2789     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
 2790     // conjoint nopush entry is needed by generic/unsafe copy
 2791     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
 2792     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2793     // disjoint arrayof nopush entry is needed by conjoint copy
 2794     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2795     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
 2796 
 2797     //*** jshort
 2798     // Always need aligned and unaligned versions
 2799     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
 2800     // disjoint nopush entry is needed by conjoint copy
 2801     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
 2802     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
 2803     // conjoint nopush entry is used by generic/unsafe copy
 2804     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
 2805     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
 2806     // disjoint arrayof nopush entry is needed by conjoint copy
 2807     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
 2808     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
 2809 
 2810     //*** jint
 2811     // Aligned versions
 2812     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
 2813     // disjoint arrayof nopush entry is needed by conjoint copy
 2814     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
 2815     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
    // In 64-bit mode we need both aligned and unaligned versions of jint arraycopy.
    // jint_arraycopy_nopush always points to the unaligned version.
 2818     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
 2819     // disjoint nopush entry is needed by conjoint copy
 2820     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
 2821     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
 2822     // conjoint nopush entry is needed by generic/unsafe copy
 2823     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
 2824 
 2825     //*** jlong
 2826     // It is always aligned
 2827     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
 2828     // disjoint arrayof nopush entry is needed by conjoint copy
 2829     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
 2830     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
 2831     // conjoint nopush entry is needed by generic/unsafe copy
 2832     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
 2833     // disjoint normal/nopush and conjoint normal entries are not
 2834     // generated since the arrayof versions are the same
 2835     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2836     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
 2837     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2838 
 2839     //*** oops
 2840     {
 2841       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2842         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
 2843       // disjoint arrayof nopush entry is needed by conjoint copy
 2844       StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
 2845       StubRoutines::_arrayof_oop_arraycopy
 2846         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
 2847       // conjoint arrayof nopush entry is needed by generic/unsafe copy
 2848       StubRoutines::_oop_arraycopy_nopush = nopush_entry;
 2849       // Aligned versions without pre-barriers
 2850       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2851         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
 2852       // disjoint arrayof+uninit nopush entry is needed by conjoint copy
 2853       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
 2854       // note that we don't need a returned nopush entry because the
 2855       // generic/unsafe copy does not cater for uninit arrays.
 2856       StubRoutines::_arrayof_oop_arraycopy_uninit
 2857         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
 2858     }
 2859 
 2860     // for oop copies reuse arrayof entries for non-arrayof cases
 2861     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2862     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
 2863     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2864     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2865     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
 2866     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2867 
 2868     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
 2869     // checkcast nopush entry is needed by generic copy
 2870     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
 2871     // note that we don't need a returned nopush entry because the
 2872     // generic copy does not cater for uninit arrays.
 2873     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 2874 
    // unsafe arraycopy may fall back on conjoint stubs
 2876     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2877                                                               StubRoutines::_jshort_arraycopy_nopush,
 2878                                                               StubRoutines::_jint_arraycopy_nopush,
 2879                                                               StubRoutines::_jlong_arraycopy_nopush);
 2880 
    // generic arraycopy may fall back on conjoint stubs
 2882     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2883                                                                StubRoutines::_jshort_arraycopy_nopush,
 2884                                                                StubRoutines::_jint_arraycopy_nopush,
 2885                                                                StubRoutines::_oop_arraycopy_nopush,
 2886                                                                StubRoutines::_jlong_arraycopy_nopush,
 2887                                                                StubRoutines::_checkcast_arraycopy_nopush);
 2888 
 2889     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 2890     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 2891     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 2892     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 2893     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 2894     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 2895   }
 2896 
 2897   void generate_math_stubs() { Unimplemented(); }
 2898 
 2899   // Arguments:
 2900   //
 2901   // Inputs:
 2902   //   c_rarg0   - source byte array address
 2903   //   c_rarg1   - destination byte array address
 2904   //   c_rarg2   - K (key) in little endian int array
 2905   //
 2906   address generate_aescrypt_encryptBlock() {
 2907     __ align(CodeEntryAlignment);
 2908     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 2909     StubCodeMark mark(this, stub_id);
 2910 
 2911     const Register from        = c_rarg0;  // source array address
 2912     const Register to          = c_rarg1;  // destination array address
 2913     const Register key         = c_rarg2;  // key array address
 2914     const Register keylen      = rscratch1;
 2915 
 2916     address start = __ pc();
 2917     __ enter();
 2918 
 2919     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2920 
 2921     __ aesenc_loadkeys(key, keylen);
 2922     __ aesecb_encrypt(from, to, keylen);
 2923 
 2924     __ mov(r0, 0);
 2925 
 2926     __ leave();
 2927     __ ret(lr);
 2928 
 2929     return start;
 2930   }
 2931 
 2932   // Arguments:
 2933   //
 2934   // Inputs:
 2935   //   c_rarg0   - source byte array address
 2936   //   c_rarg1   - destination byte array address
 2937   //   c_rarg2   - K (key) in little endian int array
 2938   //
 2939   address generate_aescrypt_decryptBlock() {
 2940     assert(UseAES, "need AES cryptographic extension support");
 2941     __ align(CodeEntryAlignment);
 2942     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 2943     StubCodeMark mark(this, stub_id);
 2944     Label L_doLast;
 2945 
 2946     const Register from        = c_rarg0;  // source array address
 2947     const Register to          = c_rarg1;  // destination array address
 2948     const Register key         = c_rarg2;  // key array address
 2949     const Register keylen      = rscratch1;
 2950 
 2951     address start = __ pc();
 2952     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2953 
 2954     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2955 
 2956     __ aesecb_decrypt(from, to, key, keylen);
 2957 
 2958     __ mov(r0, 0);
 2959 
 2960     __ leave();
 2961     __ ret(lr);
 2962 
 2963     return start;
 2964   }
 2965 
 2966   // Arguments:
 2967   //
 2968   // Inputs:
 2969   //   c_rarg0   - source byte array address
 2970   //   c_rarg1   - destination byte array address
 2971   //   c_rarg2   - K (key) in little endian int array
 2972   //   c_rarg3   - r vector byte array address
 2973   //   c_rarg4   - input length
 2974   //
 2975   // Output:
 2976   //   x0        - input length
 2977   //
 2978   address generate_cipherBlockChaining_encryptAESCrypt() {
 2979     assert(UseAES, "need AES cryptographic extension support");
 2980     __ align(CodeEntryAlignment);
 2981     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 2982     StubCodeMark mark(this, stub_id);
 2983 
 2984     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2985 
 2986     const Register from        = c_rarg0;  // source array address
 2987     const Register to          = c_rarg1;  // destination array address
 2988     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
                                           // and left holding the last ciphertext block
 2991     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2992     const Register keylen      = rscratch1;
 2993 
 2994     address start = __ pc();
 2995 
 2996       __ enter();
 2997 
 2998       __ movw(rscratch2, len_reg);
 2999 
 3000       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3001 
 3002       __ ld1(v0, __ T16B, rvec);
 3003 
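      // Dispatch on the expanded key length (in 32-bit words): 44 for
      // AES-128, 52 for AES-192, 60 for AES-256. Shorter keys skip
      // loading the extra round keys below.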
 3004       __ cmpw(keylen, 52);
 3005       __ br(Assembler::CC, L_loadkeys_44);
 3006       __ br(Assembler::EQ, L_loadkeys_52);
 3007 
 3008       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3009       __ rev32(v17, __ T16B, v17);
 3010       __ rev32(v18, __ T16B, v18);
 3011     __ BIND(L_loadkeys_52);
 3012       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3013       __ rev32(v19, __ T16B, v19);
 3014       __ rev32(v20, __ T16B, v20);
 3015     __ BIND(L_loadkeys_44);
 3016       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3017       __ rev32(v21, __ T16B, v21);
 3018       __ rev32(v22, __ T16B, v22);
 3019       __ rev32(v23, __ T16B, v23);
 3020       __ rev32(v24, __ T16B, v24);
 3021       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3022       __ rev32(v25, __ T16B, v25);
 3023       __ rev32(v26, __ T16B, v26);
 3024       __ rev32(v27, __ T16B, v27);
 3025       __ rev32(v28, __ T16B, v28);
 3026       __ ld1(v29, v30, v31, __ T16B, key);
 3027       __ rev32(v29, __ T16B, v29);
 3028       __ rev32(v30, __ T16B, v30);
 3029       __ rev32(v31, __ T16B, v31);
 3030 
 3031     __ BIND(L_aes_loop);
 3032       __ ld1(v1, __ T16B, __ post(from, 16));
 3033       __ eor(v0, __ T16B, v0, v1);
 3034 
 3035       __ br(Assembler::CC, L_rounds_44);
 3036       __ br(Assembler::EQ, L_rounds_52);
 3037 
 3038       __ aese(v0, v17); __ aesmc(v0, v0);
 3039       __ aese(v0, v18); __ aesmc(v0, v0);
 3040     __ BIND(L_rounds_52);
 3041       __ aese(v0, v19); __ aesmc(v0, v0);
 3042       __ aese(v0, v20); __ aesmc(v0, v0);
 3043     __ BIND(L_rounds_44);
 3044       __ aese(v0, v21); __ aesmc(v0, v0);
 3045       __ aese(v0, v22); __ aesmc(v0, v0);
 3046       __ aese(v0, v23); __ aesmc(v0, v0);
 3047       __ aese(v0, v24); __ aesmc(v0, v0);
 3048       __ aese(v0, v25); __ aesmc(v0, v0);
 3049       __ aese(v0, v26); __ aesmc(v0, v0);
 3050       __ aese(v0, v27); __ aesmc(v0, v0);
 3051       __ aese(v0, v28); __ aesmc(v0, v0);
 3052       __ aese(v0, v29); __ aesmc(v0, v0);
 3053       __ aese(v0, v30);
 3054       __ eor(v0, __ T16B, v0, v31);
 3055 
 3056       __ st1(v0, __ T16B, __ post(to, 16));
 3057 
 3058       __ subw(len_reg, len_reg, 16);
 3059       __ cbnzw(len_reg, L_aes_loop);
 3060 
 3061       __ st1(v0, __ T16B, rvec);
 3062 
 3063       __ mov(r0, rscratch2);
 3064 
 3065       __ leave();
 3066       __ ret(lr);
 3067 
 3068       return start;
 3069   }
 3070 
 3071   // Arguments:
 3072   //
 3073   // Inputs:
 3074   //   c_rarg0   - source byte array address
 3075   //   c_rarg1   - destination byte array address
 3076   //   c_rarg2   - K (key) in little endian int array
 3077   //   c_rarg3   - r vector byte array address
 3078   //   c_rarg4   - input length
 3079   //
 3080   // Output:
 3081   //   r0        - input length
 3082   //
 3083   address generate_cipherBlockChaining_decryptAESCrypt() {
 3084     assert(UseAES, "need AES cryptographic extension support");
 3085     __ align(CodeEntryAlignment);
 3086     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3087     StubCodeMark mark(this, stub_id);
 3088 
 3089     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3090 
 3091     const Register from        = c_rarg0;  // source array address
 3092     const Register to          = c_rarg1;  // destination array address
 3093     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
                                           // and left holding the last input ciphertext block
 3096     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3097     const Register keylen      = rscratch1;
 3098 
 3099     address start = __ pc();
 3100 
 3101       __ enter();
 3102 
 3103       __ movw(rscratch2, len_reg);
 3104 
 3105       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3106 
 3107       __ ld1(v2, __ T16B, rvec);
 3108 
 3109       __ ld1(v31, __ T16B, __ post(key, 16));
 3110       __ rev32(v31, __ T16B, v31);
 3111 
 3112       __ cmpw(keylen, 52);
 3113       __ br(Assembler::CC, L_loadkeys_44);
 3114       __ br(Assembler::EQ, L_loadkeys_52);
 3115 
 3116       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3117       __ rev32(v17, __ T16B, v17);
 3118       __ rev32(v18, __ T16B, v18);
 3119     __ BIND(L_loadkeys_52);
 3120       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3121       __ rev32(v19, __ T16B, v19);
 3122       __ rev32(v20, __ T16B, v20);
 3123     __ BIND(L_loadkeys_44);
 3124       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3125       __ rev32(v21, __ T16B, v21);
 3126       __ rev32(v22, __ T16B, v22);
 3127       __ rev32(v23, __ T16B, v23);
 3128       __ rev32(v24, __ T16B, v24);
 3129       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3130       __ rev32(v25, __ T16B, v25);
 3131       __ rev32(v26, __ T16B, v26);
 3132       __ rev32(v27, __ T16B, v27);
 3133       __ rev32(v28, __ T16B, v28);
 3134       __ ld1(v29, v30, __ T16B, key);
 3135       __ rev32(v29, __ T16B, v29);
 3136       __ rev32(v30, __ T16B, v30);
 3137 
 3138     __ BIND(L_aes_loop);
 3139       __ ld1(v0, __ T16B, __ post(from, 16));
 3140       __ orr(v1, __ T16B, v0, v0);
 3141 
 3142       __ br(Assembler::CC, L_rounds_44);
 3143       __ br(Assembler::EQ, L_rounds_52);
 3144 
 3145       __ aesd(v0, v17); __ aesimc(v0, v0);
 3146       __ aesd(v0, v18); __ aesimc(v0, v0);
 3147     __ BIND(L_rounds_52);
 3148       __ aesd(v0, v19); __ aesimc(v0, v0);
 3149       __ aesd(v0, v20); __ aesimc(v0, v0);
 3150     __ BIND(L_rounds_44);
 3151       __ aesd(v0, v21); __ aesimc(v0, v0);
 3152       __ aesd(v0, v22); __ aesimc(v0, v0);
 3153       __ aesd(v0, v23); __ aesimc(v0, v0);
 3154       __ aesd(v0, v24); __ aesimc(v0, v0);
 3155       __ aesd(v0, v25); __ aesimc(v0, v0);
 3156       __ aesd(v0, v26); __ aesimc(v0, v0);
 3157       __ aesd(v0, v27); __ aesimc(v0, v0);
 3158       __ aesd(v0, v28); __ aesimc(v0, v0);
 3159       __ aesd(v0, v29); __ aesimc(v0, v0);
 3160       __ aesd(v0, v30);
 3161       __ eor(v0, __ T16B, v0, v31);
 3162       __ eor(v0, __ T16B, v0, v2);
 3163 
 3164       __ st1(v0, __ T16B, __ post(to, 16));
 3165       __ orr(v2, __ T16B, v1, v1);
 3166 
 3167       __ subw(len_reg, len_reg, 16);
 3168       __ cbnzw(len_reg, L_aes_loop);
 3169 
 3170       __ st1(v2, __ T16B, rvec);
 3171 
 3172       __ mov(r0, rscratch2);
 3173 
 3174       __ leave();
 3175       __ ret(lr);
 3176 
 3177     return start;
 3178   }
 3179 
  // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: in (the 128-bit addend) and inc (the 64-bit increment); both
  // are preserved, and the lower dword of inc must be zero.
  // The least-significant 64-bit word is in the upper dword of each vector.
  // Output: result
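  //
  // Scalar sketch of the operation (illustrative pseudocode only, not
  // emitted code; lo/hi are the 64-bit words of the 128-bit value):
  //   lo     = in.lo + inc;          // add the increment to the low word
  //   carry  = (lo < inc) ? 1 : 0;   // unsigned overflow of the low word
  //   hi     = in.hi + carry;        // propagate the carry
  //   result = (hi, lo);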
 3185   void be_add_128_64(FloatRegister result, FloatRegister in,
 3186                      FloatRegister inc, FloatRegister tmp) {
 3187     assert_different_registers(result, tmp, inc);
 3188 
 3189     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3190                                            // input
    __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
 3192     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3193                                            // MSD == 0 (must be!) to LSD
 3194     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3195   }
 3196 
 3197   // CTR AES crypt.
 3198   // Arguments:
 3199   //
 3200   // Inputs:
 3201   //   c_rarg0   - source byte array address
 3202   //   c_rarg1   - destination byte array address
 3203   //   c_rarg2   - K (key) in little endian int array
 3204   //   c_rarg3   - counter vector byte array address
 3205   //   c_rarg4   - input length
 3206   //   c_rarg5   - saved encryptedCounter start
 3207   //   c_rarg6   - saved used length
 3208   //
 3209   // Output:
 3210   //   r0       - input length
 3211   //
 3212   address generate_counterMode_AESCrypt() {
 3213     const Register in = c_rarg0;
 3214     const Register out = c_rarg1;
 3215     const Register key = c_rarg2;
 3216     const Register counter = c_rarg3;
 3217     const Register saved_len = c_rarg4, len = r10;
 3218     const Register saved_encrypted_ctr = c_rarg5;
 3219     const Register used_ptr = c_rarg6, used = r12;
 3220 
 3221     const Register offset = r7;
 3222     const Register keylen = r11;
 3223 
 3224     const unsigned char block_size = 16;
 3225     const int bulk_width = 4;
 3226     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3227     // performance with larger data sizes, but it also means that the
 3228     // fast path isn't used until you have at least 8 blocks, and up
    // to 127 bytes of data will be processed on the slow path. For
 3230     // that reason, and also so as not to blow away too much icache, 4
 3231     // blocks seems like a sensible compromise.
 3232 
 3233     // Algorithm:
 3234     //
 3235     //    if (len == 0) {
 3236     //        goto DONE;
 3237     //    }
 3238     //    int result = len;
 3239     //    do {
 3240     //        if (used >= blockSize) {
 3241     //            if (len >= bulk_width * blockSize) {
 3242     //                CTR_large_block();
 3243     //                if (len == 0)
 3244     //                    goto DONE;
 3245     //            }
 3246     //            for (;;) {
 3247     //                16ByteVector v0 = counter;
 3248     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3249     //                used = 0;
 3250     //                if (len < blockSize)
 3251     //                    break;    /* goto NEXT */
 3252     //                16ByteVector v1 = load16Bytes(in, offset);
 3253     //                v1 = v1 ^ encryptedCounter;
 3254     //                store16Bytes(out, offset);
 3255     //                used = blockSize;
 3256     //                offset += blockSize;
 3257     //                len -= blockSize;
 3258     //                if (len == 0)
 3259     //                    goto DONE;
 3260     //            }
 3261     //        }
 3262     //      NEXT:
 3263     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3264     //        len--;
 3265     //    } while (len != 0);
 3266     //  DONE:
 3267     //    return result;
 3268     //
 3269     // CTR_large_block()
 3270     //    Wide bulk encryption of whole blocks.
 3271 
 3272     __ align(CodeEntryAlignment);
 3273     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3274     StubCodeMark mark(this, stub_id);
 3275     const address start = __ pc();
 3276     __ enter();
 3277 
 3278     Label DONE, CTR_large_block, large_block_return;
 3279     __ ldrw(used, Address(used_ptr));
 3280     __ cbzw(saved_len, DONE);
 3281 
 3282     __ mov(len, saved_len);
 3283     __ mov(offset, 0);
 3284 
 3285     // Compute #rounds for AES based on the length of the key array
 3286     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3287 
 3288     __ aesenc_loadkeys(key, keylen);
 3289 
 3290     {
 3291       Label L_CTR_loop, NEXT;
 3292 
 3293       __ bind(L_CTR_loop);
 3294 
 3295       __ cmp(used, block_size);
 3296       __ br(__ LO, NEXT);
 3297 
 3298       // Maybe we have a lot of data
 3299       __ subsw(rscratch1, len, bulk_width * block_size);
 3300       __ br(__ HS, CTR_large_block);
 3301       __ BIND(large_block_return);
 3302       __ cbzw(len, DONE);
 3303 
 3304       // Setup the counter
 3305       __ movi(v4, __ T4S, 0);
 3306       __ movi(v5, __ T4S, 1);
 3307       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3308 
 3309       // 128-bit big-endian increment
 3310       __ ld1(v0, __ T16B, counter);
 3311       __ rev64(v16, __ T16B, v0);
 3312       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3313       __ rev64(v16, __ T16B, v16);
 3314       __ st1(v16, __ T16B, counter);
 3315       // Previous counter value is in v0
 3316       // v4 contains { 0, 1 }
 3317 
 3318       {
 3319         // We have fewer than bulk_width blocks of data left. Encrypt
 3320         // them one by one until there is less than a full block
 3321         // remaining, being careful to save both the encrypted counter
 3322         // and the counter.
 3323 
 3324         Label inner_loop;
 3325         __ bind(inner_loop);
 3326         // Counter to encrypt is in v0
 3327         __ aesecb_encrypt(noreg, noreg, keylen);
 3328         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3329 
 3330         // Do we have a remaining full block?
 3331 
 3332         __ mov(used, 0);
 3333         __ cmp(len, block_size);
 3334         __ br(__ LO, NEXT);
 3335 
 3336         // Yes, we have a full block
 3337         __ ldrq(v1, Address(in, offset));
 3338         __ eor(v1, __ T16B, v1, v0);
 3339         __ strq(v1, Address(out, offset));
 3340         __ mov(used, block_size);
 3341         __ add(offset, offset, block_size);
 3342 
 3343         __ subw(len, len, block_size);
 3344         __ cbzw(len, DONE);
 3345 
 3346         // Increment the counter, store it back
 3347         __ orr(v0, __ T16B, v16, v16);
 3348         __ rev64(v16, __ T16B, v16);
 3349         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3350         __ rev64(v16, __ T16B, v16);
 3351         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3352 
 3353         __ b(inner_loop);
 3354       }
 3355 
 3356       __ BIND(NEXT);
 3357 
 3358       // Encrypt a single byte, and loop.
 3359       // We expect this to be a rare event.
 3360       __ ldrb(rscratch1, Address(in, offset));
 3361       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3362       __ eor(rscratch1, rscratch1, rscratch2);
 3363       __ strb(rscratch1, Address(out, offset));
 3364       __ add(offset, offset, 1);
 3365       __ add(used, used, 1);
      __ subw(len, len, 1);
 3367       __ cbnzw(len, L_CTR_loop);
 3368     }
 3369 
 3370     __ bind(DONE);
 3371     __ strw(used, Address(used_ptr));
 3372     __ mov(r0, saved_len);
 3373 
 3374     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3375     __ ret(lr);
 3376 
 3377     // Bulk encryption
 3378 
    __ BIND(CTR_large_block);
 3380     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3381 
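    // The low 64 bits of v8..v15 are callee-saved under the AAPCS64, so
    // spill them before they are used as scratch in the bulk loop below.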
 3382     if (bulk_width == 8) {
 3383       __ sub(sp, sp, 4 * 16);
 3384       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3385     }
 3386     __ sub(sp, sp, 4 * 16);
 3387     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3388     RegSet saved_regs = (RegSet::of(in, out, offset)
 3389                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3390     __ push(saved_regs, sp);
 3391     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3392     __ add(in, in, offset);
 3393     __ add(out, out, offset);
 3394 
 3395     // Keys should already be loaded into the correct registers
 3396 
 3397     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3398     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3399 
 3400     // AES/CTR loop
 3401     {
 3402       Label L_CTR_loop;
 3403       __ BIND(L_CTR_loop);
 3404 
 3405       // Setup the counters
 3406       __ movi(v8, __ T4S, 0);
 3407       __ movi(v9, __ T4S, 1);
 3408       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3409 
 3410       for (int i = 0; i < bulk_width; i++) {
 3411         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3412         __ rev64(v0_ofs, __ T16B, v16);
 3413         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3414       }
 3415 
 3416       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3417 
 3418       // Encrypt the counters
 3419       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3420 
 3421       if (bulk_width == 8) {
 3422         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3423       }
 3424 
 3425       // XOR the encrypted counters with the inputs
 3426       for (int i = 0; i < bulk_width; i++) {
 3427         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3428         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3429         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3430       }
 3431 
 3432       // Write the encrypted data
 3433       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3434       if (bulk_width == 8) {
 3435         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3436       }
 3437 
 3438       __ subw(len, len, 16 * bulk_width);
 3439       __ cbnzw(len, L_CTR_loop);
 3440     }
 3441 
 3442     // Save the counter back where it goes
 3443     __ rev64(v16, __ T16B, v16);
 3444     __ st1(v16, __ T16B, counter);
 3445 
 3446     __ pop(saved_regs, sp);
 3447 
 3448     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3449     if (bulk_width == 8) {
 3450       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3451     }
 3452 
 3453     __ andr(rscratch1, len, -16 * bulk_width);
 3454     __ sub(len, len, rscratch1);
 3455     __ add(offset, offset, rscratch1);
 3456     __ mov(used, 16);
 3457     __ strw(used, Address(used_ptr));
 3458     __ b(large_block_return);
 3459 
 3460     return start;
 3461   }
 3462 
 3463   // Vector AES Galois Counter Mode implementation. Parameters:
 3464   //
 3465   // in = c_rarg0
 3466   // len = c_rarg1
 3467   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3468   // out = c_rarg3
 3469   // key = c_rarg4
 3470   // state = c_rarg5 - GHASH.state
 3471   // subkeyHtbl = c_rarg6 - powers of H
 3472   // counter = c_rarg7 - 16 bytes of CTR
 3473   // return - number of processed bytes
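  //
  // The stub only processes multiples of 8 blocks (128 bytes): a bulk
  // AES/CTR pass encrypts counters and XORs them with the input, then a
  // single ghash_processBlocks_wide pass folds the resulting ciphertext
  // into GHASH.state. Any tail shorter than 8 blocks is left to the
  // caller, hence the processed-byte count returned in r0.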
 3474   address generate_galoisCounterMode_AESCrypt() {
 3475     Label ghash_polynomial; // local data generated after code
 3476 
    __ align(CodeEntryAlignment);
 3478     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3479     StubCodeMark mark(this, stub_id);
 3480     address start = __ pc();
 3481     __ enter();
 3482 
 3483     const Register in = c_rarg0;
 3484     const Register len = c_rarg1;
 3485     const Register ct = c_rarg2;
    const Register out = c_rarg3;
 3488 
 3489     const Register key = c_rarg4;
 3490     const Register state = c_rarg5;
 3491 
 3492     const Register subkeyHtbl = c_rarg6;
 3493 
    const Register counter = c_rarg7;      // 16-byte counter, updated with the
                                           // incremented value on return
 3495 
 3496     const Register keylen = r10;
    // Save SIMD state (v8..v15 are callee-saved) before clobbering it
 3498     __ sub(sp, sp, 4 * 16);
 3499     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3500     __ sub(sp, sp, 4 * 16);
 3501     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3502 
 3504     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3505     __ str(len, __ pre(sp, -2 * wordSize));
 3506 
 3507     Label DONE;
 3508     __ cbz(len, DONE);
 3509 
 3510     // Compute #rounds for AES based on the length of the key array
 3511     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3512 
 3513     __ aesenc_loadkeys(key, keylen);
 3514     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3515     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3516 
 3517     // AES/CTR loop
 3518     {
 3519       Label L_CTR_loop;
 3520       __ BIND(L_CTR_loop);
 3521 
 3522       // Setup the counters
 3523       __ movi(v8, __ T4S, 0);
 3524       __ movi(v9, __ T4S, 1);
 3525       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3526 
 3527       assert(v0->encoding() < v8->encoding(), "");
 3528       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3529         FloatRegister f = as_FloatRegister(i);
 3530         __ rev32(f, __ T16B, v16);
 3531         __ addv(v16, __ T4S, v16, v8);
 3532       }
 3533 
 3534       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3535 
 3536       // Encrypt the counters
 3537       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3538 
 3539       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3540 
 3541       // XOR the encrypted counters with the inputs
 3542       for (int i = 0; i < 8; i++) {
 3543         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3544         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3545         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3546       }
 3547       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3548       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3549 
 3550       __ subw(len, len, 16 * 8);
 3551       __ cbnzw(len, L_CTR_loop);
 3552     }
 3553 
 3554     __ rev32(v16, __ T16B, v16);
 3555     __ st1(v16, __ T16B, counter);
 3556 
 3557     __ ldr(len, Address(sp));
 3558     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3559 
 3560     // GHASH/CTR loop
 3561     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3562                                 len, /*unrolls*/4);
 3563 
 3564 #ifdef ASSERT
 3565     { Label L;
 3566       __ cmp(len, (unsigned char)0);
 3567       __ br(Assembler::EQ, L);
 3568       __ stop("stubGenerator: abort");
 3569       __ bind(L);
 3570   }
 3571 #endif
 3572 
 3573   __ bind(DONE);
 3574     // Return the number of bytes processed
 3575     __ ldr(r0, __ post(sp, 2 * wordSize));
 3576 
 3577     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3578     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3579 
 3580     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3581     __ ret(lr);
 3582 
 3583     // bind label and generate polynomial data
 3584     __ align(wordSize * 2);
 3585     __ bind(ghash_polynomial);
 3586     __ emit_int64(0x87);  // The low-order bits of the field
 3587                           // polynomial (i.e. p = z^7+z^2+z+1)
 3588                           // repeated in the low and high parts of a
 3589                           // 128-bit vector
 3590     __ emit_int64(0x87);
 3591 
 3592     return start;
 3593   }
 3594 
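  // Caches one 64-byte block of input in 8 general-purpose registers
  // (loaded pairwise with ldp) and serves the sixteen 32-bit words
  // X[0..15] via ubfx extracts, so the MD5 round helpers below never
  // reload message words from memory.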
 3595   class Cached64Bytes {
 3596   private:
 3597     MacroAssembler *_masm;
 3598     Register _regs[8];
 3599 
 3600   public:
 3601     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "need 8 registers to cache 16 4-byte words, got %u", rs.size());
 3603       auto it = rs.begin();
 3604       for (auto &r: _regs) {
 3605         r = *it;
 3606         ++it;
 3607       }
 3608     }
 3609 
 3610     void gen_loads(Register base) {
 3611       for (int i = 0; i < 8; i += 2) {
 3612         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3613       }
 3614     }
 3615 
 3616     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3617     void extract_u32(Register dest, int i) {
 3618       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3619     }
 3620   };
 3621 
 3622   // Utility routines for md5.
 3623   // Clobbers r10 and r11.
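  //
  // Each helper performs one MD5 step (as defined in RFC 1321):
  //   r1 = r2 + rotate_left(r1 + f(r2, r3, r4) + X[k] + t, s)
  // where f is, respectively:
  //   FF: F(x, y, z) = (x & y) | (~x & z)
  //   GG: G(x, y, z) = (x & z) | (y & ~z)
  //   HH: H(x, y, z) = x ^ y ^ z
  //   II: I(x, y, z) = y ^ (x | ~z)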
 3624   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3625               int k, int s, int t) {
 3626     Register rscratch3 = r10;
 3627     Register rscratch4 = r11;
 3628 
 3629     __ eorw(rscratch3, r3, r4);
 3630     __ movw(rscratch2, t);
 3631     __ andw(rscratch3, rscratch3, r2);
 3632     __ addw(rscratch4, r1, rscratch2);
 3633     reg_cache.extract_u32(rscratch1, k);
 3634     __ eorw(rscratch3, rscratch3, r4);
 3635     __ addw(rscratch4, rscratch4, rscratch1);
 3636     __ addw(rscratch3, rscratch3, rscratch4);
 3637     __ rorw(rscratch2, rscratch3, 32 - s);
 3638     __ addw(r1, rscratch2, r2);
 3639   }
 3640 
 3641   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3642               int k, int s, int t) {
 3643     Register rscratch3 = r10;
 3644     Register rscratch4 = r11;
 3645 
 3646     reg_cache.extract_u32(rscratch1, k);
 3647     __ movw(rscratch2, t);
 3648     __ addw(rscratch4, r1, rscratch2);
 3649     __ addw(rscratch4, rscratch4, rscratch1);
 3650     __ bicw(rscratch2, r3, r4);
 3651     __ andw(rscratch3, r2, r4);
 3652     __ addw(rscratch2, rscratch2, rscratch4);
 3653     __ addw(rscratch2, rscratch2, rscratch3);
 3654     __ rorw(rscratch2, rscratch2, 32 - s);
 3655     __ addw(r1, rscratch2, r2);
 3656   }
 3657 
 3658   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3659               int k, int s, int t) {
 3660     Register rscratch3 = r10;
 3661     Register rscratch4 = r11;
 3662 
 3663     __ eorw(rscratch3, r3, r4);
 3664     __ movw(rscratch2, t);
 3665     __ addw(rscratch4, r1, rscratch2);
 3666     reg_cache.extract_u32(rscratch1, k);
 3667     __ eorw(rscratch3, rscratch3, r2);
 3668     __ addw(rscratch4, rscratch4, rscratch1);
 3669     __ addw(rscratch3, rscratch3, rscratch4);
 3670     __ rorw(rscratch2, rscratch3, 32 - s);
 3671     __ addw(r1, rscratch2, r2);
 3672   }
 3673 
 3674   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3675               int k, int s, int t) {
 3676     Register rscratch3 = r10;
 3677     Register rscratch4 = r11;
 3678 
 3679     __ movw(rscratch3, t);
 3680     __ ornw(rscratch2, r2, r4);
 3681     __ addw(rscratch4, r1, rscratch3);
 3682     reg_cache.extract_u32(rscratch1, k);
 3683     __ eorw(rscratch3, rscratch2, r3);
 3684     __ addw(rscratch4, rscratch4, rscratch1);
 3685     __ addw(rscratch3, rscratch3, rscratch4);
 3686     __ rorw(rscratch2, rscratch3, 32 - s);
 3687     __ addw(r1, rscratch2, r2);
 3688   }
 3689 
 3690   // Arguments:
 3691   //
 3692   // Inputs:
 3693   //   c_rarg0   - byte[]  source+offset
 3694   //   c_rarg1   - int[]   SHA.state
 3695   //   c_rarg2   - int     offset
 3696   //   c_rarg3   - int     limit
 3697   //
 3698   address generate_md5_implCompress(StubId stub_id) {
 3699     bool multi_block;
 3700     switch (stub_id) {
 3701     case StubId::stubgen_md5_implCompress_id:
 3702       multi_block = false;
 3703       break;
 3704     case StubId::stubgen_md5_implCompressMB_id:
 3705       multi_block = true;
 3706       break;
 3707     default:
 3708       ShouldNotReachHere();
 3709     }
 3710     __ align(CodeEntryAlignment);
 3711 
 3712     StubCodeMark mark(this, stub_id);
 3713     address start = __ pc();
 3714 
 3715     Register buf       = c_rarg0;
 3716     Register state     = c_rarg1;
 3717     Register ofs       = c_rarg2;
 3718     Register limit     = c_rarg3;
 3719     Register a         = r4;
 3720     Register b         = r5;
 3721     Register c         = r6;
 3722     Register d         = r7;
 3723     Register rscratch3 = r10;
 3724     Register rscratch4 = r11;
 3725 
 3726     Register state_regs[2] = { r12, r13 };
 3727     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3728     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3729 
 3730     __ push(saved_regs, sp);
 3731 
 3732     __ ldp(state_regs[0], state_regs[1], Address(state));
 3733     __ ubfx(a, state_regs[0],  0, 32);
 3734     __ ubfx(b, state_regs[0], 32, 32);
 3735     __ ubfx(c, state_regs[1],  0, 32);
 3736     __ ubfx(d, state_regs[1], 32, 32);
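    // state_regs[0] holds a | (b << 32) and state_regs[1] holds c | (d << 32);
    // the four 32-bit chaining values are unpacked into a..d here and
    // repacked with orr/LSL before being written back below.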
 3737 
 3738     Label md5_loop;
 3739     __ BIND(md5_loop);
 3740 
 3741     reg_cache.gen_loads(buf);
 3742 
 3743     // Round 1
 3744     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3745     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3746     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3747     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3748     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3749     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3750     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3751     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3752     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3753     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3754     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3755     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3756     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3757     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3758     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3759     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3760 
 3761     // Round 2
 3762     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3763     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3764     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3765     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3766     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3767     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3768     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3769     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3770     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3771     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3772     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3773     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3774     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3775     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3776     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3777     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3778 
 3779     // Round 3
 3780     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3781     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3782     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3783     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3784     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3785     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3786     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3787     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3788     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3789     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3790     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3791     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3792     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3793     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3794     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3795     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3796 
 3797     // Round 4
 3798     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3799     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3800     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3801     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3802     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3803     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3804     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3805     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3806     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3807     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3808     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3809     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3810     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3811     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3812     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3813     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3814 
 3815     __ addw(a, state_regs[0], a);
 3816     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3817     __ addw(b, rscratch2, b);
 3818     __ addw(c, state_regs[1], c);
 3819     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3820     __ addw(d, rscratch4, d);
 3821 
 3822     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3823     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3824 
 3825     if (multi_block) {
 3826       __ add(buf, buf, 64);
 3827       __ add(ofs, ofs, 64);
 3828       __ cmp(ofs, limit);
 3829       __ br(Assembler::LE, md5_loop);
 3830       __ mov(c_rarg0, ofs); // return ofs
 3831     }
 3832 
 3833     // write hash values back in the correct order
 3834     __ stp(state_regs[0], state_regs[1], Address(state));
 3835 
 3836     __ pop(saved_regs, sp);
 3837 
 3838     __ ret(lr);
 3839 
 3840     return start;
 3841   }
 3842 
 3843   // Arguments:
 3844   //
 3845   // Inputs:
 3846   //   c_rarg0   - byte[]  source+offset
 3847   //   c_rarg1   - int[]   SHA.state
 3848   //   c_rarg2   - int     offset
 3849   //   c_rarg3   - int     limit
 3850   //
 3851   address generate_sha1_implCompress(StubId stub_id) {
 3852     bool multi_block;
 3853     switch (stub_id) {
 3854     case StubId::stubgen_sha1_implCompress_id:
 3855       multi_block = false;
 3856       break;
 3857     case StubId::stubgen_sha1_implCompressMB_id:
 3858       multi_block = true;
 3859       break;
 3860     default:
 3861       ShouldNotReachHere();
 3862     }
 3863 
 3864     __ align(CodeEntryAlignment);
 3865 
 3866     StubCodeMark mark(this, stub_id);
 3867     address start = __ pc();
 3868 
 3869     Register buf   = c_rarg0;
 3870     Register state = c_rarg1;
 3871     Register ofs   = c_rarg2;
 3872     Register limit = c_rarg3;
 3873 
 3874     Label keys;
 3875     Label sha1_loop;
 3876 
 3877     // load the keys into v0..v3
 3878     __ adr(rscratch1, keys);
 3879     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3880     // load 5 words state into v6, v7
 3881     __ ldrq(v6, Address(state, 0));
 3882     __ ldrs(v7, Address(state, 16));
 3883 
 3884 
 3885     __ BIND(sha1_loop);
 3886     // load 64 bytes of data into v16..v19
 3887     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3888     __ rev32(v16, __ T16B, v16);
 3889     __ rev32(v17, __ T16B, v17);
 3890     __ rev32(v18, __ T16B, v18);
 3891     __ rev32(v19, __ T16B, v19);
 3892 
 3893     // do the sha1
 3894     __ addv(v4, __ T4S, v16, v0);
 3895     __ orr(v20, __ T16B, v6, v6);
 3896 
 3897     FloatRegister d0 = v16;
 3898     FloatRegister d1 = v17;
 3899     FloatRegister d2 = v18;
 3900     FloatRegister d3 = v19;
 3901 
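    // Each iteration of this loop covers four of SHA-1's 80 rounds: the
    // sha1c/sha1p/sha1m instructions each process a group of four rounds,
    // sha1su0/sha1su1 extend the message schedule, and v0..v3 (loaded with
    // ld4r above) hold the four per-stage round constants.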
 3902     for (int round = 0; round < 20; round++) {
 3903       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3904       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3905       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3906       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3907       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3908 
 3909       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3910       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3911       __ sha1h(tmp2, __ T4S, v20);
 3912       if (round < 5)
 3913         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3914       else if (round < 10 || round >= 15)
 3915         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3916       else
 3917         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3918       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3919 
 3920       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3921     }
 3922 
 3923     __ addv(v7, __ T2S, v7, v21);
 3924     __ addv(v6, __ T4S, v6, v20);
 3925 
 3926     if (multi_block) {
 3927       __ add(ofs, ofs, 64);
 3928       __ cmp(ofs, limit);
 3929       __ br(Assembler::LE, sha1_loop);
 3930       __ mov(c_rarg0, ofs); // return ofs
 3931     }
 3932 
 3933     __ strq(v6, Address(state, 0));
 3934     __ strs(v7, Address(state, 16));
 3935 
 3936     __ ret(lr);
 3937 
 3938     __ bind(keys);
 3939     __ emit_int32(0x5a827999);
 3940     __ emit_int32(0x6ed9eba1);
 3941     __ emit_int32(0x8f1bbcdc);
 3942     __ emit_int32(0xca62c1d6);
 3943 
 3944     return start;
 3945   }
 3946 
 3947 
 3948   // Arguments:
 3949   //
 3950   // Inputs:
 3951   //   c_rarg0   - byte[]  source+offset
 3952   //   c_rarg1   - int[]   SHA.state
 3953   //   c_rarg2   - int     offset
 3954   //   c_rarg3   - int     limit
 3955   //
 3956   address generate_sha256_implCompress(StubId stub_id) {
 3957     bool multi_block;
 3958     switch (stub_id) {
 3959     case StubId::stubgen_sha256_implCompress_id:
 3960       multi_block = false;
 3961       break;
 3962     case StubId::stubgen_sha256_implCompressMB_id:
 3963       multi_block = true;
 3964       break;
 3965     default:
 3966       ShouldNotReachHere();
 3967     }
 3968 
 3969     static const uint32_t round_consts[64] = {
 3970       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3971       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3972       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3973       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3974       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3975       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3976       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3977       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3978       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3979       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3980       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3981       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3982       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3983       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3984       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3985       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3986     };
 3987 
 3988     __ align(CodeEntryAlignment);
 3989 
 3990     StubCodeMark mark(this, stub_id);
 3991     address start = __ pc();
 3992 
 3993     Register buf   = c_rarg0;
 3994     Register state = c_rarg1;
 3995     Register ofs   = c_rarg2;
 3996     Register limit = c_rarg3;
 3997 
    Label sha256_loop;
 3999 
 4000     __ stpd(v8, v9, __ pre(sp, -32));
 4001     __ stpd(v10, v11, Address(sp, 16));
 4002 
 4003 // dga == v0
 4004 // dgb == v1
 4005 // dg0 == v2
 4006 // dg1 == v3
 4007 // dg2 == v4
 4008 // t0 == v6
 4009 // t1 == v7
 4010 
 4011     // load 16 keys to v16..v31
 4012     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4013     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 4014     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 4015     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 4016     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 4017 
 4018     // load 8 words (256 bits) state
 4019     __ ldpq(v0, v1, state);
 4020 
    __ BIND(sha256_loop);
 4022     // load 64 bytes of data into v8..v11
 4023     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 4024     __ rev32(v8, __ T16B, v8);
 4025     __ rev32(v9, __ T16B, v9);
 4026     __ rev32(v10, __ T16B, v10);
 4027     __ rev32(v11, __ T16B, v11);
 4028 
 4029     __ addv(v6, __ T4S, v8, v16);
 4030     __ orr(v2, __ T16B, v0, v0);
 4031     __ orr(v3, __ T16B, v1, v1);
 4032 
 4033     FloatRegister d0 = v8;
 4034     FloatRegister d1 = v9;
 4035     FloatRegister d2 = v10;
 4036     FloatRegister d3 = v11;
 4037 
 4038 
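    // Each iteration covers four of SHA-256's 64 rounds: the round
    // constants are pre-added to the message words with addv, the
    // sha256h/sha256h2 pair advances the state by four rounds, and
    // sha256su0/sha256su1 extend the message schedule.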
 4039     for (int round = 0; round < 16; round++) {
 4040       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 4041       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 4042       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 4043       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 4044 
 4045       if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
 4047       if (round < 15)
 4048         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 4049       __ sha256h(v2, __ T4S, v3, tmp2);
 4050       __ sha256h2(v3, __ T4S, v4, tmp2);
 4051       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 4052 
 4053       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4054     }
 4055 
 4056     __ addv(v0, __ T4S, v0, v2);
 4057     __ addv(v1, __ T4S, v1, v3);
 4058 
 4059     if (multi_block) {
 4060       __ add(ofs, ofs, 64);
 4061       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
 4063       __ mov(c_rarg0, ofs); // return ofs
 4064     }
 4065 
 4066     __ ldpd(v10, v11, Address(sp, 16));
 4067     __ ldpd(v8, v9, __ post(sp, 32));
 4068 
 4069     __ stpq(v0, v1, state);
 4070 
 4071     __ ret(lr);
 4072 
 4073     return start;
 4074   }
 4075 
 4076   // Double rounds for sha512.
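  // Each call performs two of SHA-512's 80 rounds: sha512h/sha512h2 do the
  // round computation, sha512su0/sha512su1 (first 32 calls only) extend the
  // message schedule, and vrc1 is reloaded with the next pair of round
  // constants for all but the last four calls.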
 4077   void sha512_dround(int dr,
 4078                      FloatRegister vi0, FloatRegister vi1,
 4079                      FloatRegister vi2, FloatRegister vi3,
 4080                      FloatRegister vi4, FloatRegister vrc0,
 4081                      FloatRegister vrc1, FloatRegister vin0,
 4082                      FloatRegister vin1, FloatRegister vin2,
 4083                      FloatRegister vin3, FloatRegister vin4) {
 4084       if (dr < 36) {
 4085         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4086       }
 4087       __ addv(v5, __ T2D, vrc0, vin0);
 4088       __ ext(v6, __ T16B, vi2, vi3, 8);
 4089       __ ext(v5, __ T16B, v5, v5, 8);
 4090       __ ext(v7, __ T16B, vi1, vi2, 8);
 4091       __ addv(vi3, __ T2D, vi3, v5);
 4092       if (dr < 32) {
 4093         __ ext(v5, __ T16B, vin3, vin4, 8);
 4094         __ sha512su0(vin0, __ T2D, vin1);
 4095       }
 4096       __ sha512h(vi3, __ T2D, v6, v7);
 4097       if (dr < 32) {
 4098         __ sha512su1(vin0, __ T2D, vin2, v5);
 4099       }
 4100       __ addv(vi4, __ T2D, vi1, vi3);
 4101       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4102   }
 4103 
 4104   // Arguments:
 4105   //
 4106   // Inputs:
 4107   //   c_rarg0   - byte[]  source+offset
 4108   //   c_rarg1   - int[]   SHA.state
 4109   //   c_rarg2   - int     offset
 4110   //   c_rarg3   - int     limit
 4111   //
 4112   address generate_sha512_implCompress(StubId stub_id) {
 4113     bool multi_block;
 4114     switch (stub_id) {
 4115     case StubId::stubgen_sha512_implCompress_id:
 4116       multi_block = false;
 4117       break;
 4118     case StubId::stubgen_sha512_implCompressMB_id:
 4119       multi_block = true;
 4120       break;
 4121     default:
 4122       ShouldNotReachHere();
 4123     }
 4124 
 4125     static const uint64_t round_consts[80] = {
 4126       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 4127       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 4128       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 4129       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 4130       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 4131       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 4132       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 4133       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 4134       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 4135       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 4136       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 4137       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 4138       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 4139       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 4140       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 4141       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 4142       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 4143       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 4144       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 4145       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 4146       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 4147       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 4148       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 4149       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 4150       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 4151       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 4152       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 4153     };
 4154 
 4155     __ align(CodeEntryAlignment);
 4156 
 4157     StubCodeMark mark(this, stub_id);
 4158     address start = __ pc();
 4159 
 4160     Register buf   = c_rarg0;
 4161     Register state = c_rarg1;
 4162     Register ofs   = c_rarg2;
 4163     Register limit = c_rarg3;
 4164 
 4165     __ stpd(v8, v9, __ pre(sp, -64));
 4166     __ stpd(v10, v11, Address(sp, 16));
 4167     __ stpd(v12, v13, Address(sp, 32));
 4168     __ stpd(v14, v15, Address(sp, 48));
 4169 
 4170     Label sha512_loop;
 4171 
 4172     // load state
 4173     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4174 
 4175     // load first 4 round constants
 4176     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4177     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4178 
 4179     __ BIND(sha512_loop);
 4180     // load 128B of data into v12..v19
 4181     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4182     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4183     __ rev64(v12, __ T16B, v12);
 4184     __ rev64(v13, __ T16B, v13);
 4185     __ rev64(v14, __ T16B, v14);
 4186     __ rev64(v15, __ T16B, v15);
 4187     __ rev64(v16, __ T16B, v16);
 4188     __ rev64(v17, __ T16B, v17);
 4189     __ rev64(v18, __ T16B, v18);
 4190     __ rev64(v19, __ T16B, v19);
 4191 
 4192     __ mov(rscratch2, rscratch1);
 4193 
 4194     __ mov(v0, __ T16B, v8);
 4195     __ mov(v1, __ T16B, v9);
 4196     __ mov(v2, __ T16B, v10);
 4197     __ mov(v3, __ T16B, v11);
 4198 
 4199     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4200     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4201     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4202     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4203     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4204     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4205     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4206     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4207     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4208     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4209     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4210     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4211     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4212     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4213     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4214     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4215     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4216     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4217     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4218     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4219     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4220     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4221     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4222     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4223     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4224     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4225     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4226     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4227     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4228     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4229     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4230     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4231     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4232     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4233     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4234     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4235     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4236     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4237     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4238     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4239 
 4240     __ addv(v8, __ T2D, v8, v0);
 4241     __ addv(v9, __ T2D, v9, v1);
 4242     __ addv(v10, __ T2D, v10, v2);
 4243     __ addv(v11, __ T2D, v11, v3);
 4244 
 4245     if (multi_block) {
 4246       __ add(ofs, ofs, 128);
 4247       __ cmp(ofs, limit);
 4248       __ br(Assembler::LE, sha512_loop);
 4249       __ mov(c_rarg0, ofs); // return ofs
 4250     }
 4251 
 4252     __ st1(v8, v9, v10, v11, __ T2D, state);
 4253 
 4254     __ ldpd(v14, v15, Address(sp, 48));
 4255     __ ldpd(v12, v13, Address(sp, 32));
 4256     __ ldpd(v10, v11, Address(sp, 16));
 4257     __ ldpd(v8, v9, __ post(sp, 64));
 4258 
 4259     __ ret(lr);
 4260 
 4261     return start;
 4262   }
 4263 
 4264   // Execute one round of keccak of two computations in parallel.
 4265   // One of the states should be loaded into the lower halves of
 4266   // the vector registers v0-v24, the other should be loaded into
 4267   // the upper halves of those registers. The ld1r instruction loads
 4268   // the round constant into both halves of register v31.
 4269   // Intermediate results c0...c5 and d0...d5 are computed
 4270   // in registers v25...v30.
 4271   // All vector instructions that are used operate on both register
 4272   // halves in parallel.
  // If only a single computation is needed, one can load just the lower halves.
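  // The round follows the usual Keccak-f[1600] step order:
  //   theta: column parities c0..c4 (eor3) and d0..d4 (rax1) are computed
  //          and xor'ed into every lane;
  //   rho/pi: the per-lane rotations and the lane permutation are fused
  //          into the xar instructions (some results are parked in scratch
  //          registers, e.g. a10' in v29, until the chi step writes them
  //          to their final location);
  //   chi:   a[x] ^= ~a[x+1] & a[x+2] along each row (bcax);
  //   iota:  the round constant is xor'ed into lane a0.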
 4274   void keccak_round(Register rscratch1) {
 4275   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4278   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4279   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4280   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4281   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4282   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4283   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4284   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4285 
 4286   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4287   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4288   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4289   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4290   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4291 
 4292   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4293   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4295   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4296   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4297   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4298   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4299   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4300   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4301   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4302   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4303   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4304   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4305   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4306   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4307   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4308   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4309   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4310   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4311   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4312   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4313   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4314   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4315   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4316   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4317 
 4318   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4319   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4320   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4321   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4322   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4323 
 4324   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4325 
 4326   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4327   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4328   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4329   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4330   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4331 
 4332   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4333   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4334   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4335   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4336   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4337 
 4338   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4339   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4340   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4341   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4342   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4343 
 4344   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4345   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4346   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4347   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4348   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4349 
 4350   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4351   }
 4352 
 4353   // Arguments:
 4354   //
 4355   // Inputs:
 4356   //   c_rarg0   - byte[]  source+offset
 4357   //   c_rarg1   - byte[]  SHA.state
 4358   //   c_rarg2   - int     block_size
 4359   //   c_rarg3   - int     offset
 4360   //   c_rarg4   - int     limit
 4361   //
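  //   The block_size argument is the rate in bytes and selects the
  //   variant: 72 (SHA3-512), 104 (SHA3-384), 136 (SHA3-256/SHAKE256),
  //   144 (SHA3-224) or 168 (SHAKE128). Each iteration absorbs block_size
  //   bytes of input by xor-ing them into the low lanes of the state
  //   before running the 24 keccak rounds.
  //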
 4362   address generate_sha3_implCompress(StubId stub_id) {
 4363     bool multi_block;
 4364     switch (stub_id) {
 4365     case StubId::stubgen_sha3_implCompress_id:
 4366       multi_block = false;
 4367       break;
 4368     case StubId::stubgen_sha3_implCompressMB_id:
 4369       multi_block = true;
 4370       break;
 4371     default:
 4372       ShouldNotReachHere();
 4373     }
 4374 
 4375     static const uint64_t round_consts[24] = {
 4376       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4377       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4378       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4379       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4380       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4381       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4382       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4383       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4384     };
 4385 
 4386     __ align(CodeEntryAlignment);
 4387 
 4388     StubCodeMark mark(this, stub_id);
 4389     address start = __ pc();
 4390 
 4391     Register buf           = c_rarg0;
 4392     Register state         = c_rarg1;
 4393     Register block_size    = c_rarg2;
 4394     Register ofs           = c_rarg3;
 4395     Register limit         = c_rarg4;
 4396 
 4397     Label sha3_loop, rounds24_loop;
 4398     Label sha3_512_or_sha3_384, shake128;
 4399 
 4400     __ stpd(v8, v9, __ pre(sp, -64));
 4401     __ stpd(v10, v11, Address(sp, 16));
 4402     __ stpd(v12, v13, Address(sp, 32));
 4403     __ stpd(v14, v15, Address(sp, 48));
 4404 
 4405     // load state
 4406     __ add(rscratch1, state, 32);
 4407     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4408     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4409     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4410     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4411     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4412     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4413     __ ld1(v24, __ T1D, rscratch1);
 4414 
 4415     __ BIND(sha3_loop);
 4416 
 4417     // 24 keccak rounds
 4418     __ movw(rscratch2, 24);
 4419 
 4420     // load round_constants base
 4421     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4422 
 4423     // load input
 4424     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4425     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4426     __ eor(v0, __ T8B, v0, v25);
 4427     __ eor(v1, __ T8B, v1, v26);
 4428     __ eor(v2, __ T8B, v2, v27);
 4429     __ eor(v3, __ T8B, v3, v28);
 4430     __ eor(v4, __ T8B, v4, v29);
 4431     __ eor(v5, __ T8B, v5, v30);
 4432     __ eor(v6, __ T8B, v6, v31);
 4433 
 4434     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4435     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4436 
 4437     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4438     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4439     __ eor(v7, __ T8B, v7, v25);
 4440     __ eor(v8, __ T8B, v8, v26);
 4441     __ eor(v9, __ T8B, v9, v27);
 4442     __ eor(v10, __ T8B, v10, v28);
 4443     __ eor(v11, __ T8B, v11, v29);
 4444     __ eor(v12, __ T8B, v12, v30);
 4445     __ eor(v13, __ T8B, v13, v31);
 4446 
 4447     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4448     __ eor(v14, __ T8B, v14, v25);
 4449     __ eor(v15, __ T8B, v15, v26);
 4450     __ eor(v16, __ T8B, v16, v27);
 4451 
 4452     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4453     __ andw(c_rarg5, block_size, 48);
 4454     __ cbzw(c_rarg5, rounds24_loop);
 4455 
 4456     __ tbnz(block_size, 5, shake128);
 4457     // block_size == 144, bit5 == 0, SHA3-224
 4458     __ ldrd(v28, __ post(buf, 8));
 4459     __ eor(v17, __ T8B, v17, v28);
 4460     __ b(rounds24_loop);
 4461 
 4462     __ BIND(shake128);
 4463     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4464     __ eor(v17, __ T8B, v17, v28);
 4465     __ eor(v18, __ T8B, v18, v29);
 4466     __ eor(v19, __ T8B, v19, v30);
 4467     __ eor(v20, __ T8B, v20, v31);
 4468     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4469 
 4470     __ BIND(sha3_512_or_sha3_384);
 4471     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4472     __ eor(v7, __ T8B, v7, v25);
 4473     __ eor(v8, __ T8B, v8, v26);
 4474     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4475 
 4476     // SHA3-384
 4477     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4478     __ eor(v9,  __ T8B, v9,  v27);
 4479     __ eor(v10, __ T8B, v10, v28);
 4480     __ eor(v11, __ T8B, v11, v29);
 4481     __ eor(v12, __ T8B, v12, v30);
 4482 
 4483     __ BIND(rounds24_loop);
 4484     __ subw(rscratch2, rscratch2, 1);
 4485 
 4486     keccak_round(rscratch1);
 4487 
 4488     __ cbnzw(rscratch2, rounds24_loop);
 4489 
 4490     if (multi_block) {
 4491       __ add(ofs, ofs, block_size);
 4492       __ cmp(ofs, limit);
 4493       __ br(Assembler::LE, sha3_loop);
 4494       __ mov(c_rarg0, ofs); // return ofs
 4495     }
 4496 
 4497     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4498     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4499     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4500     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4501     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4502     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4503     __ st1(v24, __ T1D, state);
 4504 
 4505     // restore callee-saved registers
 4506     __ ldpd(v14, v15, Address(sp, 48));
 4507     __ ldpd(v12, v13, Address(sp, 32));
 4508     __ ldpd(v10, v11, Address(sp, 16));
 4509     __ ldpd(v8, v9, __ post(sp, 64));
 4510 
 4511     __ ret(lr);
 4512 
 4513     return start;
 4514   }
 4515 
 4516   // Inputs:
 4517   //   c_rarg0   - long[]  state0
 4518   //   c_rarg1   - long[]  state1
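  //
  // The two 25-lane states are interleaved into the two 64-bit lanes of
  // v0..v24 (state0 in lane 0, state1 in lane 1), so one pass over the 24
  // keccak rounds permutes both states in parallel.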
 4519   address generate_double_keccak() {
 4520     static const uint64_t round_consts[24] = {
 4521       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4522       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4523       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4524       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4525       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4526       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4527       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4528       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4529     };
 4530 
 4531     // Implements the double_keccak() method of the
    // sun.security.provider.SHA3Parallel class
 4533     __ align(CodeEntryAlignment);
 4534     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4535     address start = __ pc();
 4536     __ enter();
 4537 
 4538     Register state0        = c_rarg0;
 4539     Register state1        = c_rarg1;
 4540 
 4541     Label rounds24_loop;
 4542 
 4543     // save callee-saved registers
 4544     __ stpd(v8, v9, __ pre(sp, -64));
 4545     __ stpd(v10, v11, Address(sp, 16));
 4546     __ stpd(v12, v13, Address(sp, 32));
 4547     __ stpd(v14, v15, Address(sp, 48));
 4548 
 4549     // load states
 4550     __ add(rscratch1, state0, 32);
 4551     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4552     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4553     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4554     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4555     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4556     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4557     __ ld1(v24, __ D, 0, rscratch1);
 4558     __ add(rscratch1, state1, 32);
 4559     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4560     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4561     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4562     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4563     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4564     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4565     __ ld1(v24, __ D, 1, rscratch1);
 4566 
 4567     // 24 keccak rounds
 4568     __ movw(rscratch2, 24);
 4569 
 4570     // load round_constants base
 4571     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4572 
 4573     __ BIND(rounds24_loop);
 4574     __ subw(rscratch2, rscratch2, 1);
 4575     keccak_round(rscratch1);
 4576     __ cbnzw(rscratch2, rounds24_loop);
 4577 
 4578     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4579     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4580     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4581     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4582     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4583     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4584     __ st1(v24, __ D, 0, state0);
 4585     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4586     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4587     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4588     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4589     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4590     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4591     __ st1(v24, __ D, 1, state1);
 4592 
 4593     // restore callee-saved vector registers
 4594     __ ldpd(v14, v15, Address(sp, 48));
 4595     __ ldpd(v12, v13, Address(sp, 32));
 4596     __ ldpd(v10, v11, Address(sp, 16));
 4597     __ ldpd(v8, v9, __ post(sp, 64));
 4598 
 4599     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4600     __ mov(r0, zr); // return 0
 4601     __ ret(lr);
 4602 
 4603     return start;
 4604   }
 4605 
 4606   // ChaCha20 block function.  This version parallelizes the 32-bit
 4607   // state elements on each of 16 vectors, producing 4 blocks of
 4608   // keystream at a time.
 4609   //
 4610   // state (int[16]) = c_rarg0
 4611   // keystream (byte[256]) = c_rarg1
 4612   // return - number of bytes of produced keystream (always 256)
 4613   //
 4614   // This implementation takes each 32-bit integer from the state
 4615   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4616   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4617   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4618   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4619   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4620   // operations represented by one QUARTERROUND function, we instead stack all
 4621   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4622   // and then do the same for the second set of 4 quarter rounds.  This removes
 4623   // some latency that would otherwise be incurred by waiting for an add to
 4624   // complete before performing an xor (which depends on the result of the
 4625   // add), etc. An adjustment happens between the first and second groups of 4
 4626   // quarter rounds, but this is done only in the inputs to the macro functions
 4627   // that generate the assembly instructions - these adjustments themselves are
 4628   // not part of the resulting assembly.
 4629   // The 4 registers v0-v3 are used during the quarter round operations as
 4630   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4631   // registers become the vectors involved in adding the start state back onto
 4632   // the post-QR working state.  After the adds are complete, each of the 16
 4633   // vectors write their first lane back to the keystream buffer, followed
 4634   // by the second lane from all vectors and so on.
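  //
  // For reference, this is the RFC 7539 quarter round being scheduled;
  // each cc20_qr_* call below applies one of these steps to four
  // (a, b, c, d) column or diagonal tuples at once:
  //
  //   a += b;  d ^= a;  d <<<= 16;
  //   c += d;  b ^= c;  b <<<= 12;
  //   a += b;  d ^= a;  d <<<=  8;
  //   c += d;  b ^= c;  b <<<=  7;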
 4635   address generate_chacha20Block_blockpar() {
 4636     Label L_twoRounds, L_cc20_const;
 4637     __ align(CodeEntryAlignment);
 4638     StubId stub_id = StubId::stubgen_chacha20Block_id;
 4639     StubCodeMark mark(this, stub_id);
 4640     address start = __ pc();
 4641     __ enter();
 4642 
 4643     int i, j;
 4644     const Register state = c_rarg0;
 4645     const Register keystream = c_rarg1;
 4646     const Register loopCtr = r10;
 4647     const Register tmpAddr = r11;
 4648     const FloatRegister ctrAddOverlay = v28;
 4649     const FloatRegister lrot8Tbl = v29;
 4650 
 4651     // Organize SIMD registers in an array that facilitates
 4652     // putting repetitive opcodes into loop structures.  It is
 4653     // important that each grouping of 4 registers is monotonically
 4654     // increasing to support the requirements of multi-register
 4655     // instructions (e.g. ld4r, st4, etc.)
 4656     const FloatRegister workSt[16] = {
 4657          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4658         v20, v21, v22, v23, v24, v25, v26, v27
 4659     };
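    // Note: workSt skips v8..v15, whose low halves are callee-saved under
    // the AAPCS64; keeping to v4..v7/v16..v27 (with v0..v3 as scratch)
    // presumably lets this stub avoid the save/restore sequence that the
    // SHA/Keccak stubs above need.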
 4660 
 4661     // Pull in constant data.  The first 16 bytes are the add overlay
 4662     // which is applied to the vector holding the counter (state[12]).
 4663     // The second 16 bytes is the index register for the 8-bit left
 4664     // rotation tbl instruction.
 4665     __ adr(tmpAddr, L_cc20_const);
 4666     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4667 
 4668     // Load from memory and interlace across 16 SIMD registers,
    // with each word from memory being broadcast to all lanes of
    // each successive SIMD register.
    //      Addr(0) -> All lanes in workSt[i]
    //      Addr(4) -> All lanes in workSt[i + 1], etc.
 4673     __ mov(tmpAddr, state);
 4674     for (i = 0; i < 16; i += 4) {
 4675       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4676           __ post(tmpAddr, 16));
 4677     }
 4678     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4679 
 4680     // Before entering the loop, create 5 4-register arrays.  These
 4681     // will hold the 4 registers that represent the a/b/c/d fields
 4682     // in the quarter round operation.  For instance the "b" field
 4683     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4684     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4685     // since it is part of a diagonal organization.  The aSet and scratch
 4686     // register sets are defined at declaration time because they do not change
 4687     // organization at any point during the 20-round processing.
 4688     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4689     FloatRegister bSet[4];
 4690     FloatRegister cSet[4];
 4691     FloatRegister dSet[4];
 4692     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4693 
 4694     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4695     __ mov(loopCtr, 10);
 4696     __ BIND(L_twoRounds);
 4697 
 4698     // Set to columnar organization and do the following 4 quarter-rounds:
 4699     // QUARTERROUND(0, 4, 8, 12)
 4700     // QUARTERROUND(1, 5, 9, 13)
 4701     // QUARTERROUND(2, 6, 10, 14)
 4702     // QUARTERROUND(3, 7, 11, 15)
 4703     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4704     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4705     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4706 
 4707     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4708     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4709     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4710 
 4711     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4712     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4713     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4714 
 4715     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4716     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4717     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4718 
 4719     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4720     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4722 
 4723     // Set to diagonal organization and do the next 4 quarter-rounds:
 4724     // QUARTERROUND(0, 5, 10, 15)
 4725     // QUARTERROUND(1, 6, 11, 12)
 4726     // QUARTERROUND(2, 7, 8, 13)
 4727     // QUARTERROUND(3, 4, 9, 14)
 4728     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4729     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4730     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4731 
 4732     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4733     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4734     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4735 
 4736     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4737     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4738     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4739 
 4740     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4741     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4742     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4743 
 4744     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4745     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4747 
 4748     // Decrement and iterate
 4749     __ sub(loopCtr, loopCtr, 1);
 4750     __ cbnz(loopCtr, L_twoRounds);
 4751 
 4752     __ mov(tmpAddr, state);
 4753 
 4754     // Add the starting state back to the post-loop keystream
 4755     // state.  We read/interlace the state array from memory into
 4756     // 4 registers similar to what we did in the beginning.  Then
 4757     // add the counter overlay onto workSt[12] at the end.
 4758     for (i = 0; i < 16; i += 4) {
 4759       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4760       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4761       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4762       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4763       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4764     }
 4765     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4766 
 4767     // Write working state into the keystream buffer.  This is accomplished
 4768     // by taking the lane "i" from each of the four vectors and writing
 4769     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4770     // repeating with the next 4 vectors until all 16 vectors have been used.
 4771     // Then move to the next lane and repeat the process until all lanes have
 4772     // been written.
 4773     for (i = 0; i < 4; i++) {
 4774       for (j = 0; j < 16; j += 4) {
 4775         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4776             __ post(keystream, 16));
 4777       }
 4778     }
 4779 
 4780     __ mov(r0, 256);             // Return length of output keystream
 4781     __ leave();
 4782     __ ret(lr);
 4783 
 4784     // bind label and generate local constant data used by this stub
 4785     // The constant data is broken into two 128-bit segments to be loaded
 4786     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4787     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
    // The second 128 bits are a table constant used for 8-bit left rotations.
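    // The rotate-by-8 table is an index vector for tbl: within every
    // 32-bit lane, output byte i is taken from input byte (i + 3) mod 4,
    // which for little-endian lanes is exactly a left rotation by 8 bits.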
 4789     __ BIND(L_cc20_const);
 4790     __ emit_int64(0x0000000100000000UL);
 4791     __ emit_int64(0x0000000300000002UL);
 4792     __ emit_int64(0x0605040702010003UL);
 4793     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4794 
 4795     return start;
 4796   }
 4797 
 4798   // Helpers to schedule parallel operation bundles across vector
 4799   // register sequences of size 2, 4 or 8.
 4800 
 4801   // Implement various primitive computations across vector sequences
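  // A VSeq<N> names N vector registers starting at a given base and spaced
  // by a fixed delta; the sequences used below are contiguous (delta 1),
  // e.g. VSeq<8> vs1(0) covers v0..v7 and VSeq<2> vq(30) covers v30/v31,
  // while a delta of 0 gives a "constant" sequence that names one register
  // N times. As an illustrative (hypothetical) use:
  //
  //   VSeq<4> va(0), vb(4), vc(8);      // v0..v3, v4..v7, v8..v11
  //   vs_addv(va, __ T8H, vb, vc);      // emits 4 independent addv ops
  //
  // Each helper simply unrolls one instruction per sequence slot, leaving
  // the hardware free to execute the N copies in parallel.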
 4802 
 4803   template<int N>
 4804   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4805                const VSeq<N>& v1, const VSeq<N>& v2) {
 4806     // output must not be constant
 4807     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4808     // output cannot overwrite pending inputs
 4809     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4810     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4811     for (int i = 0; i < N; i++) {
 4812       __ addv(v[i], T, v1[i], v2[i]);
 4813     }
 4814   }
 4815 
 4816   template<int N>
 4817   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4818                const VSeq<N>& v1, const VSeq<N>& v2) {
 4819     // output must not be constant
 4820     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4821     // output cannot overwrite pending inputs
 4822     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4823     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4824     for (int i = 0; i < N; i++) {
 4825       __ subv(v[i], T, v1[i], v2[i]);
 4826     }
 4827   }
 4828 
 4829   template<int N>
 4830   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4831                const VSeq<N>& v1, const VSeq<N>& v2) {
 4832     // output must not be constant
 4833     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4834     // output cannot overwrite pending inputs
 4835     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4836     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4837     for (int i = 0; i < N; i++) {
 4838       __ mulv(v[i], T, v1[i], v2[i]);
 4839     }
 4840   }
 4841 
 4842   template<int N>
 4843   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4844     // output must not be constant
 4845     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4846     // output cannot overwrite pending inputs
 4847     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4848     for (int i = 0; i < N; i++) {
 4849       __ negr(v[i], T, v1[i]);
 4850     }
 4851   }
 4852 
 4853   template<int N>
 4854   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4855                const VSeq<N>& v1, int shift) {
 4856     // output must not be constant
 4857     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4858     // output cannot overwrite pending inputs
 4859     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4860     for (int i = 0; i < N; i++) {
 4861       __ sshr(v[i], T, v1[i], shift);
 4862     }
 4863   }
 4864 
 4865   template<int N>
 4866   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4867     // output must not be constant
 4868     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4869     // output cannot overwrite pending inputs
 4870     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4871     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4872     for (int i = 0; i < N; i++) {
 4873       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4874     }
 4875   }
 4876 
 4877   template<int N>
 4878   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4879     // output must not be constant
 4880     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4881     // output cannot overwrite pending inputs
 4882     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4883     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4884     for (int i = 0; i < N; i++) {
 4885       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4886     }
 4887   }
 4888 
 4889   template<int N>
 4890   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4891     // output must not be constant
 4892     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4893     // output cannot overwrite pending inputs
 4894     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4895     for (int i = 0; i < N; i++) {
 4896       __ notr(v[i], __ T16B, v1[i]);
 4897     }
 4898   }
 4899 
 4900   template<int N>
 4901   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4902     // output must not be constant
 4903     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4904     // output cannot overwrite pending inputs
 4905     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4906     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4907     for (int i = 0; i < N; i++) {
 4908       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4909     }
 4910   }
 4911 
 4912   template<int N>
 4913   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4914     // output must not be constant
 4915     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4916     // output cannot overwrite pending inputs
 4917     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4918     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4919     for (int i = 0; i < N; i++) {
 4920       __ mlsv(v[i], T, v1[i], v2[i]);
 4921     }
 4922   }
 4923 
 4924   // load N/2 successive pairs of quadword values from memory in order
 4925   // into N successive vector registers of the sequence via the
 4926   // address supplied in base.
 4927   template<int N>
 4928   void vs_ldpq(const VSeq<N>& v, Register base) {
 4929     for (int i = 0; i < N; i += 2) {
 4930       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4931     }
 4932   }
 4933 
 4934   // load N/2 successive pairs of quadword values from memory in order
 4935   // into N vector registers of the sequence via the address supplied
 4936   // in base using post-increment addressing
 4937   template<int N>
 4938   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4939     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4940     for (int i = 0; i < N; i += 2) {
 4941       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4942     }
 4943   }
 4944 
 4945   // store N successive vector registers of the sequence into N/2
 4946   // successive pairs of quadword memory locations via the address
 4947   // supplied in base using post-increment addressing
 4948   template<int N>
 4949   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4950     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4951     for (int i = 0; i < N; i += 2) {
 4952       __ stpq(v[i], v[i+1], __ post(base, 32));
 4953     }
 4954   }
 4955 
 4956   // load N/2 pairs of quadword values from memory de-interleaved into
 4957   // N vector registers 2 at a time via the address supplied in base
 4958   // using post-increment addressing.
 4959   template<int N>
 4960   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4961     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4962     for (int i = 0; i < N; i += 2) {
 4963       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4964     }
 4965   }
 4966 
 4967   // store N vector registers interleaved into N/2 pairs of quadword
 4968   // memory locations via the address supplied in base using
 4969   // post-increment addressing.
 4970   template<int N>
 4971   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4972     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4973     for (int i = 0; i < N; i += 2) {
 4974       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4975     }
 4976   }
 4977 
 4978   // load N quadword values from memory de-interleaved into N vector
 4979   // registers 3 elements at a time via the address supplied in base.
 4980   template<int N>
 4981   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4982     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4983     for (int i = 0; i < N; i += 3) {
 4984       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4985     }
 4986   }
 4987 
 4988   // load N quadword values from memory de-interleaved into N vector
 4989   // registers 3 elements at a time via the address supplied in base
 4990   // using post-increment addressing.
 4991   template<int N>
 4992   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4993     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4994     for (int i = 0; i < N; i += 3) {
 4995       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4996     }
 4997   }
 4998 
 4999   // load N/2 pairs of quadword values from memory into N vector
 5000   // registers via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
 5002   // offsets array
 5003   template<int N>
 5004   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 5005     for (int i = 0; i < N/2; i++) {
 5006       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5007     }
 5008   }
 5009 
 5010   // store N vector registers into N/2 pairs of quadword memory
 5011   // locations via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
 5013   // offsets array
 5014   template<int N>
 5015   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 5016     for (int i = 0; i < N/2; i++) {
 5017       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5018     }
 5019   }
 5020 
 5021   // load N single quadword values from memory into N vector registers
 5022   // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
 5024   // array
 5025   template<int N>
 5026   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5027                       int start, int (&offsets)[N]) {
 5028     for (int i = 0; i < N; i++) {
 5029       __ ldr(v[i], T, Address(base, start + offsets[i]));
 5030     }
 5031   }
 5032 
 5033   // store N vector registers into N single quadword memory locations
 5034   // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
 5036   // array
 5037   template<int N>
 5038   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5039                       int start, int (&offsets)[N]) {
 5040     for (int i = 0; i < N; i++) {
 5041       __ str(v[i], T, Address(base, start + offsets[i]));
 5042     }
 5043   }
 5044 
 5045   // load N/2 pairs of quadword values from memory de-interleaved into
 5046   // N vector registers 2 at a time via the address supplied in base
  // with each pair indexed using the start offset plus the
 5048   // corresponding entry in the offsets array
 5049   template<int N>
 5050   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5051                       Register tmp, int start, int (&offsets)[N/2]) {
 5052     for (int i = 0; i < N/2; i++) {
 5053       __ add(tmp, base, start + offsets[i]);
 5054       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5055     }
 5056   }
 5057 
 5058   // store N vector registers 2 at a time interleaved into N/2 pairs
 5059   // of quadword memory locations via the address supplied in base
  // with each pair indexed using the start offset plus the
 5061   // corresponding entry in the offsets array
 5062   template<int N>
 5063   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5064                       Register tmp, int start, int (&offsets)[N/2]) {
 5065     for (int i = 0; i < N/2; i++) {
 5066       __ add(tmp, base, start + offsets[i]);
 5067       __ st2(v[2*i], v[2*i+1], T, tmp);
 5068     }
 5069   }
 5070 
 5071   // Helper routines for various flavours of Montgomery multiply
 5072 
  // Perform 16 32-bit (4x4S) or 32 16-bit (4x8H) Montgomery
 5074   // multiplications in parallel
 5075   //
 5076 
 5077   // See the montMul() method of the sun.security.provider.ML_DSA
 5078   // class.
 5079   //
  // Computes 4x4S results or 4x8H results
 5081   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5082   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5083   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5084   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5085   // Outputs: va - 4x4S or 4x8H vector register sequences
 5086   // vb, vc, vtmp and vq must all be disjoint
 5087   // va must be disjoint from all other inputs/temps or must equal vc
 5088   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5089   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
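  //
  // A sketch of the arithmetic, following the per-instruction comments in
  // the loops below (R = 2^MONT_R_BITS, q = MONT_Q, qinv = MONT_Q_INV_MOD_R):
  //
  //   aHigh = hi(2 * b * c)      // sqdmulh
  //   aLow  = lo(b * c)          // mulv
  //   m     = lo(aLow * qinv)    // mulv
  //   n     = hi(2 * m * q)      // sqdmulh
  //   a     = (aHigh - n) / 2    // shsubv
  //
  // With qinv chosen so that m * q == b * c (mod R), the low halves cancel
  // and (aHigh - n) / 2 is congruent to (b * c - m * q) / R mod q, i.e. a
  // Montgomery-reduced product kept in the signed range by the halving
  // subtraction.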
 5090   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5091                    Assembler::SIMD_Arrangement T,
 5092                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5093     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5094     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5095     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5096     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5097 
 5098     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5099     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5100 
 5101     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5102 
 5103     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5104     assert(vs_disjoint(va, vb), "va and vb overlap");
 5105     assert(vs_disjoint(va, vq), "va and vq overlap");
 5106     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5107     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5108 
 5109     // schedule 4 streams of instructions across the vector sequences
 5110     for (int i = 0; i < 4; i++) {
 5111       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5112       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5113     }
 5114 
 5115     for (int i = 0; i < 4; i++) {
 5116       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5117     }
 5118 
 5119     for (int i = 0; i < 4; i++) {
 5120       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5121     }
 5122 
 5123     for (int i = 0; i < 4; i++) {
 5124       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5125     }
 5126   }
 5127 
  // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
 5129   // multiplications in parallel
 5130   //
 5131 
 5132   // See the montMul() method of the sun.security.provider.ML_DSA
 5133   // class.
 5134   //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
 5141   // vb, vc, vtmp and vq must all be disjoint
 5142   // va must be disjoint from all other inputs/temps or must equal vc
 5143   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5144   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5145   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5146                    Assembler::SIMD_Arrangement T,
 5147                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5148     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5149     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5150     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5151     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5152 
 5153     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5154     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5155 
 5156     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5157 
 5158     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5159     assert(vs_disjoint(va, vb), "va and vb overlap");
 5160     assert(vs_disjoint(va, vq), "va and vq overlap");
 5161     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5162     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5163 
 5164     // schedule 2 streams of instructions across the vector sequences
 5165     for (int i = 0; i < 2; i++) {
 5166       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5167       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5168     }
 5169 
 5170     for (int i = 0; i < 2; i++) {
 5171       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5172     }
 5173 
 5174     for (int i = 0; i < 2; i++) {
 5175       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5176     }
 5177 
 5178     for (int i = 0; i < 2; i++) {
 5179       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5180     }
 5181   }
 5182 
 5183   // Perform 16 16-bit Montgomery multiplications in parallel.
 5184   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5185                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5186     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5187     // It will assert that the register use is valid
 5188     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5189   }
 5190 
 5191   // Perform 32 16-bit Montgomery multiplications in parallel.
 5192   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5193                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5194     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5195     // It will assert that the register use is valid
 5196     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5197   }
 5198 
 5199   // Perform 64 16-bit Montgomery multiplications in parallel.
 5200   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5201                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5202     // Schedule two successive 4x8H multiplies via the montmul helper
 5203     // on the front and back halves of va, vb and vc. The helper will
 5204     // assert that the register use has no overlap conflicts on each
 5205     // individual call but we also need to ensure that the necessary
 5206     // disjoint/equality constraints are met across both calls.
 5207 
 5208     // vb, vc, vtmp and vq must be disjoint. va must either be
 5209     // disjoint from all other registers or equal vc
 5210 
 5211     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5212     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5213     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5214 
 5215     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5216     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5217 
 5218     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5219 
 5220     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5221     assert(vs_disjoint(va, vb), "va and vb overlap");
 5222     assert(vs_disjoint(va, vq), "va and vq overlap");
 5223     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5224 
 5225     // we multiply the front and back halves of each sequence 4 at a
 5226     // time because
 5227     //
 5228     // 1) we are currently only able to get 4-way instruction
 5229     // parallelism at best
 5230     //
 5231     // 2) we need registers for the constants in vq and temporary
 5232     // scratch registers to hold intermediate results so vtmp can only
 5233     // be a VSeq<4> which means we only have 4 scratch slots
 5234 
 5235     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5236     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5237   }
 5238 
 5239   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5240                                const VSeq<4>& vc,
 5241                                const VSeq<4>& vtmp,
 5242                                const VSeq<2>& vq) {
 5243     // compute a = montmul(a1, c)
 5244     kyber_montmul32(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
 5246     vs_subv(va1, __ T8H, va0, vc);
 5247     //    and a0 = a0 + a
 5248     vs_addv(va0, __ T8H, va0, vc);
 5249   }
 5250 
 5251   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5252                                const VSeq<4>& vb,
 5253                                const VSeq<4>& vtmp1,
 5254                                const VSeq<4>& vtmp2,
 5255                                const VSeq<2>& vq) {
 5256     // compute c = a0 - a1
 5257     vs_subv(vtmp1, __ T8H, va0, va1);
 5258     // output a0 = a0 + a1
 5259     vs_addv(va0, __ T8H, va0, va1);
 5260     // output a1 = b montmul c
 5261     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5262   }
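
  // Taken together, the two helpers above are the NTT butterflies: the
  // first (montmul, then sub/add) has the Cooley-Tukey shape used by the
  // forward transform, the second (sub/add, then montmul) the
  // Gentleman-Sande shape typically used by the inverse transform.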
 5263 
 5264   void load64shorts(const VSeq<8>& v, Register shorts) {
 5265     vs_ldpq_post(v, shorts);
 5266   }
 5267 
 5268   void load32shorts(const VSeq<4>& v, Register shorts) {
 5269     vs_ldpq_post(v, shorts);
 5270   }
 5271 
 5272   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5273     vs_stpq_post(v, tmpAddr);
 5274   }
 5275 
 5276   // Kyber NTT function.
 5277   // Implements
 5278   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5279   //
 5280   // coeffs (short[256]) = c_rarg0
 5281   // ntt_zetas (short[256]) = c_rarg1
 5282   address generate_kyberNtt() {
 5283 
 5284     __ align(CodeEntryAlignment);
 5285     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5286     StubCodeMark mark(this, stub_id);
 5287     address start = __ pc();
 5288     __ enter();
 5289 
 5290     const Register coeffs = c_rarg0;
 5291     const Register zetas = c_rarg1;
 5292 
 5293     const Register kyberConsts = r10;
 5294     const Register tmpAddr = r11;
 5295 
 5296     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5297     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5298     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5299 
 5300     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5301     // load the montmul constants
 5302     vs_ldpq(vq, kyberConsts);
 5303 
 5304     // Each level corresponds to an iteration of the outermost loop of the
 5305     // Java method seilerNTT(int[] coeffs). There are some differences
 5306     // from what is done in the seilerNTT() method, though:
    // 1. The computation uses 16-bit signed values; we do not convert them
    // to ints here.
    // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
    // this array for each level, which makes it easier to fill up the vector
    // registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    // multiplications (that way there should not be any overflow during
    // the inverse NTT computation); here we use R = 2^16 so that we can
    // use the 16-bit arithmetic in the vector unit.
 5316     //
 5317     // On each level, we fill up the vector registers in such a way that the
 5318     // array elements that need to be multiplied by the zetas go into one
 5319     // set of vector registers while the corresponding ones that don't need to
 5320     // be multiplied, go into another set.
 5321     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5322     // registers interleaving the steps of 4 identical computations,
 5323     // each done on 8 16-bit values per register.
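           //
           // A rough scalar sketch of the 16-bit signed Montgomery multiply
           // used below (illustrative only; q = 3329 and qinv is assumed to
           // be q^-1 mod 2^16; the vector code performs the same computation
           // 32 lanes at a time):
           //
           //   short montmul(short a, short b) { // returns a * b * 2^-16 mod q
           //     int   prod = (int) a * b;
           //     short m    = (short) (prod * qinv);
           //     return (short) ((prod - (int) m * q) >> 16);
           //   }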
 5324 
  5325     // At levels 0-3 the coefficients that are multiplied by the zetas, and
  5326     // the coefficients they are added to or subtracted from, occur in
  5327     // discrete blocks whose size is some multiple of 32.
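           //
           // Each level then applies the same butterfly to every (j, j+l) pair
           // of coefficients (a scalar sketch, illustrative only):
           //
           //   t             = montmul(zeta, coeffs[j + l]);
           //   coeffs[j + l] = coeffs[j] - t;
           //   coeffs[j]     = coeffs[j] + t;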
 5328 
 5329     // level 0
 5330     __ add(tmpAddr, coeffs, 256);
 5331     load64shorts(vs1, tmpAddr);
 5332     load64shorts(vs2, zetas);
 5333     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5334     __ add(tmpAddr, coeffs, 0);
 5335     load64shorts(vs1, tmpAddr);
 5336     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5337     vs_addv(vs1, __ T8H, vs1, vs2);
 5338     __ add(tmpAddr, coeffs, 0);
 5339     vs_stpq_post(vs1, tmpAddr);
 5340     __ add(tmpAddr, coeffs, 256);
 5341     vs_stpq_post(vs3, tmpAddr);
 5342     // restore montmul constants
 5343     vs_ldpq(vq, kyberConsts);
 5344     load64shorts(vs1, tmpAddr);
 5345     load64shorts(vs2, zetas);
 5346     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5347     __ add(tmpAddr, coeffs, 128);
 5348     load64shorts(vs1, tmpAddr);
 5349     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5350     vs_addv(vs1, __ T8H, vs1, vs2);
 5351     __ add(tmpAddr, coeffs, 128);
 5352     store64shorts(vs1, tmpAddr);
 5353     __ add(tmpAddr, coeffs, 384);
 5354     store64shorts(vs3, tmpAddr);
 5355 
 5356     // level 1
 5357     // restore montmul constants
 5358     vs_ldpq(vq, kyberConsts);
 5359     __ add(tmpAddr, coeffs, 128);
 5360     load64shorts(vs1, tmpAddr);
 5361     load64shorts(vs2, zetas);
 5362     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5363     __ add(tmpAddr, coeffs, 0);
 5364     load64shorts(vs1, tmpAddr);
 5365     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5366     vs_addv(vs1, __ T8H, vs1, vs2);
 5367     __ add(tmpAddr, coeffs, 0);
 5368     store64shorts(vs1, tmpAddr);
 5369     store64shorts(vs3, tmpAddr);
 5370     vs_ldpq(vq, kyberConsts);
 5371     __ add(tmpAddr, coeffs, 384);
 5372     load64shorts(vs1, tmpAddr);
 5373     load64shorts(vs2, zetas);
 5374     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5375     __ add(tmpAddr, coeffs, 256);
 5376     load64shorts(vs1, tmpAddr);
 5377     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5378     vs_addv(vs1, __ T8H, vs1, vs2);
 5379     __ add(tmpAddr, coeffs, 256);
 5380     store64shorts(vs1, tmpAddr);
 5381     store64shorts(vs3, tmpAddr);
 5382 
 5383     // level 2
 5384     vs_ldpq(vq, kyberConsts);
 5385     int offsets1[4] = { 0, 32, 128, 160 };
 5386     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5387     load64shorts(vs2, zetas);
 5388     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5389     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5391     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5392     vs_addv(vs1, __ T8H, vs1, vs2);
 5393     __ add(tmpAddr, coeffs, 0);
 5394     vs_stpq_post(vs_front(vs1), tmpAddr);
 5395     vs_stpq_post(vs_front(vs3), tmpAddr);
 5396     vs_stpq_post(vs_back(vs1), tmpAddr);
 5397     vs_stpq_post(vs_back(vs3), tmpAddr);
 5398     vs_ldpq(vq, kyberConsts);
 5399     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5400     load64shorts(vs2, zetas);
 5401     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5402     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5404     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5405     vs_addv(vs1, __ T8H, vs1, vs2);
 5406     __ add(tmpAddr, coeffs, 256);
 5407     vs_stpq_post(vs_front(vs1), tmpAddr);
 5408     vs_stpq_post(vs_front(vs3), tmpAddr);
 5409     vs_stpq_post(vs_back(vs1), tmpAddr);
 5410     vs_stpq_post(vs_back(vs3), tmpAddr);
 5411 
 5412     // level 3
 5413     vs_ldpq(vq, kyberConsts);
 5414     int offsets2[4] = { 0, 64, 128, 192 };
 5415     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5416     load64shorts(vs2, zetas);
 5417     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5418     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5419     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5420     vs_addv(vs1, __ T8H, vs1, vs2);
 5421     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5422     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5423 
 5424     vs_ldpq(vq, kyberConsts);
 5425     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5426     load64shorts(vs2, zetas);
 5427     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5428     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5429     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5430     vs_addv(vs1, __ T8H, vs1, vs2);
 5431     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5432     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5433 
 5434     // level 4
  5435     // At level 4 coefficients occur in 8 discrete blocks of size 16
  5436     // so they are loaded using an ldr at 8 distinct offsets.
 5437 
 5438     vs_ldpq(vq, kyberConsts);
 5439     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5440     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5441     load64shorts(vs2, zetas);
 5442     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5443     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5444     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5445     vs_addv(vs1, __ T8H, vs1, vs2);
 5446     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5447     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5448 
 5449     vs_ldpq(vq, kyberConsts);
 5450     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5451     load64shorts(vs2, zetas);
 5452     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5453     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5454     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5455     vs_addv(vs1, __ T8H, vs1, vs2);
 5456     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5457     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5458 
 5459     // level 5
  5460     // At level 5 related coefficients occur in discrete blocks of size 8, so
  5461     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5462 
 5463     vs_ldpq(vq, kyberConsts);
 5464     int offsets4[4] = { 0, 32, 64, 96 };
 5465     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5466     load32shorts(vs_front(vs2), zetas);
 5467     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5468     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5469     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5470     load32shorts(vs_front(vs2), zetas);
 5471     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5472     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5473     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5474     load32shorts(vs_front(vs2), zetas);
 5475     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5476     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5477 
 5478     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5479     load32shorts(vs_front(vs2), zetas);
 5480     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5481     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5482 
 5483     // level 6
  5484     // At level 6 related coefficients occur in discrete blocks of size 4, so
  5485     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5486 
 5487     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5488     load32shorts(vs_front(vs2), zetas);
 5489     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5490     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5491     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5493     load32shorts(vs_front(vs2), zetas);
 5494     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5495     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5496 
 5497     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5498     load32shorts(vs_front(vs2), zetas);
 5499     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5500     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5501 
 5502     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5503     load32shorts(vs_front(vs2), zetas);
 5504     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5505     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5506 
 5507     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5508     __ mov(r0, zr); // return 0
 5509     __ ret(lr);
 5510 
 5511     return start;
 5512   }
 5513 
 5514   // Kyber Inverse NTT function
 5515   // Implements
 5516   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5517   //
 5518   // coeffs (short[256]) = c_rarg0
 5519   // ntt_zetas (short[256]) = c_rarg1
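         //
         // A scalar sketch of the inverse butterfly applied at each (j, j+l)
         // index pair (illustrative only; it mirrors kyber_sub_add_montmul32
         // above, assuming the even/odd lanes hold the lower/higher indexed
         // coefficient of each pair):
         //
         //   t             = coeffs[j] - coeffs[j + l];
         //   coeffs[j]     = coeffs[j] + coeffs[j + l];
         //   coeffs[j + l] = montmul(zeta, t);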
 5520   address generate_kyberInverseNtt() {
 5521 
 5522     __ align(CodeEntryAlignment);
 5523     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 5524     StubCodeMark mark(this, stub_id);
 5525     address start = __ pc();
 5526     __ enter();
 5527 
 5528     const Register coeffs = c_rarg0;
 5529     const Register zetas = c_rarg1;
 5530 
 5531     const Register kyberConsts = r10;
 5532     const Register tmpAddr = r11;
 5533     const Register tmpAddr2 = c_rarg2;
 5534 
 5535     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5536     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5537     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5538 
 5539     __ lea(kyberConsts,
 5540              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5541 
 5542     // level 0
  5543     // At level 0 related coefficients occur in discrete blocks of size 4, so
  5544     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5545 
 5546     vs_ldpq(vq, kyberConsts);
 5547     int offsets4[4] = { 0, 32, 64, 96 };
 5548     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5549     load32shorts(vs_front(vs2), zetas);
 5550     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5551                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5552     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5553     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5554     load32shorts(vs_front(vs2), zetas);
 5555     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5556                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5557     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5558     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5559     load32shorts(vs_front(vs2), zetas);
 5560     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5561                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5562     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5563     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5564     load32shorts(vs_front(vs2), zetas);
 5565     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5566                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5567     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5568 
 5569     // level 1
  5570     // At level 1 related coefficients occur in discrete blocks of size 8, so
  5571     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5572 
 5573     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5574     load32shorts(vs_front(vs2), zetas);
 5575     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5576                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5577     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5578     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5579     load32shorts(vs_front(vs2), zetas);
 5580     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5581                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5582     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5583 
 5584     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5585     load32shorts(vs_front(vs2), zetas);
 5586     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5587                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5588     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5589     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5590     load32shorts(vs_front(vs2), zetas);
 5591     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5592                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5593     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5594 
 5595     // level 2
  5596     // At level 2 coefficients occur in 8 discrete blocks of size 16
  5597     // so they are loaded using an ldr at 8 distinct offsets.
 5598 
 5599     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5600     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5601     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5602     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5603     vs_subv(vs1, __ T8H, vs1, vs2);
 5604     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5605     load64shorts(vs2, zetas);
 5606     vs_ldpq(vq, kyberConsts);
 5607     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5608     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5609 
 5610     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5611     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5612     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5613     vs_subv(vs1, __ T8H, vs1, vs2);
 5614     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5615     load64shorts(vs2, zetas);
 5616     vs_ldpq(vq, kyberConsts);
 5617     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5618     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5619 
 5620     // Barrett reduction at indexes where overflow may happen
 5621 
 5622     // load q and the multiplier for the Barrett reduction
 5623     __ add(tmpAddr, kyberConsts, 16);
 5624     vs_ldpq(vq, tmpAddr);
 5625 
 5626     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5627     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5628     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
 5629     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5630     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5631     vs_sshr(vs2, __ T8H, vs2, 11);
 5632     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5633     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5634     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5635     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5636     vs_sshr(vs2, __ T8H, vs2, 11);
 5637     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5638     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5639 
 5640     // level 3
  5641     // From level 3 upwards coefficients occur in discrete blocks whose size is
  5642     // some multiple of 32, so they can be loaded using ldpq and suitable indexes.
 5643 
 5644     int offsets2[4] = { 0, 64, 128, 192 };
 5645     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5646     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5647     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5648     vs_subv(vs1, __ T8H, vs1, vs2);
 5649     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5650     load64shorts(vs2, zetas);
 5651     vs_ldpq(vq, kyberConsts);
 5652     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5653     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5654 
 5655     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5656     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5657     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5658     vs_subv(vs1, __ T8H, vs1, vs2);
 5659     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5660     load64shorts(vs2, zetas);
 5661     vs_ldpq(vq, kyberConsts);
 5662     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5663     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5664 
 5665     // level 4
 5666 
 5667     int offsets1[4] = { 0, 32, 128, 160 };
 5668     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5669     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5670     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5671     vs_subv(vs1, __ T8H, vs1, vs2);
 5672     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5673     load64shorts(vs2, zetas);
 5674     vs_ldpq(vq, kyberConsts);
 5675     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5676     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5677 
 5678     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5679     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5680     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5681     vs_subv(vs1, __ T8H, vs1, vs2);
 5682     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5683     load64shorts(vs2, zetas);
 5684     vs_ldpq(vq, kyberConsts);
 5685     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5686     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5687 
 5688     // level 5
 5689 
 5690     __ add(tmpAddr, coeffs, 0);
 5691     load64shorts(vs1, tmpAddr);
 5692     __ add(tmpAddr, coeffs, 128);
 5693     load64shorts(vs2, tmpAddr);
 5694     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5695     vs_subv(vs1, __ T8H, vs1, vs2);
 5696     __ add(tmpAddr, coeffs, 0);
 5697     store64shorts(vs3, tmpAddr);
 5698     load64shorts(vs2, zetas);
 5699     vs_ldpq(vq, kyberConsts);
 5700     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5701     __ add(tmpAddr, coeffs, 128);
 5702     store64shorts(vs2, tmpAddr);
 5703 
 5704     load64shorts(vs1, tmpAddr);
 5705     __ add(tmpAddr, coeffs, 384);
 5706     load64shorts(vs2, tmpAddr);
 5707     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5708     vs_subv(vs1, __ T8H, vs1, vs2);
 5709     __ add(tmpAddr, coeffs, 256);
 5710     store64shorts(vs3, tmpAddr);
 5711     load64shorts(vs2, zetas);
 5712     vs_ldpq(vq, kyberConsts);
 5713     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5714     __ add(tmpAddr, coeffs, 384);
 5715     store64shorts(vs2, tmpAddr);
 5716 
 5717     // Barrett reduction at indexes where overflow may happen
 5718 
 5719     // load q and the multiplier for the Barrett reduction
 5720     __ add(tmpAddr, kyberConsts, 16);
 5721     vs_ldpq(vq, tmpAddr);
 5722 
 5723     int offsets0[2] = { 0, 256 };
 5724     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5725     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5726     vs_sshr(vs2, __ T8H, vs2, 11);
 5727     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5728     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5729 
 5730     // level 6
 5731 
 5732     __ add(tmpAddr, coeffs, 0);
 5733     load64shorts(vs1, tmpAddr);
 5734     __ add(tmpAddr, coeffs, 256);
 5735     load64shorts(vs2, tmpAddr);
 5736     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5737     vs_subv(vs1, __ T8H, vs1, vs2);
 5738     __ add(tmpAddr, coeffs, 0);
 5739     store64shorts(vs3, tmpAddr);
 5740     load64shorts(vs2, zetas);
 5741     vs_ldpq(vq, kyberConsts);
 5742     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5743     __ add(tmpAddr, coeffs, 256);
 5744     store64shorts(vs2, tmpAddr);
 5745 
 5746     __ add(tmpAddr, coeffs, 128);
 5747     load64shorts(vs1, tmpAddr);
 5748     __ add(tmpAddr, coeffs, 384);
 5749     load64shorts(vs2, tmpAddr);
 5750     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5751     vs_subv(vs1, __ T8H, vs1, vs2);
 5752     __ add(tmpAddr, coeffs, 128);
 5753     store64shorts(vs3, tmpAddr);
 5754     load64shorts(vs2, zetas);
 5755     vs_ldpq(vq, kyberConsts);
 5756     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5757     __ add(tmpAddr, coeffs, 384);
 5758     store64shorts(vs2, tmpAddr);
 5759 
 5760     // multiply by 2^-n
 5761 
 5762     // load toMont(2^-n mod q)
 5763     __ add(tmpAddr, kyberConsts, 48);
 5764     __ ldr(v29, __ Q, tmpAddr);
 5765 
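           // A scalar sketch of the final scaling pass below (illustrative only):
           //   coeffs[i] = montmul(coeffs[i], toMont(2^-n mod q));
           // i.e. each coefficient ends up multiplied by 2^-n mod q.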
 5766     vs_ldpq(vq, kyberConsts);
 5767     __ add(tmpAddr, coeffs, 0);
 5768     load64shorts(vs1, tmpAddr);
 5769     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5770     __ add(tmpAddr, coeffs, 0);
 5771     store64shorts(vs2, tmpAddr);
 5772 
  5773     // now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
 5774     load64shorts(vs1, tmpAddr);
 5775     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5776     __ add(tmpAddr, coeffs, 128);
 5777     store64shorts(vs2, tmpAddr);
 5778 
 5779     // now tmpAddr contains coeffs + 256
 5780     load64shorts(vs1, tmpAddr);
 5781     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5782     __ add(tmpAddr, coeffs, 256);
 5783     store64shorts(vs2, tmpAddr);
 5784 
 5785     // now tmpAddr contains coeffs + 384
 5786     load64shorts(vs1, tmpAddr);
 5787     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5788     __ add(tmpAddr, coeffs, 384);
 5789     store64shorts(vs2, tmpAddr);
 5790 
 5791     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5792     __ mov(r0, zr); // return 0
 5793     __ ret(lr);
 5794 
 5795     return start;
 5796   }
 5797 
 5798   // Kyber multiply polynomials in the NTT domain.
 5799   // Implements
 5800   // static int implKyberNttMult(
 5801   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5802   //
 5803   // result (short[256]) = c_rarg0
 5804   // ntta (short[256]) = c_rarg1
 5805   // nttb (short[256]) = c_rarg2
 5806   // zetas (short[128]) = c_rarg3
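         //
         // A scalar sketch of the per-pair computation (illustrative only):
         // each pair of coefficients is multiplied as an element of
         // Z_q[X]/(X^2 - zeta), i.e.
         //   r0 = montmul(a0, b0) + montmul(montmul(a1, b1), zeta);
         //   r1 = montmul(a0, b1) + montmul(a1, b0);
         // with a final montmul by R^2 mod q applied to both results.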
 5807   address generate_kyberNttMult() {
 5808 
 5809     __ align(CodeEntryAlignment);
 5810     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 5811     StubCodeMark mark(this, stub_id);
 5812     address start = __ pc();
 5813     __ enter();
 5814 
 5815     const Register result = c_rarg0;
 5816     const Register ntta = c_rarg1;
 5817     const Register nttb = c_rarg2;
 5818     const Register zetas = c_rarg3;
 5819 
 5820     const Register kyberConsts = r10;
 5821     const Register limit = r11;
 5822 
 5823     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 5824     VSeq<4> vs3(16), vs4(20);
 5825     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5826     VSeq<2> vz(28);          // pair of zetas
 5827     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5828 
 5829     __ lea(kyberConsts,
 5830              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5831 
 5832     Label kyberNttMult_loop;
 5833 
 5834     __ add(limit, result, 512);
 5835 
 5836     // load q and qinv
 5837     vs_ldpq(vq, kyberConsts);
 5838 
 5839     // load R^2 mod q (to convert back from Montgomery representation)
 5840     __ add(kyberConsts, kyberConsts, 64);
 5841     __ ldr(v27, __ Q, kyberConsts);
 5842 
 5843     __ BIND(kyberNttMult_loop);
 5844 
 5845     // load 16 zetas
 5846     vs_ldpq_post(vz, zetas);
 5847 
 5848     // load 2 sets of 32 coefficients from the two input arrays
 5849     // interleaved as shorts. i.e. pairs of shorts adjacent in memory
 5850     // are striped across pairs of vector registers
 5851     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5852     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5853     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5854     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 5855 
 5856     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5857     // i.e. montmul the first and second halves of vs1 in order and
 5858     // then with one sequence reversed storing the two results in vs3
 5859     //
 5860     // vs3[0] <- montmul(a0, b0)
 5861     // vs3[1] <- montmul(a1, b1)
 5862     // vs3[2] <- montmul(a0, b1)
 5863     // vs3[3] <- montmul(a1, b0)
 5864     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5865     kyber_montmul16(vs_back(vs3),
 5866                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5867 
 5868     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5869     // i.e. montmul the first and second halves of vs4 in order and
 5870     // then with one sequence reversed storing the two results in vs1
 5871     //
 5872     // vs1[0] <- montmul(a2, b2)
 5873     // vs1[1] <- montmul(a3, b3)
 5874     // vs1[2] <- montmul(a2, b3)
 5875     // vs1[3] <- montmul(a3, b2)
 5876     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5877     kyber_montmul16(vs_back(vs1),
 5878                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5879 
  5880     // montmul the second result of each cross-product, i.e. (a1*b1, a3*b3), by a zeta.
 5881     // We can schedule two montmuls at a time if we use a suitable vector
 5882     // sequence <vs3[1], vs1[1]>.
 5883     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5884     VSeq<2> vs5(vs3[1], delta);
 5885 
 5886     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5887     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5888     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5889 
 5890     // add results in pairs storing in vs3
 5891     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5892     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5893     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5894 
 5895     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5896     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5897     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 5898 
 5899     // vs1 <- montmul(vs3, montRSquareModQ)
 5900     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5901 
 5902     // store back the two pairs of result vectors de-interleaved as 8H elements
 5903     // i.e. storing each pairs of shorts striped across a register pair adjacent
 5904     // in memory
 5905     vs_st2_post(vs1, __ T8H, result);
 5906 
 5907     __ cmp(result, limit);
 5908     __ br(Assembler::NE, kyberNttMult_loop);
 5909 
 5910     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5911     __ mov(r0, zr); // return 0
 5912     __ ret(lr);
 5913 
 5914     return start;
 5915   }
 5916 
 5917   // Kyber add 2 polynomials.
 5918   // Implements
 5919   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5920   //
 5921   // result (short[256]) = c_rarg0
 5922   // a (short[256]) = c_rarg1
 5923   // b (short[256]) = c_rarg2
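         //
         // A scalar sketch of the per-element computation (illustrative only;
         // the constant q is loaded from the kyberConsts table below):
         //   result[i] = (short) (a[i] + b[i] + q);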
 5924   address generate_kyberAddPoly_2() {
 5925 
 5926     __ align(CodeEntryAlignment);
 5927     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 5928     StubCodeMark mark(this, stub_id);
 5929     address start = __ pc();
 5930     __ enter();
 5931 
 5932     const Register result = c_rarg0;
 5933     const Register a = c_rarg1;
 5934     const Register b = c_rarg2;
 5935 
 5936     const Register kyberConsts = r11;
 5937 
 5938     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5939     // So, we can load, add and store the data in 3 groups of 11,
 5940     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5941     // registers. A further constraint is that the mapping needs
 5942     // to skip callee saves. So, we allocate the register
 5943     // sequences using two 8 sequences, two 2 sequences and two
 5944     // single registers.
 5945     VSeq<8> vs1_1(0);
 5946     VSeq<2> vs1_2(16);
 5947     FloatRegister vs1_3 = v28;
 5948     VSeq<8> vs2_1(18);
 5949     VSeq<2> vs2_2(26);
 5950     FloatRegister vs2_3 = v29;
 5951 
 5952     // two constant vector sequences
 5953     VSeq<8> vc_1(31, 0);
 5954     VSeq<2> vc_2(31, 0);
 5955 
 5956     FloatRegister vc_3 = v31;
 5957     __ lea(kyberConsts,
 5958              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5959 
 5960     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5961     for (int i = 0; i < 3; i++) {
 5962       // load 80 or 88 values from a into vs1_1/2/3
 5963       vs_ldpq_post(vs1_1, a);
 5964       vs_ldpq_post(vs1_2, a);
 5965       if (i < 2) {
 5966         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5967       }
 5968       // load 80 or 88 values from b into vs2_1/2/3
 5969       vs_ldpq_post(vs2_1, b);
 5970       vs_ldpq_post(vs2_2, b);
 5971       if (i < 2) {
 5972         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5973       }
 5974       // sum 80 or 88 values across vs1 and vs2 into vs1
 5975       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5976       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5977       if (i < 2) {
 5978         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5979       }
 5980       // add constant to all 80 or 88 results
 5981       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5982       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5983       if (i < 2) {
 5984         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5985       }
 5986       // store 80 or 88 values
 5987       vs_stpq_post(vs1_1, result);
 5988       vs_stpq_post(vs1_2, result);
 5989       if (i < 2) {
 5990         __ str(vs1_3, __ Q, __ post(result, 16));
 5991       }
 5992     }
 5993 
 5994     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5995     __ mov(r0, zr); // return 0
 5996     __ ret(lr);
 5997 
 5998     return start;
 5999   }
 6000 
 6001   // Kyber add 3 polynomials.
 6002   // Implements
 6003   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 6004   //
 6005   // result (short[256]) = c_rarg0
 6006   // a (short[256]) = c_rarg1
 6007   // b (short[256]) = c_rarg2
 6008   // c (short[256]) = c_rarg3
 6009   address generate_kyberAddPoly_3() {
 6010 
 6011     __ align(CodeEntryAlignment);
 6012     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 6013     StubCodeMark mark(this, stub_id);
 6014     address start = __ pc();
 6015     __ enter();
 6016 
 6017     const Register result = c_rarg0;
 6018     const Register a = c_rarg1;
 6019     const Register b = c_rarg2;
 6020     const Register c = c_rarg3;
 6021 
 6022     const Register kyberConsts = r11;
 6023 
 6024     // As above we sum 256 sets of values in total i.e. 32 x 8H
 6025     // quadwords.  So, we can load, add and store the data in 3
 6026     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6027     // of 10 or 11 registers. A further constraint is that the
 6028     // mapping needs to skip callee saves. So, we allocate the
 6029     // register sequences using two 8 sequences, two 2 sequences
 6030     // and two single registers.
 6031     VSeq<8> vs1_1(0);
 6032     VSeq<2> vs1_2(16);
 6033     FloatRegister vs1_3 = v28;
 6034     VSeq<8> vs2_1(18);
 6035     VSeq<2> vs2_2(26);
 6036     FloatRegister vs2_3 = v29;
 6037 
 6038     // two constant vector sequences
 6039     VSeq<8> vc_1(31, 0);
 6040     VSeq<2> vc_2(31, 0);
 6041 
 6042     FloatRegister vc_3 = v31;
 6043 
 6044     __ lea(kyberConsts,
 6045              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6046 
 6047     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6048     for (int i = 0; i < 3; i++) {
 6049       // load 80 or 88 values from a into vs1_1/2/3
 6050       vs_ldpq_post(vs1_1, a);
 6051       vs_ldpq_post(vs1_2, a);
 6052       if (i < 2) {
 6053         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6054       }
 6055       // load 80 or 88 values from b into vs2_1/2/3
 6056       vs_ldpq_post(vs2_1, b);
 6057       vs_ldpq_post(vs2_2, b);
 6058       if (i < 2) {
 6059         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6060       }
 6061       // sum 80 or 88 values across vs1 and vs2 into vs1
 6062       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6063       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6064       if (i < 2) {
 6065         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6066       }
 6067       // load 80 or 88 values from c into vs2_1/2/3
 6068       vs_ldpq_post(vs2_1, c);
 6069       vs_ldpq_post(vs2_2, c);
 6070       if (i < 2) {
 6071         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6072       }
 6073       // sum 80 or 88 values across vs1 and vs2 into vs1
 6074       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6075       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6076       if (i < 2) {
 6077         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6078       }
 6079       // add constant to all 80 or 88 results
 6080       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6081       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6082       if (i < 2) {
 6083         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6084       }
 6085       // store 80 or 88 values
 6086       vs_stpq_post(vs1_1, result);
 6087       vs_stpq_post(vs1_2, result);
 6088       if (i < 2) {
 6089         __ str(vs1_3, __ Q, __ post(result, 16));
 6090       }
 6091     }
 6092 
 6093     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6094     __ mov(r0, zr); // return 0
 6095     __ ret(lr);
 6096 
 6097     return start;
 6098   }
 6099 
 6100   // Kyber parse XOF output to polynomial coefficient candidates
 6101   // or decodePoly(12, ...).
 6102   // Implements
 6103   // static int implKyber12To16(
 6104   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6105   //
 6106   // (parsedLength or (parsedLength - 48) must be divisible by 64.)
 6107   //
 6108   // condensed (byte[]) = c_rarg0
 6109   // condensedIndex = c_rarg1
 6110   // parsed (short[112 or 256]) = c_rarg2
 6111   // parsedLength (112 or 256) = c_rarg3
 6112   address generate_kyber12To16() {
 6113     Label L_F00, L_loop, L_end;
 6114 
 6115     __ align(CodeEntryAlignment);
 6116     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6117     StubCodeMark mark(this, stub_id);
 6118     address start = __ pc();
 6119     __ enter();
 6120 
 6121     const Register condensed = c_rarg0;
 6122     const Register condensedOffs = c_rarg1;
 6123     const Register parsed = c_rarg2;
 6124     const Register parsedLength = c_rarg3;
 6125 
 6126     const Register tmpAddr = r11;
 6127 
 6128     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6129     // quadwords so we need a 6 vector sequence for the inputs.
 6130     // Parsing produces 64 shorts, employing two 8 vector
 6131     // sequences to store and combine the intermediate data.
 6132     VSeq<6> vin(24);
 6133     VSeq<8> va(0), vb(16);
 6134 
 6135     __ adr(tmpAddr, L_F00);
 6136     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6137     __ add(condensed, condensed, condensedOffs);
 6138 
 6139     __ BIND(L_loop);
 6140     // load 96 (6 x 16B) byte values
 6141     vs_ld3_post(vin, __ T16B, condensed);
 6142 
 6143     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6144     // holds 48 (16x3) contiguous bytes from memory striped
 6145     // horizontally across each of the 16 byte lanes. Equivalently,
 6146     // that is 16 pairs of 12-bit integers. Likewise the back half
 6147     // holds the next 48 bytes in the same arrangement.
 6148 
 6149     // Each vector in the front half can also be viewed as a vertical
 6150     // strip across the 16 pairs of 12 bit integers. Each byte in
 6151     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6152     // byte in vin[1] stores the high 4 bits of the first int and the
 6153     // low 4 bits of the second int. Each byte in vin[2] stores the
 6154     // high 8 bits of the second int. Likewise the vectors in second
 6155     // half.
 6156 
 6157     // Converting the data to 16-bit shorts requires first of all
 6158     // expanding each of the 6 x 16B vectors into 6 corresponding
 6159     // pairs of 8H vectors. Mask, shift and add operations on the
 6160     // resulting vector pairs can be used to combine 4 and 8 bit
 6161     // parts of related 8H vector elements.
 6162     //
  6163     // The middle vectors (vin[1] and vin[4]) are actually expanded
  6164     // twice, one copy manipulated to provide the high 4 bits
  6165     // belonging to the first short in a pair and another copy
  6166     // manipulated to provide the low 4 bits belonging to the
  6167     // second short in a pair. This is why the vector sequences va
  6168     // and vb used to hold the expanded 8H elements are of length 8.
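           //
           // A scalar sketch of the unpacking performed for each group of
           // 3 input bytes (illustrative only):
           //
           //   b0 = condensed[3 * i]; b1 = condensed[3 * i + 1]; b2 = condensed[3 * i + 2];
           //   parsed[2 * i]     = (short) ((b0 & 0xff) | ((b1 & 0x0f) << 8));
           //   parsed[2 * i + 1] = (short) (((b1 >> 4) & 0x0f) | ((b2 & 0xff) << 4));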
 6169 
 6170     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6171     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6172     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6173     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6174     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6175     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6176     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6177     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6178 
 6179     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6180     // and vb[4:5]
 6181     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6182     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6183     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6184     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6185     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6186     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6187 
 6188     // shift lo byte of copy 1 of the middle stripe into the high byte
 6189     __ shl(va[2], __ T8H, va[2], 8);
 6190     __ shl(va[3], __ T8H, va[3], 8);
 6191     __ shl(vb[2], __ T8H, vb[2], 8);
 6192     __ shl(vb[3], __ T8H, vb[3], 8);
 6193 
 6194     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6195     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6196     // are in bit positions [4..11].
 6197     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6198     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6199     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6200     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6201 
 6202     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6203     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6204     // copy2
 6205     __ andr(va[2], __ T16B, va[2], v31);
 6206     __ andr(va[3], __ T16B, va[3], v31);
 6207     __ ushr(va[4], __ T8H, va[4], 4);
 6208     __ ushr(va[5], __ T8H, va[5], 4);
 6209     __ andr(vb[2], __ T16B, vb[2], v31);
 6210     __ andr(vb[3], __ T16B, vb[3], v31);
 6211     __ ushr(vb[4], __ T8H, vb[4], 4);
 6212     __ ushr(vb[5], __ T8H, vb[5], 4);
 6213 
 6214     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6215     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6216     // n.b. the ordering ensures: i) inputs are consumed before they
 6217     // are overwritten ii) the order of 16-bit results across successive
 6218     // pairs of vectors in va and then vb reflects the order of the
 6219     // corresponding 12-bit inputs
 6220     __ addv(va[0], __ T8H, va[0], va[2]);
 6221     __ addv(va[2], __ T8H, va[1], va[3]);
 6222     __ addv(va[1], __ T8H, va[4], va[6]);
 6223     __ addv(va[3], __ T8H, va[5], va[7]);
 6224     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6225     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6226     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6227     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6228 
 6229     // store 64 results interleaved as shorts
 6230     vs_st2_post(vs_front(va), __ T8H, parsed);
 6231     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6232 
 6233     __ sub(parsedLength, parsedLength, 64);
 6234     __ cmp(parsedLength, (u1)64);
 6235     __ br(Assembler::GE, L_loop);
 6236     __ cbz(parsedLength, L_end);
 6237 
  6238     // If anything is left it should be a final 72 bytes of input,
  6239     // i.e. a final 48 12-bit values. So we handle this by loading
  6240     // 48 bytes into all 16B lanes of front(vin) and only 24
  6241     // bytes into the lower 8B lanes of back(vin).
 6242     vs_ld3_post(vs_front(vin), __ T16B, condensed);
 6243     vs_ld3(vs_back(vin), __ T8B, condensed);
 6244 
 6245     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6246     // n.b. target elements 2 and 3 of va duplicate elements 4 and
 6247     // 5 and target element 2 of vb duplicates element 4.
 6248     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6249     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6250     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6251     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6252     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6253     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6254 
 6255     // This time expand just the lower 8 lanes
 6256     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6257     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6258     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6259 
 6260     // shift lo byte of copy 1 of the middle stripe into the high byte
 6261     __ shl(va[2], __ T8H, va[2], 8);
 6262     __ shl(va[3], __ T8H, va[3], 8);
 6263     __ shl(vb[2], __ T8H, vb[2], 8);
 6264 
 6265     // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
 6266     // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
 6267     // int are in bit positions [4..11].
 6268     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6269     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6270     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6271 
 6272     // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
 6273     // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
 6274     // copy2
 6275     __ andr(va[2], __ T16B, va[2], v31);
 6276     __ andr(va[3], __ T16B, va[3], v31);
 6277     __ ushr(va[4], __ T8H, va[4], 4);
 6278     __ ushr(va[5], __ T8H, va[5], 4);
 6279     __ andr(vb[2], __ T16B, vb[2], v31);
 6280     __ ushr(vb[4], __ T8H, vb[4], 4);
 6281 
 6284     // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
 6285     // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
 6286 
 6287     // n.b. ordering ensures: i) inputs are consumed before they are
  6288     // overwritten ii) order of 16-bit results across successive
 6289     // pairs of vectors in va and then lower half of vb reflects order
 6290     // of corresponding 12-bit inputs
 6291     __ addv(va[0], __ T8H, va[0], va[2]);
 6292     __ addv(va[2], __ T8H, va[1], va[3]);
 6293     __ addv(va[1], __ T8H, va[4], va[6]);
 6294     __ addv(va[3], __ T8H, va[5], va[7]);
 6295     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6296     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6297 
 6298     // store 48 results interleaved as shorts
 6299     vs_st2_post(vs_front(va), __ T8H, parsed);
 6300     vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
 6301 
 6302     __ BIND(L_end);
 6303 
 6304     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6305     __ mov(r0, zr); // return 0
 6306     __ ret(lr);
 6307 
 6308     // bind label and generate constant data used by this stub
 6309     __ BIND(L_F00);
 6310     __ emit_int64(0x0f000f000f000f00);
 6311     __ emit_int64(0x0f000f000f000f00);
 6312 
 6313     return start;
 6314   }
 6315 
 6316   // Kyber Barrett reduce function.
 6317   // Implements
 6318   // static int implKyberBarrettReduce(short[] coeffs) {}
 6319   //
 6320   // coeffs (short[256]) = c_rarg0
 6321   address generate_kyberBarrettReduce() {
 6322 
 6323     __ align(CodeEntryAlignment);
 6324     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6325     StubCodeMark mark(this, stub_id);
 6326     address start = __ pc();
 6327     __ enter();
 6328 
 6329     const Register coeffs = c_rarg0;
 6330 
 6331     const Register kyberConsts = r10;
 6332     const Register result = r11;
 6333 
 6334     // As above we process 256 sets of values in total i.e. 32 x
  6335     // 8H quadwords. So, we can load, reduce and store the data in 3
 6336     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6337     // of 10 or 11 registers. A further constraint is that the
 6338     // mapping needs to skip callee saves. So, we allocate the
 6339     // register sequences using two 8 sequences, two 2 sequences
 6340     // and two single registers.
 6341     VSeq<8> vs1_1(0);
 6342     VSeq<2> vs1_2(16);
 6343     FloatRegister vs1_3 = v28;
 6344     VSeq<8> vs2_1(18);
 6345     VSeq<2> vs2_2(26);
 6346     FloatRegister vs2_3 = v29;
 6347 
 6348     // we also need a pair of corresponding constant sequences
 6349 
 6350     VSeq<8> vc1_1(30, 0);
 6351     VSeq<2> vc1_2(30, 0);
 6352     FloatRegister vc1_3 = v30; // for kyber_q
 6353 
 6354     VSeq<8> vc2_1(31, 0);
 6355     VSeq<2> vc2_2(31, 0);
 6356     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6357 
 6358     __ add(result, coeffs, 0);
 6359     __ lea(kyberConsts,
 6360              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6361 
 6362     // load q and the multiplier for the Barrett reduction
 6363     __ add(kyberConsts, kyberConsts, 16);
 6364     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6365 
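           // A scalar sketch of the reduction applied to each lane below
           // (illustrative only; the multiplier is assumed to be
           // approximately 2^26 / q):
           //
           //   short barrett(short a) {
           //     short t = (short) (((int) a * barrettMultiplier) >> 26);
           //     return (short) (a - t * q);
           //   }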
 6366     for (int i = 0; i < 3; i++) {
 6367       // load 80 or 88 coefficients
 6368       vs_ldpq_post(vs1_1, coeffs);
 6369       vs_ldpq_post(vs1_2, coeffs);
 6370       if (i < 2) {
 6371         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6372       }
 6373 
 6374       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6375       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6376       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6377       if (i < 2) {
 6378         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6379       }
 6380 
 6381       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6382       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6383       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6384       if (i < 2) {
 6385         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6386       }
 6387 
 6388       // vs1 <- vs1 - vs2 * kyber_q
 6389       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6390       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6391       if (i < 2) {
 6392         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6393       }
 6394 
 6395       vs_stpq_post(vs1_1, result);
 6396       vs_stpq_post(vs1_2, result);
 6397       if (i < 2) {
 6398         __ str(vs1_3, __ Q, __ post(result, 16));
 6399       }
 6400     }
 6401 
 6402     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6403     __ mov(r0, zr); // return 0
 6404     __ ret(lr);
 6405 
 6406     return start;
 6407   }
 6408 
 6409 
 6410   // Dilithium-specific montmul helper routines that generate parallel
 6411   // code for, respectively, a single 4x4s vector sequence montmul or
 6412   // two such multiplies in a row.
 6413 
 6414   // Perform 16 32-bit Montgomery multiplications in parallel
 6415   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6416                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6417     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6418     // It will assert that the register use is valid
 6419     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6420   }
 6421 
 6422   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6423   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6424                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6425     // Schedule two successive 4x4S multiplies via the montmul helper
 6426     // on the front and back halves of va, vb and vc. The helper will
 6427     // assert that the register use has no overlap conflicts on each
 6428     // individual call but we also need to ensure that the necessary
 6429     // disjoint/equality constraints are met across both calls.
 6430 
 6431     // vb, vc, vtmp and vq must be disjoint. va must either be
 6432     // disjoint from all other registers or equal vc
 6433 
 6434     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6435     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6436     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6437 
 6438     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6439     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6440 
 6441     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6442 
 6443     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6444     assert(vs_disjoint(va, vb), "va and vb overlap");
 6445     assert(vs_disjoint(va, vq), "va and vq overlap");
 6446     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6447 
 6448     // We multiply the front and back halves of each sequence 4 at a
 6449     // time because
 6450     //
 6451     // 1) we are currently only able to get 4-way instruction
 6452     // parallelism at best
 6453     //
 6454     // 2) we need registers for the constants in vq and temporary
 6455     // scratch registers to hold intermediate results so vtmp can only
 6456     // be a VSeq<4> which means we only have 4 scratch slots.
 6457 
 6458     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6459     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6460   }
 6461 
 6462   // Perform combined montmul then add/sub on 4x4S vectors.
 6463   void dilithium_montmul16_sub_add(
 6464           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6465           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6466     // compute a = montmul(a1, c)
 6467     dilithium_montmul16(vc, va1, vc, vtmp, vq);
  6468     // output a1 = a0 - a
 6469     vs_subv(va1, __ T4S, va0, vc);
 6470     //    and a0 = a0 + a
 6471     vs_addv(va0, __ T4S, va0, vc);
 6472   }
 6473 
  6474   // Perform combined add/sub then montmul on 4x4S vectors.
 6475   void dilithium_sub_add_montmul16(
 6476           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6477           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6478     // compute c = a0 - a1
 6479     vs_subv(vtmp1, __ T4S, va0, va1);
 6480     // output a0 = a0 + a1
 6481     vs_addv(va0, __ T4S, va0, va1);
 6482     // output a1 = b montmul c
 6483     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6484   }
 6485 
 6486   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6487   // in the Java implementation come in sequences of at least 8, so we
 6488   // can use ldpq to collect the corresponding data into pairs of vector
 6489   // registers.
 6490   // We collect the coefficients corresponding to the 'j+l' indexes into
 6491   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6492   // then we do the (Montgomery) multiplications by the zetas in parallel
 6493   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6494   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6495   // v0-v7 and finally save the results back to the coeffs array.
 6496   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6497     const Register coeffs, const Register zetas) {
 6498     int c1 = 0;
 6499     int c2 = 512;
 6500     int startIncr;
 6501     // don't use callee save registers v8 - v15
 6502     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6503     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6504     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6505     int offsets[4] = { 0, 32, 64, 96 };
 6506 
 6507     for (int level = 0; level < 5; level++) {
 6508       int c1Start = c1;
 6509       int c2Start = c2;
 6510       if (level == 3) {
 6511         offsets[1] = 32;
 6512         offsets[2] = 128;
 6513         offsets[3] = 160;
 6514       } else if (level == 4) {
 6515         offsets[1] = 64;
 6516         offsets[2] = 128;
 6517         offsets[3] = 192;
 6518       }
 6519 
 6520       // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
 6521       // time at 4 different offsets and multiply them in order by the
 6522       // next set of input values. So we employ indexed load and store
 6523       // pair instructions with arrangement 4S.
 6524       for (int i = 0; i < 4; i++) {
 6525         // reload q and qinv
 6526         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6527         // load 8x4S coefficients via second start pos == c2
 6528         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6529         // load next 8x4S inputs == b
 6530         vs_ldpq_post(vs2, zetas);
 6531         // compute a == c2 * b mod MONT_Q
 6532         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6533         // load 8x4s coefficients via first start pos == c1
 6534         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6535         // compute a1 =  c1 + a
 6536         vs_addv(vs3, __ T4S, vs1, vs2);
 6537         // compute a2 =  c1 - a
 6538         vs_subv(vs1, __ T4S, vs1, vs2);
 6539         // output a1 and a2
 6540         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6541         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6542 
 6543         int k = 4 * level + i;
 6544 
 6545         if (k > 7) {
 6546           startIncr = 256;
 6547         } else if (k == 5) {
 6548           startIncr = 384;
 6549         } else {
 6550           startIncr = 128;
 6551         }
 6552 
 6553         c1Start += startIncr;
 6554         c2Start += startIncr;
 6555       }
 6556 
 6557       c2 /= 2;
 6558     }
 6559   }
 6560 
 6561   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6562   // Implements the method
 6563   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
 6564   // of the Java class sun.security.provider
 6565   //
 6566   // coeffs (int[256]) = c_rarg0
 6567   // zetas (int[256]) = c_rarg1
 6568   address generate_dilithiumAlmostNtt() {
 6569 
 6570     __ align(CodeEntryAlignment);
 6571     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 6572     StubCodeMark mark(this, stub_id);
 6573     address start = __ pc();
 6574     __ enter();
 6575 
 6576     const Register coeffs = c_rarg0;
 6577     const Register zetas = c_rarg1;
 6578 
 6579     const Register tmpAddr = r9;
 6580     const Register dilithiumConsts = r10;
 6581     const Register result = r11;
 6582     // don't use callee save registers v8 - v15
 6583     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6584     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6585     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6586     int offsets[4] = { 0, 32, 64, 96};
 6587     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6588     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6589     __ add(result, coeffs, 0);
 6590     __ lea(dilithiumConsts,
 6591              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6592 
 6593     // Each level represents one iteration of the outer for loop of the Java version.
 6594 
 6595     // level 0-4
 6596     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6597 
 6598     // level 5
 6599 
 6600     // At level 5 the coefficients we need to combine with the zetas
 6601     // are grouped in memory in blocks of size 4. So, for both sets of
 6602     // coefficients we load 4 adjacent values at 8 different offsets
 6603     // using an indexed ldr with register variant Q and multiply them
  6604     // in sequence order by the next set of inputs. Likewise we store
  6605     // the results using an indexed str with register variant Q.
 6606     for (int i = 0; i < 1024; i += 256) {
 6607       // reload constants q, qinv each iteration as they get clobbered later
 6608       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6609       // load 32 (8x4S) coefficients via first offsets = c1
 6610       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6611       // load next 32 (8x4S) inputs = b
 6612       vs_ldpq_post(vs2, zetas);
  6613       // a = b montmul c1
 6614       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6615       // load 32 (8x4S) coefficients via second offsets = c2
 6616       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6617       // add/sub with result of multiply
  6618       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
  6619       vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 6620       // write back new coefficients using same offsets
 6621       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6622       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6623     }
 6624 
 6625     // level 6
 6626     // At level 6 the coefficients we need to combine with the zetas
 6627     // are grouped in memory in pairs, the first two being montmul
 6628     // inputs and the second add/sub inputs. We can still implement
 6629     // the montmul+sub+add using 4-way parallelism but only if we
 6630     // combine the coefficients with the zetas 16 at a time. We load 8
 6631     // adjacent values at 4 different offsets using an ld2 load with
 6632     // arrangement 2D. That interleaves the lower and upper halves of
 6633     // each pair of quadwords into successive vector registers. We
 6634     // then need to montmul the 4 even elements of the coefficients
 6635     // register sequence by the zetas in order and then add/sub the 4
 6636     // odd elements of the coefficients register sequence. We use an
 6637     // equivalent st2 operation to store the results back into memory
 6638     // de-interleaved.
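          // For example, a single ld2 with arrangement 2D over memory
          // doublewords d0, d1, d2, d3 yields {d0, d2} in the first register
          // and {d1, d3} in the second, i.e. even and odd pairs end up in
          // alternate registers of the sequence.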
 6639     for (int i = 0; i < 1024; i += 128) {
 6640       // reload constants q, qinv each iteration as they get clobbered later
 6641       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6642       // load interleaved 16 (4x2D) coefficients via offsets
 6643       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6644       // load next 16 (4x4S) inputs
 6645       vs_ldpq_post(vs_front(vs2), zetas);
 6646       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6647       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6648                                   vs_front(vs2), vtmp, vq);
 6649       // store interleaved 16 (4x2D) coefficients via offsets
 6650       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6651     }
 6652 
 6653     // level 7
 6654     // At level 7 the coefficients we need to combine with the zetas
 6655     // occur singly with montmul inputs alternating with add/sub
 6656     // inputs. Once again we can use 4-way parallelism to combine 16
 6657     // zetas at a time. However, we have to load 8 adjacent values at
 6658     // 4 different offsets using an ld2 load with arrangement 4S. That
 6659     // interleaves the odd words of each pair into one
 6660     // coefficients vector register and the even words of the pair
 6661     // into the next register. We then need to montmul the 4 even
 6662     // elements of the coefficients register sequence by the zetas in
 6663     // order and then add/sub the 4 odd elements of the coefficients
 6664     // register sequence. We use an equivalent st2 operation to store
 6665     // the results back into memory de-interleaved.
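          // For example, a single ld2 with arrangement 4S over memory words
          // w0 .. w7 yields {w0, w2, w4, w6} in the first register and
          // {w1, w3, w5, w7} in the second.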
 6666 
 6667     for (int i = 0; i < 1024; i += 128) {
 6668       // reload constants q, qinv each iteration as they get clobbered later
 6669       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6670       // load interleaved 16 (4x4S) coefficients via offsets
 6671       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6672       // load next 16 (4x4S) inputs
 6673       vs_ldpq_post(vs_front(vs2), zetas);
 6674       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6675       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6676                                   vs_front(vs2), vtmp, vq);
 6677       // store interleaved 16 (4x4S) coefficients via offsets
 6678       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6679     }
 6680     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6681     __ mov(r0, zr); // return 0
 6682     __ ret(lr);
 6683 
 6684     return start;
 6685   }
 6686 
 6687   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6688   // in the Java implementation come in sequences of at least 8, so we
 6689   // can use ldpq to collect the corresponding data into pairs of vector
 6690   // registers.
 6691   // We collect the coefficients that correspond to the 'j's into vs1 and
 6692   // the coefficients that correspond to the 'j+l's into vs2, then do the
 6693   // additions into vs3 and the subtractions into vs1. We then save the
 6694   // result of the additions, load the zetas into vs2, do the (Montgomery)
 6695   // multiplications by zeta in parallel into vs2 and finally save the
 6696   // results back to the coeffs array.
 6697   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6698     const Register coeffs, const Register zetas) {
 6699     int c1 = 0;
 6700     int c2 = 32;
 6701     int startIncr;
 6702     int offsets[4];
 6703     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6704     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6705     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6706 
 6707     offsets[0] = 0;
 6708 
 6709     for (int level = 3; level < 8; level++) {
 6710       int c1Start = c1;
 6711       int c2Start = c2;
 6712       if (level == 3) {
 6713         offsets[1] = 64;
 6714         offsets[2] = 128;
 6715         offsets[3] = 192;
 6716       } else if (level == 4) {
 6717         offsets[1] = 32;
 6718         offsets[2] = 128;
 6719         offsets[3] = 160;
 6720       } else {
 6721         offsets[1] = 32;
 6722         offsets[2] = 64;
 6723         offsets[3] = 96;
 6724       }
 6725 
 6726       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6727       // time at 4 different offsets and multiply them in order by the
 6728       // next set of input values. So we employ indexed load and store
 6729       // pair instructions with arrangement 4S.
 6730       for (int i = 0; i < 4; i++) {
 6731         // load v1 32 (8x4S) coefficients relative to first start index
 6732         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6733         // load v2 32 (8x4S) coefficients relative to second start index
 6734         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 6735         // a0 = v1 + v2 -- n.b. clobbers vqs
 6736         vs_addv(vs3, __ T4S, vs1, vs2);
 6737         // a1 = v1 - v2
 6738         vs_subv(vs1, __ T4S, vs1, vs2);
 6739         // save a0 relative to first start index
 6740         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6741         // load constants q, qinv each iteration as they get clobbered above
 6742         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6743         // load b next 32 (8x4S) inputs
 6744         vs_ldpq_post(vs2, zetas);
 6745         // a = a1 montmul b
 6746         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6747         // save a relative to second start index
 6748         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6749 
 6750         int k = 4 * level + i;
 6751 
 6752         if (k < 24) {
 6753           startIncr = 256;
 6754         } else if (k == 25) {
 6755           startIncr = 384;
 6756         } else {
 6757           startIncr = 128;
 6758         }
 6759 
 6760         c1Start += startIncr;
 6761         c2Start += startIncr;
 6762       }
 6763 
 6764       c2 *= 2;
 6765     }
 6766   }
 6767 
 6768   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6769   // Implements the method
 6770   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6771   // the sun.security.provider.ML_DSA class.
 6772   //
 6773   // coeffs (int[256]) = c_rarg0
 6774   // zetas (int[256]) = c_rarg1
 6775   address generate_dilithiumAlmostInverseNtt() {
 6776 
 6777     __ align(CodeEntryAlignment);
 6778     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 6779     StubCodeMark mark(this, stub_id);
 6780     address start = __ pc();
 6781     __ enter();
 6782 
 6783     const Register coeffs = c_rarg0;
 6784     const Register zetas = c_rarg1;
 6785 
 6786     const Register tmpAddr = r9;
 6787     const Register dilithiumConsts = r10;
 6788     const Register result = r11;
 6789     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6790     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 6791     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6792     int offsets[4] = { 0, 32, 64, 96 };
 6793     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6794     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6795 
 6796     __ add(result, coeffs, 0);
 6797     __ lea(dilithiumConsts,
 6798              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6799 
 6800     // Each level represents one iteration of the outer for loop of the Java version
 6801 
 6802     // level 0
 6803     // At level 0 we need to interleave adjacent quartets of
 6804     // coefficients before we multiply and add/sub by the next 16
 6805     // zetas just as we did for level 7 in the multiply code. So we
 6806     // load and store the values using an ld2/st2 with arrangement 4S.
 6807     for (int i = 0; i < 1024; i += 128) {
 6808       // load constants q, qinv
 6809       // n.b. this can be moved out of the loop as they do not get
 6810       // clobbered by first two loops
 6811       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6812       // a0/a1 load interleaved 32 (8x4S) coefficients
 6813       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6814       // b load next 32 (8x4S) inputs
 6815       vs_ldpq_post(vs_front(vs2), zetas);
 6816       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6817       // n.b. second half of vs2 provides temporary register storage
 6818       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6819                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6820       // a0/a1 store interleaved 32 (8x4S) coefficients
 6821       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6822     }
 6823 
 6824     // level 1
 6825     // At level 1 we need to interleave pairs of adjacent pairs of
 6826     // coefficients before we multiply by the next 16 zetas just as we
 6827     // did for level 6 in the multiply code. So we load and store the
 6828     // values using an ld2/st2 with arrangement 2D.
 6829     for (int i = 0; i < 1024; i += 128) {
 6830       // a0/a1 load interleaved 32 (8x2D) coefficients
 6831       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6832       // b load next 16 (4x4S) inputs
 6833       vs_ldpq_post(vs_front(vs2), zetas);
 6834       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6835       // n.b. second half of vs2 provides temporary register storage
 6836       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6837                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6838       // a0/a1 store interleaved 32 (8x2D) coefficients
 6839       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6840     }
 6841 
 6842     // level 2
 6843     // At level 2 coefficients come in blocks of 4. So, we load 4
 6844     // adjacent coefficients at 8 distinct offsets for both the first
 6845     // and second coefficient sequences, using an ldr with register
 6846     // variant Q then combine them with next set of 32 zetas. Likewise
 6847     // we store the results using an str with register variant Q.
 6848     for (int i = 0; i < 1024; i += 256) {
 6849       // c0 load 32 (8x4S) coefficients via first offsets
 6850       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6851       // c1 load 32 (8x4S) coefficients via second offsets
 6852       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6853       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6854       vs_addv(vs3, __ T4S, vs1, vs2);
 6855       // c = c0 - c1
 6856       vs_subv(vs1, __ T4S, vs1, vs2);
 6857       // store a0 32 (8x4S) coefficients via first offsets
 6858       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6859       // b load 32 (8x4S) next inputs
 6860       vs_ldpq_post(vs2, zetas);
 6861       // reload constants q, qinv -- they were clobbered earlier
 6862       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6863       // compute a1 = b montmul c
 6864       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6865       // store a1 32 (8x4S) coefficients via second offsets
 6866       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6867     }
 6868 
 6869     // level 3-7
 6870     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6871 
 6872     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6873     __ mov(r0, zr); // return 0
 6874     __ ret(lr);
 6875 
 6876     return start;
 6877   }
 6878 
 6879   // Dilithium multiply polynomials in the NTT domain.
 6880   // Straightforward implementation of the method
 6881   // static int implDilithiumNttMult(
 6882   //              int[] result, int[] ntta, int[] nttb) {} of
 6883   // the sun.security.provider.ML_DSA class.
 6884   //
 6885   // result (int[256]) = c_rarg0
 6886   // poly1 (int[256]) = c_rarg1
 6887   // poly2 (int[256]) = c_rarg2
 6888   address generate_dilithiumNttMult() {
 6889 
 6890     __ align(CodeEntryAlignment);
 6891     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 6892     StubCodeMark mark(this, stub_id);
 6893     address start = __ pc();
 6894     __ enter();
 6895 
 6896     Label L_loop;
 6897 
 6898     const Register result = c_rarg0;
 6899     const Register poly1 = c_rarg1;
 6900     const Register poly2 = c_rarg2;
 6901 
 6902     const Register dilithiumConsts = r10;
 6903     const Register len = r11;
 6904 
 6905     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6906     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6907     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6908     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6909 
 6910     __ lea(dilithiumConsts,
 6911              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6912 
 6913     // load constants q, qinv
 6914     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6915     // load constant rSquare into v29
 6916     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
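          // n.b. montMul(x, y) returns x * y * R^-1 mod q for the Montgomery
          // radix R, so the second multiply by the rSquare constant (R^2 mod q)
          // in the loop below cancels the extra R^-1 left by the first:
          //   montMul(montMul(b, c), R^2) = b * c * R^-1 * R^2 * R^-1 = b * c (mod q)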
 6917 
 6918     __ mov(len, zr);
 6919     __ add(len, len, 1024);
 6920 
 6921     __ BIND(L_loop);
 6922 
 6923     // b load 32 (8x4S) next inputs from poly1
 6924     vs_ldpq_post(vs1, poly1);
 6925     // c load 32 (8x4S) next inputs from poly2
 6926     vs_ldpq_post(vs2, poly2);
 6927     // compute a = b montmul c
 6928     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6929     // compute a = rsquare montmul a
 6930     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6931     // save a 32 (8x4S) results
 6932     vs_stpq_post(vs2, result);
 6933 
 6934     __ sub(len, len, 128);
 6935     __ cmp(len, (u1)128);
 6936     __ br(Assembler::GE, L_loop);
 6937 
 6938     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6939     __ mov(r0, zr); // return 0
 6940     __ ret(lr);
 6941 
 6942     return start;
 6943   }
 6944 
 6945   // Dilithium Montgomery multiply an array by a constant.
 6946   // A straightforward implementation of the method
 6947   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
 6948   // of the sun.security.provider.ML_DSA class
 6949   //
 6950   // coeffs (int[256]) = c_rarg0
 6951   // constant (int) = c_rarg1
 6952   address generate_dilithiumMontMulByConstant() {
 6953 
 6954     __ align(CodeEntryAlignment);
 6955     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 6956     StubCodeMark mark(this, stub_id);
 6957     address start = __ pc();
 6958     __ enter();
 6959 
 6960     Label L_loop;
 6961 
 6962     const Register coeffs = c_rarg0;
 6963     const Register constant = c_rarg1;
 6964 
 6965     const Register dilithiumConsts = r10;
 6966     const Register result = r11;
 6967     const Register len = r12;
 6968 
 6969     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6970     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6971     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6972     VSeq<8> vconst(29, 0);             // for montmul by constant
 6973 
 6974     // results track inputs
 6975     __ add(result, coeffs, 0);
 6976     __ lea(dilithiumConsts,
 6977              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6978 
 6979     // load constants q, qinv -- they do not get clobbered by first two loops
 6980     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6981     // copy caller supplied constant across vconst
 6982     __ dup(vconst[0], __ T4S, constant);
 6983     __ mov(len, zr);
 6984     __ add(len, len, 1024);
 6985 
 6986     __ BIND(L_loop);
 6987 
 6988     // load next 32 inputs
 6989     vs_ldpq_post(vs2, coeffs);
 6990     // mont mul by constant
 6991     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6992     // write next 32 results
 6993     vs_stpq_post(vs2, result);
 6994 
 6995     __ sub(len, len, 128);
 6996     __ cmp(len, (u1)128);
 6997     __ br(Assembler::GE, L_loop);
 6998 
 6999     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7000     __ mov(r0, zr); // return 0
 7001     __ ret(lr);
 7002 
 7003     return start;
 7004   }
 7005 
 7006   // Dilithium decompose poly.
 7007   // Implements the method
 7008   // static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
        //              int[] highPart, int twoGamma2, int multiplier) {}
 7009   // of the sun.security.provider.ML_DSA class
 7010   //
 7011   // input (int[256]) = c_rarg0
 7012   // lowPart (int[256]) = c_rarg1
 7013   // highPart (int[256]) = c_rarg2
 7014   // twoGamma2  (int) = c_rarg3
 7015   // multiplier (int) = c_rarg4
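        //
        // In outline (the Java code is the authoritative version): each
        // coefficient r is first reduced into [0, q), then written as
        // r = r1 * twoGamma2 + r0 with -gamma2 < r0 <= gamma2, with a
        // special case when r - r0 == q - 1; r0 goes to lowPart and r1
        // to highPart.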
 7016   address generate_dilithiumDecomposePoly() {
 7017 
 7018     __ align(CodeEntryAlignment);
 7019     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 7020     StubCodeMark mark(this, stub_id);
 7021     address start = __ pc();
 7022     Label L_loop;
 7023 
 7024     const Register input = c_rarg0;
 7025     const Register lowPart = c_rarg1;
 7026     const Register highPart = c_rarg2;
 7027     const Register twoGamma2 = c_rarg3;
 7028     const Register multiplier = c_rarg4;
 7029 
 7030     const Register len = r9;
 7031     const Register dilithiumConsts = r10;
 7032     const Register tmp = r11;
 7033 
 7034     // 6 independent sets of 4x4s values
 7035     VSeq<4> vs1(0), vs2(4), vs3(8);
 7036     VSeq<4> vs4(12), vs5(16), vtmp(20);
 7037 
 7038     // 7 constants for cross-multiplying
 7039     VSeq<4> one(25, 0);
 7040     VSeq<4> qminus1(26, 0);
 7041     VSeq<4> g2(27, 0);
 7042     VSeq<4> twog2(28, 0);
 7043     VSeq<4> mult(29, 0);
 7044     VSeq<4> q(30, 0);
 7045     VSeq<4> qadd(31, 0);
 7046 
 7047     __ enter();
 7048 
 7049     __ lea(dilithiumConsts,
 7050              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7051 
 7052     // save callee-saved registers
 7053     __ stpd(v8, v9, __ pre(sp, -64));
 7054     __ stpd(v10, v11, Address(sp, 16));
 7055     __ stpd(v12, v13, Address(sp, 32));
 7056     __ stpd(v14, v15, Address(sp, 48));
 7057 
 7058     // populate constant registers
 7059     __ mov(tmp, zr);
 7060     __ add(tmp, tmp, 1);
 7061     __ dup(one[0], __ T4S, tmp); // 1
 7062     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 7063     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 7064     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 7065     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 7066     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 7067     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 7068 
 7069     __ mov(len, zr);
 7070     __ add(len, len, 1024);
 7071 
 7072     __ BIND(L_loop);
 7073 
 7074     // load next 4x4S inputs interleaved: rplus --> vs1
 7075     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7076 
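          // q = 8380417 is just below 2^23, so ((rplus + qadd) >> 23) * q is
          // an estimate of the multiple of q to subtract; the sign-based add
          // of q that follows completes the reduction into [0, q).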
 7077     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7078     vs_addv(vtmp, __ T4S, vs1, qadd);
 7079     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7080     vs_mulv(vtmp, __ T4S, vtmp, q);
 7081     vs_subv(vs1, __ T4S, vs1, vtmp);
 7082 
 7083     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7084     vs_sshr(vtmp, __ T4S, vs1, 31);
 7085     vs_andr(vtmp, vtmp, q);
 7086     vs_addv(vs1, __ T4S, vs1, vtmp);
 7087 
 7088     // quotient --> vs2
 7089     // int quotient = (rplus * multiplier) >> 22;
 7090     vs_mulv(vtmp, __ T4S, vs1, mult);
 7091     vs_sshr(vs2, __ T4S, vtmp, 22);
 7092 
 7093     // r0 --> vs3
 7094     // int r0 = rplus - quotient * twoGamma2;
 7095     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7096     vs_subv(vs3, __ T4S, vs1, vtmp);
 7097 
 7098     // mask --> vs4
 7099     // int mask = (twoGamma2 - r0) >> 22;
 7100     vs_subv(vtmp, __ T4S, twog2, vs3);
 7101     vs_sshr(vs4, __ T4S, vtmp, 22);
 7102 
 7103     // r0 -= (mask & twoGamma2);
 7104     vs_andr(vtmp, vs4, twog2);
 7105     vs_subv(vs3, __ T4S, vs3, vtmp);
 7106 
 7107     //  quotient += (mask & 1);
 7108     vs_andr(vtmp, vs4, one);
 7109     vs_addv(vs2, __ T4S, vs2, vtmp);
 7110 
 7111     // mask = (twoGamma2 / 2 - r0) >> 31;
 7112     vs_subv(vtmp, __ T4S, g2, vs3);
 7113     vs_sshr(vs4, __ T4S, vtmp, 31);
 7114 
 7115     // r0 -= (mask & twoGamma2);
 7116     vs_andr(vtmp, vs4, twog2);
 7117     vs_subv(vs3, __ T4S, vs3, vtmp);
 7118 
 7119     // quotient += (mask & 1);
 7120     vs_andr(vtmp, vs4, one);
 7121     vs_addv(vs2, __ T4S, vs2, vtmp);
 7122 
 7123     // r1 --> vs5
 7124     // int r1 = rplus - r0 - (dilithium_q - 1);
 7125     vs_subv(vtmp, __ T4S, vs1, vs3);
 7126     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7127 
 7128     // r1 --> vs1 (overwriting rplus)
 7129     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7130     vs_negr(vtmp, __ T4S, vs5);
 7131     vs_orr(vtmp, vs5, vtmp);
 7132     vs_sshr(vs1, __ T4S, vtmp, 31);
 7133 
 7134     // r0 += ~r1;
 7135     vs_notr(vtmp, vs1);
 7136     vs_addv(vs3, __ T4S, vs3, vtmp);
 7137 
 7138     // r1 = r1 & quotient;
 7139     vs_andr(vs1, vs2, vs1);
 7140 
 7141     // store results interleaved
 7142     // lowPart[m] = r0;
 7143     // highPart[m] = r1;
 7144     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7145     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7146 
 7147     __ sub(len, len, 64);
 7148     __ cmp(len, (u1)64);
 7149     __ br(Assembler::GE, L_loop);
 7150 
 7151     // restore callee-saved vector registers
 7152     __ ldpd(v14, v15, Address(sp, 48));
 7153     __ ldpd(v12, v13, Address(sp, 32));
 7154     __ ldpd(v10, v11, Address(sp, 16));
 7155     __ ldpd(v8, v9, __ post(sp, 64));
 7156 
 7157     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7158     __ mov(r0, zr); // return 0
 7159     __ ret(lr);
 7160 
 7161     return start;
 7162   }
 7163 
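        // Keccak chi step for one 5-lane row, computed with bic (and-not):
        // a[i] ^= ~a[i+1] & a[i+2], indices taken mod 5.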
 7164   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7165              Register tmp0, Register tmp1, Register tmp2) {
 7166     __ bic(tmp0, a2, a1); // for a0
 7167     __ bic(tmp1, a3, a2); // for a1
 7168     __ bic(tmp2, a4, a3); // for a2
 7169     __ eor(a2, a2, tmp2);
 7170     __ bic(tmp2, a0, a4); // for a3
 7171     __ eor(a3, a3, tmp2);
 7172     __ bic(tmp2, a1, a0); // for a4
 7173     __ eor(a0, a0, tmp0);
 7174     __ eor(a1, a1, tmp1);
 7175     __ eor(a4, a4, tmp2);
 7176   }
 7177 
 7178   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7179                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7180                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7181                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7182                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7183                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7184                         Register tmp0, Register tmp1, Register tmp2) {
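          // Keccak theta step: compute the column parities
          //   c[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20]
          // then d[x] = c[x-1] ^ rol(c[x+1], 1) and xor d[x] into every lane
          // of column x (the rax1 helper xors its first source with the
          // second rotated left by 1).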
 7185     __ eor3(tmp1, a4, a9, a14);
 7186     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7187     __ eor3(tmp2, a1, a6, a11);
 7188     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7189     __ rax1(tmp2, tmp0, tmp1); // d0
 7190     {
 7191 
 7192       Register tmp3, tmp4;
 7193       if (can_use_fp && can_use_r18) {
 7194         tmp3 = rfp;
 7195         tmp4 = r18_tls;
 7196       } else {
 7197         tmp3 = a4;
 7198         tmp4 = a9;
 7199         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7200       }
 7201 
 7202       __ eor3(tmp3, a0, a5, a10);
 7203       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7204       __ eor(a0, a0, tmp2);
 7205       __ eor(a5, a5, tmp2);
 7206       __ eor(a10, a10, tmp2);
 7207       __ eor(a15, a15, tmp2);
 7208       __ eor(a20, a20, tmp2); // d0(tmp2)
 7209       __ eor3(tmp3, a2, a7, a12);
 7210       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7211       __ rax1(tmp3, tmp4, tmp2); // d1
 7212       __ eor(a1, a1, tmp3);
 7213       __ eor(a6, a6, tmp3);
 7214       __ eor(a11, a11, tmp3);
 7215       __ eor(a16, a16, tmp3);
 7216       __ eor(a21, a21, tmp3); // d1(tmp3)
 7217       __ rax1(tmp3, tmp2, tmp0); // d3
 7218       __ eor3(tmp2, a3, a8, a13);
 7219       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7220       __ eor(a3, a3, tmp3);
 7221       __ eor(a8, a8, tmp3);
 7222       __ eor(a13, a13, tmp3);
 7223       __ eor(a18, a18, tmp3);
 7224       __ eor(a23, a23, tmp3);
 7225       __ rax1(tmp2, tmp1, tmp0); // d2
 7226       __ eor(a2, a2, tmp2);
 7227       __ eor(a7, a7, tmp2);
 7228       __ eor(a12, a12, tmp2);
 7229       __ rax1(tmp0, tmp0, tmp4); // d4
 7230       if (!can_use_fp || !can_use_r18) {
 7231         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7232       }
 7233       __ eor(a17, a17, tmp2);
 7234       __ eor(a22, a22, tmp2);
 7235       __ eor(a4, a4, tmp0);
 7236       __ eor(a9, a9, tmp0);
 7237       __ eor(a14, a14, tmp0);
 7238       __ eor(a19, a19, tmp0);
 7239       __ eor(a24, a24, tmp0);
 7240     }
 7241 
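          // Keccak rho and pi steps, fused: each rol below rotates a lane by
          // its fixed offset and deposits it at its permuted position, with
          // a10's old value parked in tmp0 until the cycle closes at a7.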
 7242     __ rol(tmp0, a10, 3);
 7243     __ rol(a10, a1, 1);
 7244     __ rol(a1, a6, 44);
 7245     __ rol(a6, a9, 20);
 7246     __ rol(a9, a22, 61);
 7247     __ rol(a22, a14, 39);
 7248     __ rol(a14, a20, 18);
 7249     __ rol(a20, a2, 62);
 7250     __ rol(a2, a12, 43);
 7251     __ rol(a12, a13, 25);
 7252     __ rol(a13, a19, 8);
 7253     __ rol(a19, a23, 56);
 7254     __ rol(a23, a15, 41);
 7255     __ rol(a15, a4, 27);
 7256     __ rol(a4, a24, 14);
 7257     __ rol(a24, a21, 2);
 7258     __ rol(a21, a8, 55);
 7259     __ rol(a8, a16, 45);
 7260     __ rol(a16, a5, 36);
 7261     __ rol(a5, a3, 28);
 7262     __ rol(a3, a18, 21);
 7263     __ rol(a18, a17, 15);
 7264     __ rol(a17, a11, 10);
 7265     __ rol(a11, a7, 6);
 7266     __ mov(a7, tmp0);
 7267 
 7268     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7269     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7270     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7271     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7272     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7273 
 7274     __ ldr(tmp1, __ post(rc, 8));
 7275     __ eor(a0, a0, tmp1);
 7276 
 7277   }
 7278 
 7279   // Arguments:
 7280   //
 7281   // Inputs:
 7282   //   c_rarg0   - byte[]  source+offset
 7283   //   c_rarg1   - byte[]  SHA.state
 7284   //   c_rarg2   - int     block_size
 7285   //   c_rarg3   - int     offset
 7286   //   c_rarg4   - int     limit
 7287   //
 7288   address generate_sha3_implCompress_gpr(StubId stub_id) {
 7289     bool multi_block;
 7290     switch (stub_id) {
 7291     case StubId::stubgen_sha3_implCompress_id:
 7292       multi_block = false;
 7293       break;
 7294     case StubId::stubgen_sha3_implCompressMB_id:
 7295       multi_block = true;
 7296       break;
 7297     default:
 7298       ShouldNotReachHere();
 7299     }
 7300 
 7301     static const uint64_t round_consts[24] = {
 7302       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 7303       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 7304       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 7305       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 7306       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 7307       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 7308       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 7309       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 7310     };
 7311 
 7312     __ align(CodeEntryAlignment);
 7313     StubCodeMark mark(this, stub_id);
 7314     address start = __ pc();
 7315 
 7316     Register buf           = c_rarg0;
 7317     Register state         = c_rarg1;
 7318     Register block_size    = c_rarg2;
 7319     Register ofs           = c_rarg3;
 7320     Register limit         = c_rarg4;
 7321 
 7322     // use r3..r17, r19..r28 to keep a0..a24.
 7323     // a0..a24 are respective locals from SHA3.java
 7324     Register a0 = r25,
 7325              a1 = r26,
 7326              a2 = r27,
 7327              a3 = r3,
 7328              a4 = r4,
 7329              a5 = r5,
 7330              a6 = r6,
 7331              a7 = r7,
 7332              a8 = rscratch1, // r8
 7333              a9 = rscratch2, // r9
 7334              a10 = r10,
 7335              a11 = r11,
 7336              a12 = r12,
 7337              a13 = r13,
 7338              a14 = r14,
 7339              a15 = r15,
 7340              a16 = r16,
 7341              a17 = r17,
 7342              a18 = r28,
 7343              a19 = r19,
 7344              a20 = r20,
 7345              a21 = r21,
 7346              a22 = r22,
 7347              a23 = r23,
 7348              a24 = r24;
 7349 
 7350     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7351 
 7352     Label sha3_loop, rounds24_preloop, loop_body;
 7353     Label sha3_512_or_sha3_384, shake128;
 7354 
 7355     bool can_use_r18 = false;
 7356 #ifndef R18_RESERVED
 7357     can_use_r18 = true;
 7358 #endif
 7359     bool can_use_fp = !PreserveFramePointer;
 7360 
 7361     __ enter();
 7362 
 7363     // save almost all yet unsaved gpr registers on stack
 7364     __ str(block_size, __ pre(sp, -128));
 7365     if (multi_block) {
 7366       __ stpw(ofs, limit, Address(sp, 8));
 7367     }
 7368     // 8 bytes at sp+16 will be used to keep buf
 7369     __ stp(r19, r20, Address(sp, 32));
 7370     __ stp(r21, r22, Address(sp, 48));
 7371     __ stp(r23, r24, Address(sp, 64));
 7372     __ stp(r25, r26, Address(sp, 80));
 7373     __ stp(r27, r28, Address(sp, 96));
 7374     if (can_use_r18 && can_use_fp) {
 7375       __ stp(r18_tls, state, Address(sp, 112));
 7376     } else {
 7377       __ str(state, Address(sp, 112));
 7378     }
 7379 
 7380     // begin sha3 calculations: loading a0..a24 from state array
 7381     __ ldp(a0, a1, state);
 7382     __ ldp(a2, a3, Address(state, 16));
 7383     __ ldp(a4, a5, Address(state, 32));
 7384     __ ldp(a6, a7, Address(state, 48));
 7385     __ ldp(a8, a9, Address(state, 64));
 7386     __ ldp(a10, a11, Address(state, 80));
 7387     __ ldp(a12, a13, Address(state, 96));
 7388     __ ldp(a14, a15, Address(state, 112));
 7389     __ ldp(a16, a17, Address(state, 128));
 7390     __ ldp(a18, a19, Address(state, 144));
 7391     __ ldp(a20, a21, Address(state, 160));
 7392     __ ldp(a22, a23, Address(state, 176));
 7393     __ ldr(a24, Address(state, 192));
 7394 
 7395     __ BIND(sha3_loop);
 7396 
 7397     // load input
 7398     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7399     __ eor(a0, a0, tmp3);
 7400     __ eor(a1, a1, tmp2);
 7401     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7402     __ eor(a2, a2, tmp3);
 7403     __ eor(a3, a3, tmp2);
 7404     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7405     __ eor(a4, a4, tmp3);
 7406     __ eor(a5, a5, tmp2);
 7407     __ ldr(tmp3, __ post(buf, 8));
 7408     __ eor(a6, a6, tmp3);
 7409 
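          // Rates (block sizes in bytes) dispatched on below:
          //   SHA3-512 = 72, SHA3-384 = 104, SHA3-256/SHAKE256 = 136,
          //   SHA3-224 = 144, SHAKE128 = 168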
 7410     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7411     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7412 
 7413     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7414     __ eor(a7, a7, tmp3);
 7415     __ eor(a8, a8, tmp2);
 7416     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7417     __ eor(a9, a9, tmp3);
 7418     __ eor(a10, a10, tmp2);
 7419     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7420     __ eor(a11, a11, tmp3);
 7421     __ eor(a12, a12, tmp2);
 7422     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7423     __ eor(a13, a13, tmp3);
 7424     __ eor(a14, a14, tmp2);
 7425     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7426     __ eor(a15, a15, tmp3);
 7427     __ eor(a16, a16, tmp2);
 7428 
 7429     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7430     __ andw(tmp2, block_size, 48);
 7431     __ cbzw(tmp2, rounds24_preloop);
 7432     __ tbnz(block_size, 5, shake128);
 7433     // block_size == 144, bit5 == 0, SHA3-224
 7434     __ ldr(tmp3, __ post(buf, 8));
 7435     __ eor(a17, a17, tmp3);
 7436     __ b(rounds24_preloop);
 7437 
 7438     __ BIND(shake128);
 7439     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7440     __ eor(a17, a17, tmp3);
 7441     __ eor(a18, a18, tmp2);
 7442     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7443     __ eor(a19, a19, tmp3);
 7444     __ eor(a20, a20, tmp2);
 7445     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7446 
 7447     __ BIND(sha3_512_or_sha3_384);
 7448     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7449     __ eor(a7, a7, tmp3);
 7450     __ eor(a8, a8, tmp2);
 7451     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7452 
 7453     // SHA3-384
 7454     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7455     __ eor(a9, a9, tmp3);
 7456     __ eor(a10, a10, tmp2);
 7457     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7458     __ eor(a11, a11, tmp3);
 7459     __ eor(a12, a12, tmp2);
 7460 
 7461     __ BIND(rounds24_preloop);
 7462     __ fmovs(v0, 24.0); // float loop counter,
 7463     __ fmovs(v1, 1.0);  // exact representation
 7464 
 7465     __ str(buf, Address(sp, 16));
 7466     __ lea(tmp3, ExternalAddress((address) round_consts));
 7467 
 7468     __ BIND(loop_body);
 7469     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7470                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7471                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7472                      tmp0, tmp1, tmp2);
 7473     __ fsubs(v0, v0, v1);
 7474     __ fcmps(v0, 0.0);
 7475     __ br(__ NE, loop_body);
 7476 
 7477     if (multi_block) {
 7478       __ ldrw(block_size, sp); // block_size
 7479       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7480       __ addw(tmp2, tmp2, block_size);
 7481       __ cmpw(tmp2, tmp1);
 7482       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7483       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7484       __ br(Assembler::LE, sha3_loop);
 7485       __ movw(c_rarg0, tmp2); // return offset
 7486     }
 7487     if (can_use_fp && can_use_r18) {
 7488       __ ldp(r18_tls, state, Address(sp, 112));
 7489     } else {
 7490       __ ldr(state, Address(sp, 112));
 7491     }
 7492     // save calculated sha3 state
 7493     __ stp(a0, a1, Address(state));
 7494     __ stp(a2, a3, Address(state, 16));
 7495     __ stp(a4, a5, Address(state, 32));
 7496     __ stp(a6, a7, Address(state, 48));
 7497     __ stp(a8, a9, Address(state, 64));
 7498     __ stp(a10, a11, Address(state, 80));
 7499     __ stp(a12, a13, Address(state, 96));
 7500     __ stp(a14, a15, Address(state, 112));
 7501     __ stp(a16, a17, Address(state, 128));
 7502     __ stp(a18, a19, Address(state, 144));
 7503     __ stp(a20, a21, Address(state, 160));
 7504     __ stp(a22, a23, Address(state, 176));
 7505     __ str(a24, Address(state, 192));
 7506 
 7507     // restore required registers from stack
 7508     __ ldp(r19, r20, Address(sp, 32));
 7509     __ ldp(r21, r22, Address(sp, 48));
 7510     __ ldp(r23, r24, Address(sp, 64));
 7511     __ ldp(r25, r26, Address(sp, 80));
 7512     __ ldp(r27, r28, Address(sp, 96));
 7513     if (can_use_fp && can_use_r18) {
 7514       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 7515     } // else no need to recalculate rfp, since it wasn't changed
 7516 
 7517     __ leave();
 7518 
 7519     __ ret(lr);
 7520 
 7521     return start;
 7522   }
 7523 
 7524   /**
 7525    *  Arguments:
 7526    *
 7527    * Inputs:
 7528    *   c_rarg0   - int crc
 7529    *   c_rarg1   - byte* buf
 7530    *   c_rarg2   - int length
 7531    *
 7532    * Output:
 7533    *       r0   - int crc result
 7534    */
 7535   address generate_updateBytesCRC32() {
 7536     assert(UseCRC32Intrinsics, "what are we doing here?");
 7537 
 7538     __ align(CodeEntryAlignment);
 7539     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 7540     StubCodeMark mark(this, stub_id);
 7541 
 7542     address start = __ pc();
 7543 
 7544     const Register crc   = c_rarg0;  // crc
 7545     const Register buf   = c_rarg1;  // source java byte array address
 7546     const Register len   = c_rarg2;  // length
 7547     const Register table0 = c_rarg3; // crc_table address
 7548     const Register table1 = c_rarg4;
 7549     const Register table2 = c_rarg5;
 7550     const Register table3 = c_rarg6;
 7551     const Register tmp3 = c_rarg7;
 7552 
 7553     BLOCK_COMMENT("Entry:");
 7554     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7555 
 7556     __ kernel_crc32(crc, buf, len,
 7557               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7558 
 7559     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7560     __ ret(lr);
 7561 
 7562     return start;
 7563   }
 7564 
 7565   /**
 7566    *  Arguments:
 7567    *
 7568    * Inputs:
 7569    *   c_rarg0   - int crc
 7570    *   c_rarg1   - byte* buf
 7571    *   c_rarg2   - int length
 7572    *   c_rarg3   - int* table
 7573    *
 7574    * Output:
 7575    *       r0   - int crc result
 7576    */
 7577   address generate_updateBytesCRC32C() {
 7578     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7579 
 7580     __ align(CodeEntryAlignment);
 7581     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 7582     StubCodeMark mark(this, stub_id);
 7583 
 7584     address start = __ pc();
 7585 
 7586     const Register crc   = c_rarg0;  // crc
 7587     const Register buf   = c_rarg1;  // source java byte array address
 7588     const Register len   = c_rarg2;  // length
 7589     const Register table0 = c_rarg3; // crc_table address
 7590     const Register table1 = c_rarg4;
 7591     const Register table2 = c_rarg5;
 7592     const Register table3 = c_rarg6;
 7593     const Register tmp3 = c_rarg7;
 7594 
 7595     BLOCK_COMMENT("Entry:");
 7596     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7597 
 7598     __ kernel_crc32c(crc, buf, len,
 7599               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7600 
 7601     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7602     __ ret(lr);
 7603 
 7604     return start;
 7605   }
 7606 
 7607   /**
 7608    *  Arguments:
 7609    *
 7610    *  Inputs:
 7611    *   c_rarg0   - int   adler
 7612    *   c_rarg1   - byte* buff
 7613    *   c_rarg2   - int   len
 7614    *
 7615    * Output:
 7616    *   c_rarg0   - int adler result
 7617    */
 7618   address generate_updateBytesAdler32() {
 7619     __ align(CodeEntryAlignment);
 7620     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 7621     StubCodeMark mark(this, stub_id);
 7622     address start = __ pc();
 7623 
 7624     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7625 
 7626     // Aliases
 7627     Register adler  = c_rarg0;
 7628     Register s1     = c_rarg0;
 7629     Register s2     = c_rarg3;
 7630     Register buff   = c_rarg1;
 7631     Register len    = c_rarg2;
 7632     Register nmax  = r4;
 7633     Register base  = r5;
 7634     Register count = r6;
 7635     Register temp0 = rscratch1;
 7636     Register temp1 = rscratch2;
 7637     FloatRegister vbytes = v0;
 7638     FloatRegister vs1acc = v1;
 7639     FloatRegister vs2acc = v2;
 7640     FloatRegister vtable = v3;
 7641 
 7642     // Max number of bytes we can process before having to take the mod
 7643     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7644     uint64_t BASE = 0xfff1;
 7645     uint64_t NMAX = 0x15B0;
 7646 
 7647     __ mov(base, BASE);
 7648     __ mov(nmax, NMAX);
 7649 
 7650     // Load accumulation coefficients for the upper 16 bits
 7651     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7652     __ ld1(vtable, __ T16B, Address(temp0));
 7653 
 7654     // s1 is initialized to the lower 16 bits of adler
 7655     // s2 is initialized to the upper 16 bits of adler
 7656     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7657     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7658 
 7659     // The pipelined loop needs at least 16 elements per iteration.
 7660     // It checks this itself, but for short inputs it is cheaper to skip straight to the cleanup loop.
 7661     __ cmp(len, (u1)16);
 7662     __ br(Assembler::HS, L_nmax);
 7663     __ cbz(len, L_combine);
 7664 
 7665     __ bind(L_simple_by1_loop);
 7666     __ ldrb(temp0, Address(__ post(buff, 1)));
 7667     __ add(s1, s1, temp0);
 7668     __ add(s2, s2, s1);
 7669     __ subs(len, len, 1);
 7670     __ br(Assembler::HI, L_simple_by1_loop);
 7671 
 7672     // s1 = s1 % BASE
 7673     __ subs(temp0, s1, base);
 7674     __ csel(s1, temp0, s1, Assembler::HS);
 7675 
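          // Reduce mod BASE using 2^16 mod 65521 = 15:
          //   s = (s >> 16) * 65536 + (s & 0xffff)
          //     ≡ (s >> 16) * 15    + (s & 0xffff)   (mod BASE)
          // each lsr/lsl/sub/add group below folds the high half back in this
          // way, and a final conditional subtract of BASE finishes the job.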
 7676     // s2 = s2 % BASE
 7677     __ lsr(temp0, s2, 16);
 7678     __ lsl(temp1, temp0, 4);
 7679     __ sub(temp1, temp1, temp0);
 7680     __ add(s2, temp1, s2, ext::uxth);
 7681 
 7682     __ subs(temp0, s2, base);
 7683     __ csel(s2, temp0, s2, Assembler::HS);
 7684 
 7685     __ b(L_combine);
 7686 
 7687     __ bind(L_nmax);
 7688     __ subs(len, len, nmax);
 7689     __ sub(count, nmax, 16);
 7690     __ br(Assembler::LO, L_by16);
 7691 
 7692     __ bind(L_nmax_loop);
 7693 
 7694     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7695                                       vbytes, vs1acc, vs2acc, vtable);
 7696 
 7697     __ subs(count, count, 16);
 7698     __ br(Assembler::HS, L_nmax_loop);
 7699 
 7700     // s1 = s1 % BASE
 7701     __ lsr(temp0, s1, 16);
 7702     __ lsl(temp1, temp0, 4);
 7703     __ sub(temp1, temp1, temp0);
 7704     __ add(temp1, temp1, s1, ext::uxth);
 7705 
 7706     __ lsr(temp0, temp1, 16);
 7707     __ lsl(s1, temp0, 4);
 7708     __ sub(s1, s1, temp0);
 7709     __ add(s1, s1, temp1, ext::uxth);
 7710 
 7711     __ subs(temp0, s1, base);
 7712     __ csel(s1, temp0, s1, Assembler::HS);
 7713 
 7714     // s2 = s2 % BASE
 7715     __ lsr(temp0, s2, 16);
 7716     __ lsl(temp1, temp0, 4);
 7717     __ sub(temp1, temp1, temp0);
 7718     __ add(temp1, temp1, s2, ext::uxth);
 7719 
 7720     __ lsr(temp0, temp1, 16);
 7721     __ lsl(s2, temp0, 4);
 7722     __ sub(s2, s2, temp0);
 7723     __ add(s2, s2, temp1, ext::uxth);
 7724 
 7725     __ subs(temp0, s2, base);
 7726     __ csel(s2, temp0, s2, Assembler::HS);
 7727 
 7728     __ subs(len, len, nmax);
 7729     __ sub(count, nmax, 16);
 7730     __ br(Assembler::HS, L_nmax_loop);
 7731 
 7732     __ bind(L_by16);
 7733     __ adds(len, len, count);
 7734     __ br(Assembler::LO, L_by1);
 7735 
 7736     __ bind(L_by16_loop);
 7737 
 7738     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7739                                       vbytes, vs1acc, vs2acc, vtable);
 7740 
 7741     __ subs(len, len, 16);
 7742     __ br(Assembler::HS, L_by16_loop);
 7743 
 7744     __ bind(L_by1);
 7745     __ adds(len, len, 15);
 7746     __ br(Assembler::LO, L_do_mod);
 7747 
 7748     __ bind(L_by1_loop);
 7749     __ ldrb(temp0, Address(__ post(buff, 1)));
 7750     __ add(s1, temp0, s1);
 7751     __ add(s2, s2, s1);
 7752     __ subs(len, len, 1);
 7753     __ br(Assembler::HS, L_by1_loop);
 7754 
 7755     __ bind(L_do_mod);
 7756     // s1 = s1 % BASE
 7757     __ lsr(temp0, s1, 16);
 7758     __ lsl(temp1, temp0, 4);
 7759     __ sub(temp1, temp1, temp0);
 7760     __ add(temp1, temp1, s1, ext::uxth);
 7761 
 7762     __ lsr(temp0, temp1, 16);
 7763     __ lsl(s1, temp0, 4);
 7764     __ sub(s1, s1, temp0);
 7765     __ add(s1, s1, temp1, ext::uxth);
 7766 
 7767     __ subs(temp0, s1, base);
 7768     __ csel(s1, temp0, s1, Assembler::HS);
 7769 
 7770     // s2 = s2 % BASE
 7771     __ lsr(temp0, s2, 16);
 7772     __ lsl(temp1, temp0, 4);
 7773     __ sub(temp1, temp1, temp0);
 7774     __ add(temp1, temp1, s2, ext::uxth);
 7775 
 7776     __ lsr(temp0, temp1, 16);
 7777     __ lsl(s2, temp0, 4);
 7778     __ sub(s2, s2, temp0);
 7779     __ add(s2, s2, temp1, ext::uxth);
 7780 
 7781     __ subs(temp0, s2, base);
 7782     __ csel(s2, temp0, s2, Assembler::HS);
 7783 
 7784     // Combine lower bits and higher bits
 7785     __ bind(L_combine);
 7786     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7787 
 7788     __ ret(lr);
 7789 
 7790     return start;
 7791   }
 7792 
 7793   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7794           Register temp0, Register temp1, FloatRegister vbytes,
 7795           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7796     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7797     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7798     // In non-vectorized code, we update s1 and s2 as:
 7799     //   s1 <- s1 + b1
 7800     //   s2 <- s2 + s1
 7801     //   s1 <- s1 + b2
 7802     //   s2 <- s2 + b1
 7803     //   ...
 7804     //   s1 <- s1 + b16
 7805     //   s2 <- s2 + s1
 7806     // Putting above assignments together, we have:
 7807     //   s1_new = s1 + b1 + b2 + ... + b16
 7808     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7809     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7810     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 7811     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7812 
 7813     // s2 = s2 + s1 * 16
 7814     __ add(s2, s2, s1, Assembler::LSL, 4);
 7815 
 7816     // vs1acc = b1 + b2 + b3 + ... + b16
 7817     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7818     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7819     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7820     __ uaddlv(vs1acc, __ T16B, vbytes);
 7821     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7822 
 7823     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7824     __ fmovd(temp0, vs1acc);
 7825     __ fmovd(temp1, vs2acc);
 7826     __ add(s1, s1, temp0);
 7827     __ add(s2, s2, temp1);
 7828   }
 7829 
 7830   /**
 7831    *  Arguments:
 7832    *
 7833    *  Input:
 7834    *    c_rarg0   - x address
 7835    *    c_rarg1   - x length
 7836    *    c_rarg2   - y address
 7837    *    c_rarg3   - y length
 7838    *    c_rarg4   - z address
 7839    */
 7840   address generate_multiplyToLen() {
 7841     __ align(CodeEntryAlignment);
 7842     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 7843     StubCodeMark mark(this, stub_id);
 7844 
 7845     address start = __ pc();
 7846     const Register x     = r0;
 7847     const Register xlen  = r1;
 7848     const Register y     = r2;
 7849     const Register ylen  = r3;
 7850     const Register z     = r4;
 7851 
 7852     const Register tmp0  = r5;
 7853     const Register tmp1  = r10;
 7854     const Register tmp2  = r11;
 7855     const Register tmp3  = r12;
 7856     const Register tmp4  = r13;
 7857     const Register tmp5  = r14;
 7858     const Register tmp6  = r15;
 7859     const Register tmp7  = r16;
 7860 
 7861     BLOCK_COMMENT("Entry:");
 7862     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7863     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7864     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7865     __ ret(lr);
 7866 
 7867     return start;
 7868   }
 7869 
 7870   address generate_squareToLen() {
 7871     // squareToLen algorithm for sizes 1..127 described in java code works
 7872     // faster than multiply_to_len on some CPUs and slower on others, but
 7873     // multiply_to_len shows a bit better overall results
 7874     __ align(CodeEntryAlignment);
 7875     StubId stub_id = StubId::stubgen_squareToLen_id;
 7876     StubCodeMark mark(this, stub_id);
 7877     address start = __ pc();
 7878 
 7879     const Register x     = r0;
 7880     const Register xlen  = r1;
 7881     const Register z     = r2;
 7882     const Register y     = r4; // == x
 7883     const Register ylen  = r5; // == xlen
 7884 
 7885     const Register tmp0  = r3;
 7886     const Register tmp1  = r10;
 7887     const Register tmp2  = r11;
 7888     const Register tmp3  = r12;
 7889     const Register tmp4  = r13;
 7890     const Register tmp5  = r14;
 7891     const Register tmp6  = r15;
 7892     const Register tmp7  = r16;
 7893 
 7894     RegSet spilled_regs = RegSet::of(y, ylen);
 7895     BLOCK_COMMENT("Entry:");
 7896     __ enter();
 7897     __ push(spilled_regs, sp);
 7898     __ mov(y, x);
 7899     __ mov(ylen, xlen);
 7900     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7901     __ pop(spilled_regs, sp);
 7902     __ leave();
 7903     __ ret(lr);
 7904     return start;
 7905   }
 7906 
 7907   address generate_mulAdd() {
 7908     __ align(CodeEntryAlignment);
 7909     StubId stub_id = StubId::stubgen_mulAdd_id;
 7910     StubCodeMark mark(this, stub_id);
 7911 
 7912     address start = __ pc();
 7913 
 7914     const Register out     = r0;
 7915     const Register in      = r1;
 7916     const Register offset  = r2;
 7917     const Register len     = r3;
 7918     const Register k       = r4;
 7919 
 7920     BLOCK_COMMENT("Entry:");
 7921     __ enter();
 7922     __ mul_add(out, in, offset, len, k);
 7923     __ leave();
 7924     __ ret(lr);
 7925 
 7926     return start;
 7927   }
 7928 
 7929   // Arguments:
 7930   //
 7931   // Input:
 7932   //   c_rarg0   - newArr address
 7933   //   c_rarg1   - oldArr address
 7934   //   c_rarg2   - newIdx
 7935   //   c_rarg3   - shiftCount
 7936   //   c_rarg4   - numIter
 7937   //
 7938   address generate_bigIntegerRightShift() {
 7939     __ align(CodeEntryAlignment);
 7940     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 7941     StubCodeMark mark(this, stub_id);
 7942     address start = __ pc();
 7943 
 7944     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7945 
 7946     Register newArr        = c_rarg0;
 7947     Register oldArr        = c_rarg1;
 7948     Register newIdx        = c_rarg2;
 7949     Register shiftCount    = c_rarg3;
 7950     Register numIter       = c_rarg4;
 7951     Register idx           = numIter;
 7952 
 7953     Register newArrCur     = rscratch1;
 7954     Register shiftRevCount = rscratch2;
 7955     Register oldArrCur     = r13;
 7956     Register oldArrNext    = r14;
 7957 
 7958     FloatRegister oldElem0        = v0;
 7959     FloatRegister oldElem1        = v1;
 7960     FloatRegister newElem         = v2;
 7961     FloatRegister shiftVCount     = v3;
 7962     FloatRegister shiftVRevCount  = v4;
 7963 
 7964     __ cbz(idx, Exit);
 7965 
 7966     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7967 
 7968     // left shift count
 7969     __ movw(shiftRevCount, 32);
 7970     __ subw(shiftRevCount, shiftRevCount, shiftCount);
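          // Each output word combines two adjacent input words (newArr has
          // already been advanced by newIdx):
          //   newArr[i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << shiftRevCount)
          // The SIMD loop below produces four such words per iteration; the
          // logical right shift is done with ushl and a negated shift count.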
 7971 
 7972     // numIter too small to allow a 4-word SIMD loop; fall back to the scalar tail
 7973     __ cmp(numIter, (u1)4);
 7974     __ br(Assembler::LT, ShiftThree);
 7975 
 7976     __ dup(shiftVCount,    __ T4S, shiftCount);
 7977     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7978     __ negr(shiftVCount,   __ T4S, shiftVCount);
 7979 
 7980     __ BIND(ShiftSIMDLoop);
 7981 
 7982     // Calculate the load addresses
 7983     __ sub(idx, idx, 4);
 7984     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7985     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7986     __ add(oldArrCur,  oldArrNext, 4);
 7987 
 7988     // Load 4 words and process
 7989     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7990     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7991     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7992     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7993     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7994     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7995 
 7996     __ cmp(idx, (u1)4);
 7997     __ br(Assembler::LT, ShiftTwoLoop);
 7998     __ b(ShiftSIMDLoop);
 7999 
 8000     __ BIND(ShiftTwoLoop);
 8001     __ cbz(idx, Exit);
 8002     __ cmp(idx, (u1)1);
 8003     __ br(Assembler::EQ, ShiftOne);
 8004 
 8005     // Calculate the load addresses
 8006     __ sub(idx, idx, 2);
 8007     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 8008     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 8009     __ add(oldArrCur,  oldArrNext, 4);
 8010 
 8011     // Load 2 words and process
 8012     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 8013     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 8014     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 8015     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 8016     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 8017     __ st1(newElem,   __ T2S, Address(newArrCur));
 8018     __ b(ShiftTwoLoop);
 8019 
 8020     __ BIND(ShiftThree);
 8021     __ tbz(idx, 1, ShiftOne);
 8022     __ tbz(idx, 0, ShiftTwo);
 8023     __ ldrw(r10,  Address(oldArr, 12));
 8024     __ ldrw(r11,  Address(oldArr, 8));
 8025     __ lsrvw(r10, r10, shiftCount);
 8026     __ lslvw(r11, r11, shiftRevCount);
 8027     __ orrw(r12,  r10, r11);
 8028     __ strw(r12,  Address(newArr, 8));
 8029 
 8030     __ BIND(ShiftTwo);
 8031     __ ldrw(r10,  Address(oldArr, 8));
 8032     __ ldrw(r11,  Address(oldArr, 4));
 8033     __ lsrvw(r10, r10, shiftCount);
 8034     __ lslvw(r11, r11, shiftRevCount);
 8035     __ orrw(r12,  r10, r11);
 8036     __ strw(r12,  Address(newArr, 4));
 8037 
 8038     __ BIND(ShiftOne);
 8039     __ ldrw(r10,  Address(oldArr, 4));
 8040     __ ldrw(r11,  Address(oldArr));
 8041     __ lsrvw(r10, r10, shiftCount);
 8042     __ lslvw(r11, r11, shiftRevCount);
 8043     __ orrw(r12,  r10, r11);
 8044     __ strw(r12,  Address(newArr));
 8045 
 8046     __ BIND(Exit);
 8047     __ ret(lr);
 8048 
 8049     return start;
 8050   }
 8051 
 8052   // Arguments:
 8053   //
 8054   // Input:
 8055   //   c_rarg0   - newArr address
 8056   //   c_rarg1   - oldArr address
 8057   //   c_rarg2   - newIdx
 8058   //   c_rarg3   - shiftCount
 8059   //   c_rarg4   - numIter
 8060   //
 8061   address generate_bigIntegerLeftShift() {
 8062     __ align(CodeEntryAlignment);
 8063     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 8064     StubCodeMark mark(this, stub_id);
 8065     address start = __ pc();
 8066 
 8067     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8068 
 8069     Register newArr        = c_rarg0;
 8070     Register oldArr        = c_rarg1;
 8071     Register newIdx        = c_rarg2;
 8072     Register shiftCount    = c_rarg3;
 8073     Register numIter       = c_rarg4;
 8074 
 8075     Register shiftRevCount = rscratch1;
 8076     Register oldArrNext    = rscratch2;
 8077 
 8078     FloatRegister oldElem0        = v0;
 8079     FloatRegister oldElem1        = v1;
 8080     FloatRegister newElem         = v2;
 8081     FloatRegister shiftVCount     = v3;
 8082     FloatRegister shiftVRevCount  = v4;
 8083 
 8084     __ cbz(numIter, Exit);
 8085 
 8086     __ add(oldArrNext, oldArr, 4);
 8087     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8088 
 8089     // right shift count
 8090     __ movw(shiftRevCount, 32);
 8091     __ subw(shiftRevCount, shiftRevCount, shiftCount);
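          // Each output word combines two adjacent input words (newArr has
          // already been advanced by newIdx):
          //   newArr[i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> shiftRevCount)
          // Here it is the second operand that is right-shifted, so
          // shiftVRevCount is the count that gets negated for ushl.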
 8092 
 8093     // numIter too small to allow a 4-word SIMD loop; fall back to the scalar tail
 8094     __ cmp(numIter, (u1)4);
 8095     __ br(Assembler::LT, ShiftThree);
 8096 
 8097     __ dup(shiftVCount,     __ T4S, shiftCount);
 8098     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8099     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8100 
 8101     __ BIND(ShiftSIMDLoop);
 8102 
 8103     // load 4 words and process
 8104     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8105     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8106     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8107     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8108     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8109     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8110     __ sub(numIter,   numIter, 4);
 8111 
 8112     __ cmp(numIter, (u1)4);
 8113     __ br(Assembler::LT, ShiftTwoLoop);
 8114     __ b(ShiftSIMDLoop);
 8115 
 8116     __ BIND(ShiftTwoLoop);
 8117     __ cbz(numIter, Exit);
 8118     __ cmp(numIter, (u1)1);
 8119     __ br(Assembler::EQ, ShiftOne);
 8120 
 8121     // load 2 words and process
 8122     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8123     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8124     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8125     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8126     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8127     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8128     __ sub(numIter,   numIter, 2);
 8129     __ b(ShiftTwoLoop);
 8130 
 8131     __ BIND(ShiftThree);
 8132     __ ldrw(r10,  __ post(oldArr, 4));
 8133     __ ldrw(r11,  __ post(oldArrNext, 4));
 8134     __ lslvw(r10, r10, shiftCount);
 8135     __ lsrvw(r11, r11, shiftRevCount);
 8136     __ orrw(r12,  r10, r11);
 8137     __ strw(r12,  __ post(newArr, 4));
 8138     __ tbz(numIter, 1, Exit);
 8139     __ tbz(numIter, 0, ShiftOne);
 8140 
 8141     __ BIND(ShiftTwo);
 8142     __ ldrw(r10,  __ post(oldArr, 4));
 8143     __ ldrw(r11,  __ post(oldArrNext, 4));
 8144     __ lslvw(r10, r10, shiftCount);
 8145     __ lsrvw(r11, r11, shiftRevCount);
 8146     __ orrw(r12,  r10, r11);
 8147     __ strw(r12,  __ post(newArr, 4));
 8148 
 8149     __ BIND(ShiftOne);
 8150     __ ldrw(r10,  Address(oldArr));
 8151     __ ldrw(r11,  Address(oldArrNext));
 8152     __ lslvw(r10, r10, shiftCount);
 8153     __ lsrvw(r11, r11, shiftRevCount);
 8154     __ orrw(r12,  r10, r11);
 8155     __ strw(r12,  Address(newArr));
 8156 
 8157     __ BIND(Exit);
 8158     __ ret(lr);
 8159 
 8160     return start;
 8161   }
 8162 
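  // Arguments:
  //
  // Input:
  //   r1 - byte array address (ary1)
  //   r2 - length in bytes (len); a copy is expected in r0 (result)
  //
  // Output:
  //   r0 - number of leading bytes guaranteed to have the sign bit clear
  //
  // A minimal scalar sketch of the contract, as suggested by the return-value
  // adjustments below (illustrative only):
  //
  //   int count = 0;
  //   while (count < len && ary1[count] >= 0) count++;
  //   // the stub must return len when count == len, and may return a
  //   // conservative value <= count otherwise
  //
  // The vectorized code checks 8, 16 or 64 bytes per iteration by testing them
  // against UPPER_BIT_MASK (0x8080808080808080, the per-byte sign bits).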
 8163   address generate_count_positives(address &count_positives_long) {
 8164     const u1 large_loop_size = 64;
 8165     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 8166     int dcache_line = VM_Version::dcache_line_size();
 8167 
 8168     Register ary1 = r1, len = r2, result = r0;
 8169 
 8170     __ align(CodeEntryAlignment);
 8171 
 8172     StubId stub_id = StubId::stubgen_count_positives_id;
 8173     StubCodeMark mark(this, stub_id);
 8174 
 8175     address entry = __ pc();
 8176 
 8177     __ enter();
 8178     // precondition: a copy of len is already in result
 8179     // __ mov(result, len);
 8180 
 8181   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8182         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8183 
 8184   __ cmp(len, (u1)15);
 8185   __ br(Assembler::GT, LEN_OVER_15);
  // The only case when execution falls into this code is when the pointer is
  // near the end of a memory page and we have to avoid reading the next page
 8188   __ add(ary1, ary1, len);
 8189   __ subs(len, len, 8);
 8190   __ br(Assembler::GT, LEN_OVER_8);
 8191   __ ldr(rscratch2, Address(ary1, -8));
 8192   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8193   __ lsrv(rscratch2, rscratch2, rscratch1);
 8194   __ tst(rscratch2, UPPER_BIT_MASK);
 8195   __ csel(result, zr, result, Assembler::NE);
 8196   __ leave();
 8197   __ ret(lr);
 8198   __ bind(LEN_OVER_8);
 8199   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is still in flight
 8201   __ tst(rscratch2, UPPER_BIT_MASK);
 8202   __ br(Assembler::NE, RET_NO_POP);
 8203   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8204   __ lsrv(rscratch1, rscratch1, rscratch2);
 8205   __ tst(rscratch1, UPPER_BIT_MASK);
 8206   __ bind(RET_NO_POP);
 8207   __ csel(result, zr, result, Assembler::NE);
 8208   __ leave();
 8209   __ ret(lr);
 8210 
 8211   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8212   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8213 
 8214   count_positives_long = __ pc(); // 2nd entry point
 8215 
 8216   __ enter();
 8217 
 8218   __ bind(LEN_OVER_15);
 8219     __ push(spilled_regs, sp);
 8220     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8221     __ cbz(rscratch2, ALIGNED);
 8222     __ ldp(tmp6, tmp1, Address(ary1));
 8223     __ mov(tmp5, 16);
 8224     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 8225     __ add(ary1, ary1, rscratch1);
 8226     __ orr(tmp6, tmp6, tmp1);
 8227     __ tst(tmp6, UPPER_BIT_MASK);
 8228     __ br(Assembler::NE, RET_ADJUST);
 8229     __ sub(len, len, rscratch1);
 8230 
 8231   __ bind(ALIGNED);
 8232     __ cmp(len, large_loop_size);
 8233     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load in the pre-loop as an early return, to handle the
    // case where an initially aligned large array has negative values in its
    // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
    // worst case, which is slower. Cases with negative bytes further ahead are
    // barely affected; in fact they get faster due to the early loads, fewer
    // instructions and fewer branches in LARGE_LOOP.
 8240     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8241     __ sub(len, len, 16);
 8242     __ orr(tmp6, tmp6, tmp1);
 8243     __ tst(tmp6, UPPER_BIT_MASK);
 8244     __ br(Assembler::NE, RET_ADJUST_16);
 8245     __ cmp(len, large_loop_size);
 8246     __ br(Assembler::LT, CHECK_16);
 8247 
 8248     if (SoftwarePrefetchHintDistance >= 0
 8249         && SoftwarePrefetchHintDistance >= dcache_line) {
 8250       // initial prefetch
 8251       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8252     }
 8253   __ bind(LARGE_LOOP);
 8254     if (SoftwarePrefetchHintDistance >= 0) {
 8255       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8256     }
    // Issue all load instructions first, since that can save a few CPU/memory
    // cycles. Also, instead of four "orr(...); andr(...); cbnz(...)" triples
    // (one per ldp), generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which
    // saves 3 instructions and has fewer branches; the trade-off is that early
    // return is disabled, so all 64 bytes are loaded and checked every time.
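    // i.e. the 8 loaded words are OR-ed together pairwise:
    //   tmp2 |= tmp3; tmp4 |= tmp5; rscratch1 |= rscratch2; tmp6 |= tmp1;
    //   tmp2 |= tmp4; rscratch1 |= tmp6; tmp2 |= rscratch1;
    // so a single tst against UPPER_BIT_MASK covers all 64 bytes.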
 8262     __ ldp(tmp2, tmp3, Address(ary1));
 8263     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8264     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8265     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8266     __ add(ary1, ary1, large_loop_size);
 8267     __ sub(len, len, large_loop_size);
 8268     __ orr(tmp2, tmp2, tmp3);
 8269     __ orr(tmp4, tmp4, tmp5);
 8270     __ orr(rscratch1, rscratch1, rscratch2);
 8271     __ orr(tmp6, tmp6, tmp1);
 8272     __ orr(tmp2, tmp2, tmp4);
 8273     __ orr(rscratch1, rscratch1, tmp6);
 8274     __ orr(tmp2, tmp2, rscratch1);
 8275     __ tst(tmp2, UPPER_BIT_MASK);
 8276     __ br(Assembler::NE, RET_ADJUST_LONG);
 8277     __ cmp(len, large_loop_size);
 8278     __ br(Assembler::GE, LARGE_LOOP);
 8279 
 8280   __ bind(CHECK_16); // small 16-byte load pre-loop
 8281     __ cmp(len, (u1)16);
 8282     __ br(Assembler::LT, POST_LOOP16);
 8283 
 8284   __ bind(LOOP16); // small 16-byte load loop
 8285     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8286     __ sub(len, len, 16);
 8287     __ orr(tmp2, tmp2, tmp3);
 8288     __ tst(tmp2, UPPER_BIT_MASK);
 8289     __ br(Assembler::NE, RET_ADJUST_16);
 8290     __ cmp(len, (u1)16);
 8291     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8292 
 8293   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8294     __ cmp(len, (u1)8);
 8295     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8296     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8297     __ tst(tmp3, UPPER_BIT_MASK);
 8298     __ br(Assembler::NE, RET_ADJUST);
 8299     __ sub(len, len, 8);
 8300 
 8301   __ bind(POST_LOOP16_LOAD_TAIL);
 8302     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8303     __ ldr(tmp1, Address(ary1));
 8304     __ mov(tmp2, 64);
 8305     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8306     __ lslv(tmp1, tmp1, tmp4);
 8307     __ tst(tmp1, UPPER_BIT_MASK);
 8308     __ br(Assembler::NE, RET_ADJUST);
 8309     // Fallthrough
 8310 
 8311   __ bind(RET_LEN);
 8312     __ pop(spilled_regs, sp);
 8313     __ leave();
 8314     __ ret(lr);
 8315 
    // The difference result - len is the count of bytes guaranteed
    // to be positive.
 8318 
 8319   __ bind(RET_ADJUST_LONG);
 8320     __ add(len, len, (u1)(large_loop_size - 16));
 8321   __ bind(RET_ADJUST_16);
 8322     __ add(len, len, 16);
 8323   __ bind(RET_ADJUST);
 8324     __ pop(spilled_regs, sp);
 8325     __ leave();
 8326     __ sub(result, result, len);
 8327     __ ret(lr);
 8328 
 8329     return entry;
 8330   }
 8331 
 8332   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8333         bool usePrefetch, Label &NOT_EQUAL) {
 8334     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8335         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8336         tmp7 = r12, tmp8 = r13;
 8337     Label LOOP;
 8338 
 8339     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8340     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8341     __ bind(LOOP);
 8342     if (usePrefetch) {
 8343       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8344       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8345     }
 8346     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8347     __ eor(tmp1, tmp1, tmp2);
 8348     __ eor(tmp3, tmp3, tmp4);
 8349     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8350     __ orr(tmp1, tmp1, tmp3);
 8351     __ cbnz(tmp1, NOT_EQUAL);
 8352     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8353     __ eor(tmp5, tmp5, tmp6);
 8354     __ eor(tmp7, tmp7, tmp8);
 8355     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8356     __ orr(tmp5, tmp5, tmp7);
 8357     __ cbnz(tmp5, NOT_EQUAL);
 8358     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8359     __ eor(tmp1, tmp1, tmp2);
 8360     __ eor(tmp3, tmp3, tmp4);
 8361     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8362     __ orr(tmp1, tmp1, tmp3);
 8363     __ cbnz(tmp1, NOT_EQUAL);
 8364     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8365     __ eor(tmp5, tmp5, tmp6);
 8366     __ sub(cnt1, cnt1, 8 * wordSize);
 8367     __ eor(tmp7, tmp7, tmp8);
 8368     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
 8371     __ subs(tmp6, cnt1, loopThreshold);
 8372     __ orr(tmp5, tmp5, tmp7);
 8373     __ cbnz(tmp5, NOT_EQUAL);
 8374     __ br(__ GE, LOOP);
 8375     // post-loop
 8376     __ eor(tmp1, tmp1, tmp2);
 8377     __ eor(tmp3, tmp3, tmp4);
 8378     __ orr(tmp1, tmp1, tmp3);
 8379     __ sub(cnt1, cnt1, 2 * wordSize);
 8380     __ cbnz(tmp1, NOT_EQUAL);
 8381   }
 8382 
 8383   void generate_large_array_equals_loop_simd(int loopThreshold,
 8384         bool usePrefetch, Label &NOT_EQUAL) {
 8385     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8386         tmp2 = rscratch2;
 8387     Label LOOP;
 8388 
 8389     __ bind(LOOP);
 8390     if (usePrefetch) {
 8391       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8392       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8393     }
 8394     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8395     __ sub(cnt1, cnt1, 8 * wordSize);
 8396     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 8397     __ subs(tmp1, cnt1, loopThreshold);
 8398     __ eor(v0, __ T16B, v0, v4);
 8399     __ eor(v1, __ T16B, v1, v5);
 8400     __ eor(v2, __ T16B, v2, v6);
 8401     __ eor(v3, __ T16B, v3, v7);
 8402     __ orr(v0, __ T16B, v0, v1);
 8403     __ orr(v1, __ T16B, v2, v3);
 8404     __ orr(v0, __ T16B, v0, v1);
 8405     __ umov(tmp1, v0, __ D, 0);
 8406     __ umov(tmp2, v0, __ D, 1);
 8407     __ orr(tmp1, tmp1, tmp2);
 8408     __ cbnz(tmp1, NOT_EQUAL);
 8409     __ br(__ GE, LOOP);
 8410   }
 8411 
 8412   // a1 = r1 - array1 address
 8413   // a2 = r2 - array2 address
 8414   // result = r0 - return value. Already contains "false"
 8415   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 8416   // r3-r5 are reserved temporary registers
 8417   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
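  //
  // Rough scalar equivalent of the check performed here (illustrative only;
  // remaining_bytes and load64 are made-up names, and the first wordSize bytes
  // are handled outside the stub, as the comments in the body note):
  //
  //   bool equal = true;
  //   for (size_t i = 0; i < remaining_bytes; i += wordSize) {
  //     if (load64(a1 + i) != load64(a2 + i)) { equal = false; break; }
  //   }
  //   // r0 is set to true only if no difference was found
  //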
 8418   address generate_large_array_equals() {
 8419     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8420         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8421         tmp7 = r12, tmp8 = r13;
 8422     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 8423         SMALL_LOOP, POST_LOOP;
 8424     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // enter the prefetch loop only if at least 32 prefetched bytes will actually be used
 8426     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 8427     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 8428     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 8429     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 8430         tmp5, tmp6, tmp7, tmp8);
 8431 
 8432     __ align(CodeEntryAlignment);
 8433 
 8434     StubId stub_id = StubId::stubgen_large_array_equals_id;
 8435     StubCodeMark mark(this, stub_id);
 8436 
 8437     address entry = __ pc();
 8438     __ enter();
 8439     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 8440     // also advance pointers to use post-increment instead of pre-increment
 8441     __ add(a1, a1, wordSize);
 8442     __ add(a2, a2, wordSize);
 8443     if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD/non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so if needed we do one extra
      // 8-byte load to make at least the first source address 16-byte aligned.
 8449       Label ALIGNED16;
 8450       __ tbz(a1, 3, ALIGNED16);
 8451       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8452       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8453       __ sub(cnt1, cnt1, wordSize);
 8454       __ eor(tmp1, tmp1, tmp2);
 8455       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 8456       __ bind(ALIGNED16);
 8457     }
 8458     if (UseSIMDForArrayEquals) {
 8459       if (SoftwarePrefetchHintDistance >= 0) {
 8460         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8461         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8462         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 8463             /* prfm = */ true, NOT_EQUAL);
 8464         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8465         __ br(__ LT, TAIL);
 8466       }
 8467       __ bind(NO_PREFETCH_LARGE_LOOP);
 8468       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 8469           /* prfm = */ false, NOT_EQUAL);
 8470     } else {
 8471       __ push(spilled_regs, sp);
 8472       if (SoftwarePrefetchHintDistance >= 0) {
 8473         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8474         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8475         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 8476             /* prfm = */ true, NOT_EQUAL);
 8477         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8478         __ br(__ LT, TAIL);
 8479       }
 8480       __ bind(NO_PREFETCH_LARGE_LOOP);
 8481       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 8482           /* prfm = */ false, NOT_EQUAL);
 8483     }
 8484     __ bind(TAIL);
 8485       __ cbz(cnt1, EQUAL);
 8486       __ subs(cnt1, cnt1, wordSize);
 8487       __ br(__ LE, POST_LOOP);
 8488     __ bind(SMALL_LOOP);
 8489       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8490       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8491       __ subs(cnt1, cnt1, wordSize);
 8492       __ eor(tmp1, tmp1, tmp2);
 8493       __ cbnz(tmp1, NOT_EQUAL);
 8494       __ br(__ GT, SMALL_LOOP);
 8495     __ bind(POST_LOOP);
 8496       __ ldr(tmp1, Address(a1, cnt1));
 8497       __ ldr(tmp2, Address(a2, cnt1));
 8498       __ eor(tmp1, tmp1, tmp2);
 8499       __ cbnz(tmp1, NOT_EQUAL);
 8500     __ bind(EQUAL);
 8501       __ mov(result, true);
 8502     __ bind(NOT_EQUAL);
 8503       if (!UseSIMDForArrayEquals) {
 8504         __ pop(spilled_regs, sp);
 8505       }
 8506     __ bind(NOT_EQUAL_NO_POP);
 8507     __ leave();
 8508     __ ret(lr);
 8509     return entry;
 8510   }
 8511 
 8512   // result = r0 - return value. Contains initial hashcode value on entry.
 8513   // ary = r1 - array address
 8514   // cnt = r2 - elements count
 8515   // Clobbers: v0-v13, rscratch1, rscratch2
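  //
  // Computes the standard Java polynomial hash over the array elements:
  //
  //   h = result * 31^n + a[0] * 31^(n-1) + a[1] * 31^(n-2) + ... + a[n-1]
  //
  // Sketch of the vectorization used below (shown for the int case; the subword
  // cases additionally split each loaded vector into halves): four partial sums
  // are kept per T4S accumulator; each iteration multiplies the whole accumulator
  // by a power of 31 (vpowm) and adds the next elements, and the epilogue folds
  // the four lanes together with the weights <31^3, 31^2, 31^1, 31^0> (vpow).
  // The large loop additionally unrolls this over vmul0..vmul3 and recombines
  // the accumulators with factors of 31^vf at the end.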
 8516   address generate_large_arrays_hashcode(BasicType eltype) {
 8517     const Register result = r0, ary = r1, cnt = r2;
 8518     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 8519     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 8520     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 8521     const FloatRegister vpowm = v13;
 8522 
 8523     ARRAYS_HASHCODE_REGISTERS;
 8524 
 8525     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 8526 
 8527     unsigned int vf; // vectorization factor
 8528     bool multiply_by_halves;
 8529     Assembler::SIMD_Arrangement load_arrangement;
 8530     switch (eltype) {
 8531     case T_BOOLEAN:
 8532     case T_BYTE:
 8533       load_arrangement = Assembler::T8B;
 8534       multiply_by_halves = true;
 8535       vf = 8;
 8536       break;
 8537     case T_CHAR:
 8538     case T_SHORT:
 8539       load_arrangement = Assembler::T8H;
 8540       multiply_by_halves = true;
 8541       vf = 8;
 8542       break;
 8543     case T_INT:
 8544       load_arrangement = Assembler::T4S;
 8545       multiply_by_halves = false;
 8546       vf = 4;
 8547       break;
 8548     default:
 8549       ShouldNotReachHere();
 8550     }
 8551 
 8552     // Unroll factor
 8553     const unsigned uf = 4;
 8554 
 8555     // Effective vectorization factor
 8556     const unsigned evf = vf * uf;
 8557 
 8558     __ align(CodeEntryAlignment);
 8559 
 8560     StubId stub_id;
 8561     switch (eltype) {
 8562     case T_BOOLEAN:
 8563       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 8564       break;
 8565     case T_BYTE:
 8566       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 8567       break;
 8568     case T_CHAR:
 8569       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 8570       break;
 8571     case T_SHORT:
 8572       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 8573       break;
 8574     case T_INT:
 8575       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 8576       break;
 8577     default:
 8578       stub_id = StubId::NO_STUBID;
 8579       ShouldNotReachHere();
 8580     };
 8581 
 8582     StubCodeMark mark(this, stub_id);
 8583 
 8584     address entry = __ pc();
 8585     __ enter();
 8586 
    // Put the 0th to 3rd powers of 31 together into a single SIMD register. The register is used
    // in the epilogues of both the SMALL and LARGE loops. The initialization is hoisted here, and
    // the register's value does not change throughout either loop.
 8590     __ movw(rscratch1, intpow(31U, 3));
 8591     __ mov(vpow, Assembler::S, 0, rscratch1);
 8592     __ movw(rscratch1, intpow(31U, 2));
 8593     __ mov(vpow, Assembler::S, 1, rscratch1);
 8594     __ movw(rscratch1, intpow(31U, 1));
 8595     __ mov(vpow, Assembler::S, 2, rscratch1);
 8596     __ movw(rscratch1, intpow(31U, 0));
 8597     __ mov(vpow, Assembler::S, 3, rscratch1);
 8598 
 8599     __ mov(vmul0, Assembler::T16B, 0);
 8600     __ mov(vmul0, Assembler::S, 3, result);
 8601 
 8602     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8603     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8604 
 8605     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8606     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8607 
 8608     // SMALL LOOP
 8609     __ bind(SMALL_LOOP);
 8610 
 8611     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8612     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8613     __ subsw(rscratch2, rscratch2, vf);
 8614 
 8615     if (load_arrangement == Assembler::T8B) {
 8616       // Extend 8B to 8H to be able to use vector multiply
 8617       // instructions
 8618       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8619       if (is_signed_subword_type(eltype)) {
 8620         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8621       } else {
 8622         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8623       }
 8624     }
 8625 
 8626     switch (load_arrangement) {
 8627     case Assembler::T4S:
 8628       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8629       break;
 8630     case Assembler::T8B:
 8631     case Assembler::T8H:
 8632       assert(is_subword_type(eltype), "subword type expected");
 8633       if (is_signed_subword_type(eltype)) {
 8634         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8635       } else {
 8636         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8637       }
 8638       break;
 8639     default:
 8640       __ should_not_reach_here();
 8641     }
 8642 
 8643     // Process the upper half of a vector
 8644     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8645       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8646       if (is_signed_subword_type(eltype)) {
 8647         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8648       } else {
 8649         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8650       }
 8651     }
 8652 
 8653     __ br(Assembler::HI, SMALL_LOOP);
 8654 
    // SMALL LOOP'S EPILOGUE
 8656     __ lsr(rscratch2, cnt, exact_log2(evf));
 8657     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8658 
 8659     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8660     __ addv(vmul0, Assembler::T4S, vmul0);
 8661     __ umov(result, vmul0, Assembler::S, 0);
 8662 
 8663     // TAIL
 8664     __ bind(TAIL);
 8665 
    // The andr computes cnt % vf. The subtract, with the count shifted left by 3
    // (one load + madd pair is 8 bytes), moves the branch target past
    // vf - 1 - (cnt % vf) pairs, i.e. only the last cnt % vf load + madd pairs
    // below are executed.
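    // For example (assuming vf == 8), if cnt % vf == 3 the computed branch below
    // lands 3 load + madd pairs before BR_BASE, so exactly the last 3 remaining
    // elements are folded into the hash.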
 8668     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 8669     __ andr(rscratch2, cnt, vf - 1);
 8670     __ bind(TAIL_SHORTCUT);
 8671     __ adr(rscratch1, BR_BASE);
    // For Cortex-A53 the shift is 4, because 2 nops are generated per pair (4 instructions each).
 8673     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 8674     __ movw(rscratch2, 0x1f);
 8675     __ br(rscratch1);
 8676 
 8677     for (size_t i = 0; i < vf - 1; ++i) {
 8678       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8679                                    eltype);
 8680       __ maddw(result, result, rscratch2, rscratch1);
 8681       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 8682       // Generate 2nd nop to have 4 instructions per iteration.
 8683       if (VM_Version::supports_a53mac()) {
 8684         __ nop();
 8685       }
 8686     }
 8687     __ bind(BR_BASE);
 8688 
 8689     __ leave();
 8690     __ ret(lr);
 8691 
 8692     // LARGE LOOP
 8693     __ bind(LARGE_LOOP_PREHEADER);
 8694 
 8695     __ lsr(rscratch2, cnt, exact_log2(evf));
 8696 
 8697     if (multiply_by_halves) {
 8698       // 31^4 - multiplier between lower and upper parts of a register
 8699       __ movw(rscratch1, intpow(31U, vf / 2));
 8700       __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - the remaining part of the per-iteration multiplier, 28 = 32 - 4
 8702       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8703       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8704     } else {
 8705       // 31^16
 8706       __ movw(rscratch1, intpow(31U, evf));
 8707       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8708     }
 8709 
 8710     __ mov(vmul3, Assembler::T16B, 0);
 8711     __ mov(vmul2, Assembler::T16B, 0);
 8712     __ mov(vmul1, Assembler::T16B, 0);
 8713 
 8714     __ bind(LARGE_LOOP);
 8715 
 8716     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8717     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8718     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8719     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8720 
 8721     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8722            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8723 
 8724     if (load_arrangement == Assembler::T8B) {
 8725       // Extend 8B to 8H to be able to use vector multiply
 8726       // instructions
 8727       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8728       if (is_signed_subword_type(eltype)) {
 8729         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8730         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8731         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8732         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8733       } else {
 8734         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8735         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8736         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8737         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8738       }
 8739     }
 8740 
 8741     switch (load_arrangement) {
 8742     case Assembler::T4S:
 8743       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8744       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8745       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8746       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8747       break;
 8748     case Assembler::T8B:
 8749     case Assembler::T8H:
 8750       assert(is_subword_type(eltype), "subword type expected");
 8751       if (is_signed_subword_type(eltype)) {
 8752         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8753         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8754         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8755         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8756       } else {
 8757         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8758         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8759         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8760         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8761       }
 8762       break;
 8763     default:
 8764       __ should_not_reach_here();
 8765     }
 8766 
 8767     // Process the upper half of a vector
 8768     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8769       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8770       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8771       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8772       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8773       if (is_signed_subword_type(eltype)) {
 8774         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8775         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8776         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8777         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8778       } else {
 8779         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8780         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8781         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8782         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8783       }
 8784     }
 8785 
 8786     __ subsw(rscratch2, rscratch2, 1);
 8787     __ br(Assembler::HI, LARGE_LOOP);
 8788 
 8789     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8790     __ addv(vmul3, Assembler::T4S, vmul3);
 8791     __ umov(result, vmul3, Assembler::S, 0);
 8792 
 8793     __ mov(rscratch2, intpow(31U, vf));
 8794 
 8795     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8796     __ addv(vmul2, Assembler::T4S, vmul2);
 8797     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8798     __ maddw(result, result, rscratch2, rscratch1);
 8799 
 8800     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8801     __ addv(vmul1, Assembler::T4S, vmul1);
 8802     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8803     __ maddw(result, result, rscratch2, rscratch1);
 8804 
 8805     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8806     __ addv(vmul0, Assembler::T4S, vmul0);
 8807     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8808     __ maddw(result, result, rscratch2, rscratch1);
 8809 
 8810     __ andr(rscratch2, cnt, vf - 1);
 8811     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8812 
 8813     __ leave();
 8814     __ ret(lr);
 8815 
 8816     return entry;
 8817   }
 8818 
 8819   address generate_dsin_dcos(bool isCos) {
 8820     __ align(CodeEntryAlignment);
 8821     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 8822     StubCodeMark mark(this, stub_id);
 8823     address start = __ pc();
 8824     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8825         (address)StubRoutines::aarch64::_two_over_pi,
 8826         (address)StubRoutines::aarch64::_pio2,
 8827         (address)StubRoutines::aarch64::_dsin_coef,
 8828         (address)StubRoutines::aarch64::_dcos_coef);
 8829     return start;
 8830   }
 8831 
  // Code for comparing 16 characters of a Latin1-encoded string against a UTF-16-encoded string
 8833   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8834       Label &DIFF2) {
 8835     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8836     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8837 
 8838     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8839     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8840     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8841     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 8842 
 8843     __ fmovd(tmpL, vtmp3);
 8844     __ eor(rscratch2, tmp3, tmpL);
 8845     __ cbnz(rscratch2, DIFF2);
 8846 
 8847     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8848     __ umov(tmpL, vtmp3, __ D, 1);
 8849     __ eor(rscratch2, tmpU, tmpL);
 8850     __ cbnz(rscratch2, DIFF1);
 8851 
 8852     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8853     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8854     __ fmovd(tmpL, vtmp);
 8855     __ eor(rscratch2, tmp3, tmpL);
 8856     __ cbnz(rscratch2, DIFF2);
 8857 
 8858     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8859     __ umov(tmpL, vtmp, __ D, 1);
 8860     __ eor(rscratch2, tmpU, tmpL);
 8861     __ cbnz(rscratch2, DIFF1);
 8862   }
 8863 
 8864   // r0  = result
 8865   // r1  = str1
 8866   // r2  = cnt1
 8867   // r3  = str2
 8868   // r4  = cnt2
 8869   // r10 = tmp1
 8870   // r11 = tmp2
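  //
  // isLU == true compares a Latin1 str1 against a UTF-16 str2; the UL variant
  // (isLU == false) is the mirrored case. This reading follows from the pointer
  // adjustments below: the Latin1 side is inflated to UTF-16 with zip1 against
  // a zero register before each 8-character comparison.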
 8871   address generate_compare_long_string_different_encoding(bool isLU) {
 8872     __ align(CodeEntryAlignment);
 8873     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 8874     StubCodeMark mark(this, stub_id);
 8875     address entry = __ pc();
 8876     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8877         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8878         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8879     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8880         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8881     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8882     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8883 
 8884     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8885 
 8886     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the first 4 symbols, which are already loaded (vtmp, and tmp2 (LU) / tmp1 (UL))
 8889     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8890     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8891     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8892     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 8893     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
 8894     __ eor(rscratch2, tmp1, tmp2);
 8895     __ mov(rscratch1, tmp2);
 8896     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8897     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8898              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8899     __ push(spilled_regs, sp);
 8900     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8901     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8902 
 8903     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8904 
 8905     if (SoftwarePrefetchHintDistance >= 0) {
 8906       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8907       __ br(__ LT, NO_PREFETCH);
 8908       __ bind(LARGE_LOOP_PREFETCH);
 8909         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8910         __ mov(tmp4, 2);
 8911         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8912         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8913           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8914           __ subs(tmp4, tmp4, 1);
 8915           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8916           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8917           __ mov(tmp4, 2);
 8918         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8919           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8920           __ subs(tmp4, tmp4, 1);
 8921           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8922           __ sub(cnt2, cnt2, 64);
 8923           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8924           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8925     }
 8926     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8927     __ bind(NO_PREFETCH);
 8928     __ subs(cnt2, cnt2, 16);
 8929     __ br(__ LT, TAIL);
 8930     __ align(OptoLoopAlignment);
 8931     __ bind(SMALL_LOOP); // smaller loop
 8932       __ subs(cnt2, cnt2, 16);
 8933       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8934       __ br(__ GE, SMALL_LOOP);
 8935       __ cmn(cnt2, (u1)16);
 8936       __ br(__ EQ, LOAD_LAST);
 8937     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8938       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8939       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8940       __ ldr(tmp3, Address(cnt1, -8));
 8941       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8942       __ b(LOAD_LAST);
 8943     __ bind(DIFF2);
 8944       __ mov(tmpU, tmp3);
 8945     __ bind(DIFF1);
 8946       __ pop(spilled_regs, sp);
 8947       __ b(CALCULATE_DIFFERENCE);
 8948     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
      // No need to load them again.
 8951       __ mov(tmpU, tmp3);
 8952       __ pop(spilled_regs, sp);
 8953 
 8954       // tmp2 points to the address of the last 4 Latin1 characters right now
 8955       __ ldrs(vtmp, Address(tmp2));
 8956       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8957       __ fmovd(tmpL, vtmp);
 8958 
 8959       __ eor(rscratch2, tmpU, tmpL);
 8960       __ cbz(rscratch2, DONE);
 8961 
 8962     // Find the first different characters in the longwords and
 8963     // compute their difference.
 8964     __ bind(CALCULATE_DIFFERENCE);
 8965       __ rev(rscratch2, rscratch2);
 8966       __ clz(rscratch2, rscratch2);
 8967       __ andr(rscratch2, rscratch2, -16);
 8968       __ lsrv(tmp1, tmp1, rscratch2);
 8969       __ uxthw(tmp1, tmp1);
 8970       __ lsrv(rscratch1, rscratch1, rscratch2);
 8971       __ uxthw(rscratch1, rscratch1);
 8972       __ subw(result, tmp1, rscratch1);
 8973     __ bind(DONE);
 8974       __ ret(lr);
 8975     return entry;
 8976   }
 8977 
 8978   // r0 = input (float16)
 8979   // v0 = result (float)
 8980   // v1 = temporary float register
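  //
  // Converts an IEEE 754 binary16 value held in the low 16 bits of r0 to a
  // binary32 float in v0. For example, the half-precision bit pattern 0x3C00
  // converts to 1.0f and 0xC000 to -2.0f.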
 8981   address generate_float16ToFloat() {
 8982     __ align(CodeEntryAlignment);
 8983     StubId stub_id = StubId::stubgen_hf2f_id;
 8984     StubCodeMark mark(this, stub_id);
 8985     address entry = __ pc();
 8986     BLOCK_COMMENT("Entry:");
 8987     __ flt16_to_flt(v0, r0, v1);
 8988     __ ret(lr);
 8989     return entry;
 8990   }
 8991 
 8992   // v0 = input (float)
 8993   // r0 = result (float16)
 8994   // v1 = temporary float register
 8995   address generate_floatToFloat16() {
 8996     __ align(CodeEntryAlignment);
 8997     StubId stub_id = StubId::stubgen_f2hf_id;
 8998     StubCodeMark mark(this, stub_id);
 8999     address entry = __ pc();
 9000     BLOCK_COMMENT("Entry:");
 9001     __ flt_to_flt16(r0, v0, v1);
 9002     __ ret(lr);
 9003     return entry;
 9004   }
 9005 
 9006   address generate_method_entry_barrier() {
 9007     __ align(CodeEntryAlignment);
 9008     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 9009     StubCodeMark mark(this, stub_id);
 9010 
 9011     Label deoptimize_label;
 9012 
 9013     address start = __ pc();
 9014 
 9015     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 9016 
 9017     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 9018       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 9019       // We can get here despite the nmethod being good, if we have not
 9020       // yet applied our cross modification fence (or data fence).
 9021       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 9022       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 9023       __ ldrw(rscratch2, rscratch2);
 9024       __ strw(rscratch2, thread_epoch_addr);
 9025       __ isb();
 9026       __ membar(__ LoadLoad);
 9027     }
 9028 
 9029     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 9030 
 9031     __ enter();
 9032     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 9033 
 9034     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 9035 
 9036     __ push_call_clobbered_registers();
 9037 
 9038     __ mov(c_rarg0, rscratch2);
 9039     __ call_VM_leaf
 9040          (CAST_FROM_FN_PTR
 9041           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 9042 
 9043     __ reset_last_Java_frame(true);
 9044 
 9045     __ mov(rscratch1, r0);
 9046 
 9047     __ pop_call_clobbered_registers();
 9048 
 9049     __ cbnz(rscratch1, deoptimize_label);
 9050 
 9051     __ leave();
 9052     __ ret(lr);
 9053 
 9054     __ BIND(deoptimize_label);
 9055 
 9056     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 9057     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 9058 
 9059     __ mov(sp, rscratch1);
 9060     __ br(rscratch2);
 9061 
 9062     return start;
 9063   }
 9064 
 9065   // r0  = result
 9066   // r1  = str1
 9067   // r2  = cnt1
 9068   // r3  = str2
 9069   // r4  = cnt2
 9070   // r10 = tmp1
 9071   // r11 = tmp2
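  //
  // Compares two long strings of the same encoding (both Latin1 when isLL,
  // both UTF-16 otherwise) and, like String.compareTo, returns the difference
  // of the first pair of differing characters. If the compared prefixes are
  // equal, the stub returns whatever the caller placed in r0, which is
  // expected to be the length-based result (see the LENGTH_DIFF exit below).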
 9072   address generate_compare_long_string_same_encoding(bool isLL) {
 9073     __ align(CodeEntryAlignment);
 9074     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 9075     StubCodeMark mark(this, stub_id);
 9076     address entry = __ pc();
 9077     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9078         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9079 
 9080     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9081 
    // Exit the large loop when fewer than 64 bytes are left to read or we are
    // about to prefetch memory beyond the array boundary.
 9084     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 9085 
    // The caller has already pre-loaded 8 bytes before jumping to the stub, so compare them directly.
 9087     __ eor(rscratch2, tmp1, tmp2);
 9088     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9089 
 9090     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
    // update pointers to account for the 8 bytes already read
 9092     __ add(str1, str1, wordSize);
 9093     __ add(str2, str2, wordSize);
 9094     if (SoftwarePrefetchHintDistance >= 0) {
 9095       __ align(OptoLoopAlignment);
 9096       __ bind(LARGE_LOOP_PREFETCH);
 9097         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9098         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9099 
 9100         for (int i = 0; i < 4; i++) {
 9101           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9102           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9103           __ cmp(tmp1, tmp2);
 9104           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9105           __ br(Assembler::NE, DIFF);
 9106         }
 9107         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9108         __ add(str1, str1, 64);
 9109         __ add(str2, str2, 64);
 9110         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9111         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9112         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9113     }
 9114 
 9115     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9116     __ br(Assembler::LE, LESS16);
 9117     __ align(OptoLoopAlignment);
 9118     __ bind(LOOP_COMPARE16);
 9119       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9120       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9121       __ cmp(tmp1, tmp2);
 9122       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9123       __ br(Assembler::NE, DIFF);
 9124       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9125       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9126       __ br(Assembler::LT, LESS16);
 9127 
 9128       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9129       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9130       __ cmp(tmp1, tmp2);
 9131       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9132       __ br(Assembler::NE, DIFF);
 9133       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9134       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9135       __ br(Assembler::GE, LOOP_COMPARE16);
 9136       __ cbz(cnt2, LENGTH_DIFF);
 9137 
 9138     __ bind(LESS16);
      // compare 8 bytes (i.e. 8 Latin1 or 4 UTF-16 characters) at a time
 9140       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9141       __ br(Assembler::LE, LESS8);
 9142       __ ldr(tmp1, Address(__ post(str1, 8)));
 9143       __ ldr(tmp2, Address(__ post(str2, 8)));
 9144       __ eor(rscratch2, tmp1, tmp2);
 9145       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9146       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9147 
 9148     __ bind(LESS8); // directly load last 8 bytes
 9149       if (!isLL) {
 9150         __ add(cnt2, cnt2, cnt2);
 9151       }
 9152       __ ldr(tmp1, Address(str1, cnt2));
 9153       __ ldr(tmp2, Address(str2, cnt2));
 9154       __ eor(rscratch2, tmp1, tmp2);
 9155       __ cbz(rscratch2, LENGTH_DIFF);
 9156       __ b(CAL_DIFFERENCE);
 9157 
 9158     __ bind(DIFF);
 9159       __ cmp(tmp1, tmp2);
 9160       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9161       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9162       // reuse rscratch2 register for the result of eor instruction
 9163       __ eor(rscratch2, tmp1, tmp2);
 9164 
 9165     __ bind(CAL_DIFFERENCE);
 9166       __ rev(rscratch2, rscratch2);
 9167       __ clz(rscratch2, rscratch2);
 9168       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9169       __ lsrv(tmp1, tmp1, rscratch2);
 9170       __ lsrv(tmp2, tmp2, rscratch2);
 9171       if (isLL) {
 9172         __ uxtbw(tmp1, tmp1);
 9173         __ uxtbw(tmp2, tmp2);
 9174       } else {
 9175         __ uxthw(tmp1, tmp1);
 9176         __ uxthw(tmp2, tmp2);
 9177       }
 9178       __ subw(result, tmp1, tmp2);
 9179 
 9180     __ bind(LENGTH_DIFF);
 9181       __ ret(lr);
 9182     return entry;
 9183   }
 9184 
 9185   enum string_compare_mode {
 9186     LL,
 9187     LU,
 9188     UL,
 9189     UU,
 9190   };
 9191 
 9192   // The following registers are declared in aarch64.ad
 9193   // r0  = result
 9194   // r1  = str1
 9195   // r2  = cnt1
 9196   // r3  = str2
 9197   // r4  = cnt2
 9198   // r10 = tmp1
 9199   // r11 = tmp2
 9200   // z0  = ztmp1
 9201   // z1  = ztmp2
 9202   // p0  = pgtmp1
 9203   // p1  = pgtmp2
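  //
  // Sketch of the SVE loop below: a predicate generated by sve_whilelt governs
  // each pair of loads; the main loop runs on full vectors while idx stays at
  // least one vector away from cnt, and the post-loop handles the tail with a
  // freshly generated partial predicate. On a mismatch, sve_brkb and sve_lasta
  // extract the first differing pair of characters so their difference can be
  // returned in r0.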
 9204   address generate_compare_long_string_sve(string_compare_mode mode) {
 9205     StubId stub_id;
 9206     switch (mode) {
 9207       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9208       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9209       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9210       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9211       default: ShouldNotReachHere();
 9212     }
 9213 
 9214     __ align(CodeEntryAlignment);
 9215     address entry = __ pc();
 9216     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9217              tmp1 = r10, tmp2 = r11;
 9218 
 9219     Label LOOP, DONE, MISMATCH;
 9220     Register vec_len = tmp1;
 9221     Register idx = tmp2;
 9222     // The minimum of the string lengths has been stored in cnt2.
 9223     Register cnt = cnt2;
 9224     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9225     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9226 
 9227 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9228     switch (mode) {                                                            \
 9229       case LL:                                                                 \
 9230         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9231         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9232         break;                                                                 \
 9233       case LU:                                                                 \
 9234         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9235         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9236         break;                                                                 \
 9237       case UL:                                                                 \
 9238         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9239         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9240         break;                                                                 \
 9241       case UU:                                                                 \
 9242         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9243         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9244         break;                                                                 \
 9245       default:                                                                 \
 9246         ShouldNotReachHere();                                                  \
 9247     }
 9248 
 9249     StubCodeMark mark(this, stub_id);
 9250 
 9251     __ mov(idx, 0);
 9252     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9253 
 9254     if (mode == LL) {
 9255       __ sve_cntb(vec_len);
 9256     } else {
 9257       __ sve_cnth(vec_len);
 9258     }
 9259 
 9260     __ sub(rscratch1, cnt, vec_len);
 9261 
 9262     __ bind(LOOP);
 9263 
 9264       // main loop
 9265       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9266       __ add(idx, idx, vec_len);
 9267       // Compare strings.
 9268       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9269       __ br(__ NE, MISMATCH);
 9270       __ cmp(idx, rscratch1);
 9271       __ br(__ LT, LOOP);
 9272 
 9273     // post loop, last iteration
 9274     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9275 
 9276     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9277     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9278     __ br(__ EQ, DONE);
 9279 
 9280     __ bind(MISMATCH);
 9281 
 9282     // Crop the vector to find its location.
 9283     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9284     // Extract the first different characters of each string.
 9285     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9286     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9287 
 9288     // Compute the difference of the first different characters.
 9289     __ sub(result, rscratch1, rscratch2);
 9290 
 9291     __ bind(DONE);
 9292     __ ret(lr);
 9293 #undef LOAD_PAIR
 9294     return entry;
 9295   }
 9296 
 9297   void generate_compare_long_strings() {
 9298     if (UseSVE == 0) {
 9299       StubRoutines::aarch64::_compare_long_string_LL
 9300           = generate_compare_long_string_same_encoding(true);
 9301       StubRoutines::aarch64::_compare_long_string_UU
 9302           = generate_compare_long_string_same_encoding(false);
 9303       StubRoutines::aarch64::_compare_long_string_LU
 9304           = generate_compare_long_string_different_encoding(true);
 9305       StubRoutines::aarch64::_compare_long_string_UL
 9306           = generate_compare_long_string_different_encoding(false);
 9307     } else {
 9308       StubRoutines::aarch64::_compare_long_string_LL
 9309           = generate_compare_long_string_sve(LL);
 9310       StubRoutines::aarch64::_compare_long_string_UU
 9311           = generate_compare_long_string_sve(UU);
 9312       StubRoutines::aarch64::_compare_long_string_LU
 9313           = generate_compare_long_string_sve(LU);
 9314       StubRoutines::aarch64::_compare_long_string_UL
 9315           = generate_compare_long_string_sve(UL);
 9316     }
 9317   }
 9318 
 9319   // R0 = result
 9320   // R1 = str2
 9321   // R2 = cnt1
 9322   // R3 = str1
 9323   // R4 = cnt2
 9324   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 9325   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8) in order to skip its initial load (helps on systems with a
  // single load pipeline)
  // 2) we can use a "fast" algorithm for locating the first pattern symbol,
  // with fewer branches (1 branch per loaded register instead of a branch per
  // symbol); this is where constants like 0x0101...01, 0x00010001...0001,
  // 0x7f7f...7f, 0x7fff7fff...7fff come from (see the sketch below)
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be reused to search for every occurrence of the 1st character, saving
  // a few loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
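  //
  // The "fast" search of idea 2) is the classic SWAR zero-detection trick; a
  // sketch for the Latin1 case (illustrative C-like pseudocode):
  //
  //   x   = loaded_8_bytes ^ (first_char * 0x0101010101010101); // zero byte <=> match
  //   tmp = (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f);
  //
  // tmp is non-zero iff some byte of x is zero, and its lowest set bit marks
  // the first such byte; candidate positions are then verified by a full
  // comparison, so any false positives in higher bytes are harmless. For
  // UTF-16 the same idea is applied per 16-bit lane with 0x0001...0001 and
  // 0x7fff...7fff.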
 9340   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 9341     StubId stub_id;
 9342     if (str1_isL) {
 9343       if (str2_isL) {
 9344         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
 9345       } else {
 9346         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
 9347       }
 9348     } else {
 9349       if (str2_isL) {
 9350         ShouldNotReachHere();
 9351       } else {
 9352         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
 9353       }
 9354     }
 9355     __ align(CodeEntryAlignment);
 9356     StubCodeMark mark(this, stub_id);
 9357     address entry = __ pc();
 9358 
 9359     int str1_chr_size = str1_isL ? 1 : 2;
 9360     int str2_chr_size = str2_isL ? 1 : 2;
 9361     int str1_chr_shift = str1_isL ? 0 : 1;
 9362     int str2_chr_shift = str2_isL ? 0 : 1;
 9363     bool isL = str1_isL && str2_isL;
    // parameters
 9365     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 9366     // temporary registers
 9367     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 9368     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 9369     // redefinitions
 9370     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 9371 
 9372     __ push(spilled_regs, sp);
 9373     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 9374         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 9375         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 9376         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 9377         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 9378         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 9379     // Read whole register from str1. It is safe, because length >=8 here
 9380     __ ldr(ch1, Address(str1));
 9381     // Read whole register from str2. It is safe, because length >=8 here
 9382     __ ldr(ch2, Address(str2));
 9383     __ sub(cnt2, cnt2, cnt1);
 9384     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 9385     if (str1_isL != str2_isL) {
 9386       __ eor(v0, __ T16B, v0, v0);
 9387     }
 9388     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 9389     __ mul(first, first, tmp1);
 9390     // check if we have less than 1 register to check
 9391     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 9392     if (str1_isL != str2_isL) {
 9393       __ fmovd(v1, ch1);
 9394     }
 9395     __ br(__ LE, L_SMALL);
 9396     __ eor(ch2, first, ch2);
 9397     if (str1_isL != str2_isL) {
 9398       __ zip1(v1, __ T16B, v1, v0);
 9399     }
 9400     __ sub(tmp2, ch2, tmp1);
 9401     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9402     __ bics(tmp2, tmp2, ch2);
 9403     if (str1_isL != str2_isL) {
 9404       __ fmovd(ch1, v1);
 9405     }
 9406     __ br(__ NE, L_HAS_ZERO);
 9407     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9408     __ add(result, result, wordSize/str2_chr_size);
 9409     __ add(str2, str2, wordSize);
 9410     __ br(__ LT, L_POST_LOOP);
 9411     __ BIND(L_LOOP);
 9412       __ ldr(ch2, Address(str2));
 9413       __ eor(ch2, first, ch2);
 9414       __ sub(tmp2, ch2, tmp1);
 9415       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9416       __ bics(tmp2, tmp2, ch2);
 9417       __ br(__ NE, L_HAS_ZERO);
 9418     __ BIND(L_LOOP_PROCEED);
 9419       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9420       __ add(str2, str2, wordSize);
 9421       __ add(result, result, wordSize/str2_chr_size);
 9422       __ br(__ GE, L_LOOP);
 9423     __ BIND(L_POST_LOOP);
 9424       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 9425       __ br(__ LE, NOMATCH);
 9426       __ ldr(ch2, Address(str2));
 9427       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9428       __ eor(ch2, first, ch2);
 9429       __ sub(tmp2, ch2, tmp1);
 9430       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9431       __ mov(tmp4, -1); // all bits set
 9432       __ b(L_SMALL_PROCEED);
 9433     __ align(OptoLoopAlignment);
 9434     __ BIND(L_SMALL);
 9435       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9436       __ eor(ch2, first, ch2);
 9437       if (str1_isL != str2_isL) {
 9438         __ zip1(v1, __ T16B, v1, v0);
 9439       }
 9440       __ sub(tmp2, ch2, tmp1);
 9441       __ mov(tmp4, -1); // all bits set
 9442       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9443       if (str1_isL != str2_isL) {
 9444         __ fmovd(ch1, v1); // move converted 4 symbols
 9445       }
 9446     __ BIND(L_SMALL_PROCEED);
 9447       __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the unused bit positions
 9448       __ bic(tmp2, tmp2, ch2);
 9449       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9450       __ rbit(tmp2, tmp2);
 9451       __ br(__ EQ, NOMATCH);
 9452     __ BIND(L_SMALL_HAS_ZERO_LOOP);
 9453       __ clz(tmp4, tmp2); // potentially slow: up to 4 cycles on some CPUs
 9454       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9455       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9456       if (str2_isL) { // LL
 9457         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9458         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9459         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9460         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9461         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9462       } else {
 9463         __ mov(ch2, 0xE); // 0b1110: rounds the byte offset down to the matching 2-byte char
 9464         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9465         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9466         __ lslv(tmp2, tmp2, tmp4);
 9467         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9468         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9469         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9470         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9471       }
 9472       __ cmp(ch1, ch2);
 9473       __ mov(tmp4, wordSize/str2_chr_size);
 9474       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9475     __ BIND(L_SMALL_CMP_LOOP);
 9476       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9477                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9478       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9479                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9480       __ add(tmp4, tmp4, 1);
 9481       __ cmp(tmp4, cnt1);
 9482       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9483       __ cmp(first, ch2);
 9484       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9485     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9486       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9487       __ clz(tmp4, tmp2);
 9488       __ add(result, result, 1); // advance index
 9489       __ add(str2, str2, str2_chr_size); // advance pointer
 9490       __ b(L_SMALL_HAS_ZERO_LOOP);
 9491     __ align(OptoLoopAlignment);
 9492     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9493       __ cmp(first, ch2);
 9494       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9495       __ b(DONE);
 9496     __ align(OptoLoopAlignment);
 9497     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9498       if (str2_isL) { // LL
 9499         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9500         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9501         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9502         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9503         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9504       } else {
 9505         __ mov(ch2, 0xE); // 0b1110: rounds the byte offset down to the matching 2-byte char
 9506         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9507         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9508         __ lslv(tmp2, tmp2, tmp4);
 9509         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9510         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9511         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9512         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9513       }
 9514       __ cmp(ch1, ch2);
 9515       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9516       __ b(DONE);
 9517     __ align(OptoLoopAlignment);
 9518     __ BIND(L_HAS_ZERO);
 9519       __ rbit(tmp2, tmp2);
 9520       __ clz(tmp4, tmp2); // potentially slow: up to 4 cycles on some CPUs
 9521       // Compress the two counters (cnt2 and cnt1) into one register. This is
 9522       // fine because both counters are 32-bit and are not changed in this loop;
 9523       // they are restored on exit, so cnt1 can be reused inside the loop.
 9524       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 9525       __ sub(result, result, 1);
 9526     __ BIND(L_HAS_ZERO_LOOP);
 9527       __ mov(cnt1, wordSize/str2_chr_size);
 9528       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9529       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9530       if (str2_isL) {
 9531         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9532         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9533         __ lslv(tmp2, tmp2, tmp4);
 9534         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9535         __ add(tmp4, tmp4, 1);
 9536         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9537         __ lsl(tmp2, tmp2, 1);
 9538         __ mov(tmp4, wordSize/str2_chr_size);
 9539       } else {
 9540         __ mov(ch2, 0xE);
 9541         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9542         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9543         __ lslv(tmp2, tmp2, tmp4);
 9544         __ add(tmp4, tmp4, 1);
 9545         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9546         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9547         __ lsl(tmp2, tmp2, 1);
 9548         __ mov(tmp4, wordSize/str2_chr_size);
 9549         __ sub(str2, str2, str2_chr_size);
 9550       }
 9551       __ cmp(ch1, ch2);
 9552       __ mov(tmp4, wordSize/str2_chr_size);
 9553       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9554     __ BIND(L_CMP_LOOP);
 9555       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9556                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9557       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9558                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9559       __ add(tmp4, tmp4, 1);
 9560       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9561       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9562       __ cmp(cnt1, ch2);
 9563       __ br(__ EQ, L_CMP_LOOP);
 9564     __ BIND(L_CMP_LOOP_NOMATCH);
 9565       // no match at this position
 9566       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9567       __ clz(tmp4, tmp2);
 9568       __ add(str2, str2, str2_chr_size); // advance pointer
 9569       __ b(L_HAS_ZERO_LOOP);
 9570     __ align(OptoLoopAlignment);
 9571     __ BIND(L_CMP_LOOP_LAST_CMP);
 9572       __ cmp(cnt1, ch2);
 9573       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9574       __ b(DONE);
 9575     __ align(OptoLoopAlignment);
 9576     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9577       if (str2_isL) {
 9578         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9579         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9580         __ lslv(tmp2, tmp2, tmp4);
 9581         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9582         __ add(tmp4, tmp4, 1);
 9583         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9584         __ lsl(tmp2, tmp2, 1);
 9585       } else {
 9586         __ mov(ch2, 0xE);
 9587         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9588         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9589         __ lslv(tmp2, tmp2, tmp4);
 9590         __ add(tmp4, tmp4, 1);
 9591         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9592         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9593         __ lsl(tmp2, tmp2, 1);
 9594         __ sub(str2, str2, str2_chr_size);
 9595       }
 9596       __ cmp(ch1, ch2);
 9597       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9598       __ b(DONE);
 9599     __ align(OptoLoopAlignment);
 9600     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
 9601       // 1) Restore the "result" index. Until the L_HAS_ZERO block the index was a
 9602       //    multiple of wordSize/str2_chr_size. One byte octet was analyzed in
 9603       //    L_HAS_ZERO_LOOP, so result grew by at most wordSize/str2_chr_size - 1
 9604       //    and the higher bits did not change. L_LOOP_PROCEED will increase result
 9605       //    by the number of analyzed characters, so we can simply clear the lower
 9606       //    bits of result here: 2 lower bits for UU/UL, 3 bits for LL.
 9607       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
 9608       // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
 9609       //    (UU/UL) is the index of the last analyzed position inside the current
 9610       //    octet, so str2 is at that octet's start address; advance it to the next octet.
 9611       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9612       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9613       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9614       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9615       __ movw(cnt2, cnt2);
 9616       __ b(L_LOOP_PROCEED);
 9617     __ align(OptoLoopAlignment);
 9618     __ BIND(NOMATCH);
 9619       __ mov(result, -1);
 9620     __ BIND(DONE);
 9621       __ pop(spilled_regs, sp);
 9622       __ ret(lr);
 9623     return entry;
 9624   }
 9625 
 9626   void generate_string_indexof_stubs() {
 9627     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9628     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9629     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9630   }
 9631 
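        // Inflate 32 Latin-1 bytes held in src1:src2 to 64 bytes of UTF-16 by
        // interleaving each byte with a zero byte from v0 (zip1/zip2), then store
        // the four resulting vectors to dst (r1) with a single st1.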
 9632   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9633       FloatRegister src1, FloatRegister src2) {
 9634     Register dst = r1;
 9635     __ zip1(v1, __ T16B, src1, v0);
 9636     __ zip2(v2, __ T16B, src1, v0);
 9637     if (generatePrfm) {
 9638       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9639     }
 9640     __ zip1(v3, __ T16B, src2, v0);
 9641     __ zip2(v4, __ T16B, src2, v0);
 9642     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9643   }
 9644 
 9645   // R0 = src
 9646   // R1 = dst
 9647   // R2 = len
 9648   // R3 = len >> 3
 9649   // V0 = 0
 9650   // v1 = loaded 8 bytes
 9651   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 9652   address generate_large_byte_array_inflate() {
 9653     __ align(CodeEntryAlignment);
 9654     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
 9655     StubCodeMark mark(this, stub_id);
 9656     address entry = __ pc();
 9657     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9658     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9659     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9660 
 9661     // do one more 8-byte read so that the address is 16-byte aligned in most
 9662     // cases; this also lets us use a single store instruction
 9663     __ ldrd(v2, __ post(src, 8));
 9664     __ sub(octetCounter, octetCounter, 2);
 9665     __ zip1(v1, __ T16B, v1, v0);
 9666     __ zip1(v2, __ T16B, v2, v0);
 9667     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9668     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9669     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9670     __ br(__ LE, LOOP_START);
 9671     __ b(LOOP_PRFM_START);
 9672     __ bind(LOOP_PRFM);
 9673       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9674     __ bind(LOOP_PRFM_START);
 9675       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9676       __ sub(octetCounter, octetCounter, 8);
 9677       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9678       inflate_and_store_2_fp_registers(true, v3, v4);
 9679       inflate_and_store_2_fp_registers(true, v5, v6);
 9680       __ br(__ GT, LOOP_PRFM);
 9681       __ cmp(octetCounter, (u1)8);
 9682       __ br(__ LT, DONE);
 9683     __ bind(LOOP);
 9684       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9685       __ bind(LOOP_START);
 9686       __ sub(octetCounter, octetCounter, 8);
 9687       __ cmp(octetCounter, (u1)8);
 9688       inflate_and_store_2_fp_registers(false, v3, v4);
 9689       inflate_and_store_2_fp_registers(false, v5, v6);
 9690       __ br(__ GE, LOOP);
 9691     __ bind(DONE);
 9692       __ ret(lr);
 9693     return entry;
 9694   }
 9695 
 9696   /**
 9697    *  Arguments:
 9698    *
 9699    *  Input:
 9700    *  c_rarg0   - current state address
 9701    *  c_rarg1   - H key address
 9702    *  c_rarg2   - data address
 9703    *  c_rarg3   - number of blocks
 9704    *
 9705    *  Output:
 9706    *  Updated state at c_rarg0
 9707    */
 9708   address generate_ghash_processBlocks() {
 9709     // Bafflingly, GCM uses little-endian for the byte order, but
 9710     // big-endian for the bit order.  For example, the polynomial 1 is
 9711     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9712     //
 9713     // So, we must either reverse the bytes in each word and do
 9714     // everything big-endian or reverse the bits in each byte and do
 9715     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9716     // the bits in each byte (we have an instruction, RBIT, to do
 9717     // that) and keep the data in little-endian bit order through the
 9718     // calculation, bit-reversing the inputs and outputs.
 9719 
 9720     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
 9721     StubCodeMark mark(this, stub_id);
 9722     Label polynomial; // local data generated at end of stub
 9723     __ align(CodeEntryAlignment);
 9724     address start = __ pc();
 9725 
 9726     Register state   = c_rarg0;
 9727     Register subkeyH = c_rarg1;
 9728     Register data    = c_rarg2;
 9729     Register blocks  = c_rarg3;
 9730 
 9731     FloatRegister vzr = v30;
 9732     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9733 
 9734     __ adr(rscratch1, polynomial);
 9735     __ ldrq(v24, rscratch1);    // The field polynomial
 9736 
 9737     __ ldrq(v0, Address(state));
 9738     __ ldrq(v1, Address(subkeyH));
 9739 
 9740     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9741     __ rbit(v0, __ T16B, v0);
 9742     __ rev64(v1, __ T16B, v1);
 9743     __ rbit(v1, __ T16B, v1);
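          // rev64 reverses the byte order within each 64-bit half and rbit reverses
          // the bits within each byte; together they bit-reverse each 64-bit half.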
 9744 
 9745     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
 9746     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
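          // v4 = A1 ^ A0 is computed once here and reused as the Karatsuba
          // middle-term operand on every iteration of the loop below.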
 9747 
 9748     {
 9749       Label L_ghash_loop;
 9750       __ bind(L_ghash_loop);
 9751 
 9752       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9753                                                  // reversing each byte
 9754       __ rbit(v2, __ T16B, v2);
 9755       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9756 
 9757       // Multiply state in v2 by subkey in v1
 9758       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9759                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9760                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9761       // Reduce v7:v5 by the field polynomial
 9762       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9763 
 9764       __ sub(blocks, blocks, 1);
 9765       __ cbnz(blocks, L_ghash_loop);
 9766     }
 9767 
 9768     // The bit-reversed result is at this point in v0
 9769     __ rev64(v0, __ T16B, v0);
 9770     __ rbit(v0, __ T16B, v0);
 9771 
 9772     __ st1(v0, __ T16B, state);
 9773     __ ret(lr);
 9774 
 9775     // bind label and generate local polynomial data
 9776     __ align(wordSize * 2);
 9777     __ bind(polynomial);
 9778     __ emit_int64(0x87);  // The low-order bits of the field
 9779                           // polynomial (i.e. p = z^7+z^2+z+1)
 9780                           // repeated in the low and high parts of a
 9781                           // 128-bit vector
 9782     __ emit_int64(0x87);
 9783 
 9784     return start;
 9785   }
 9786 
 9787   address generate_ghash_processBlocks_wide() {
 9788     address small = generate_ghash_processBlocks();
 9789 
 9790     StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
 9791     StubCodeMark mark(this, stub_id);
 9792     Label polynomial;           // local data generated after stub
 9793     __ align(CodeEntryAlignment);
 9794     address start = __ pc();
 9795 
 9796     Register state   = c_rarg0;
 9797     Register subkeyH = c_rarg1;
 9798     Register data    = c_rarg2;
 9799     Register blocks  = c_rarg3;
 9800 
 9801     const int unroll = 4;
 9802 
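          // Inputs with fewer than 2 * unroll blocks are not worth the unrolled
          // path: tail-call the single-block stub generated above.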
 9803     __ cmp(blocks, (unsigned char)(unroll * 2));
 9804     __ br(__ LT, small);
 9805 
 9806     if (unroll > 1) {
 9807       // Save the callee-saved SIMD registers (v8-v15) before the routine clobbers them
 9808       __ sub(sp, sp, 4 * 16);
 9809       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9810       __ sub(sp, sp, 4 * 16);
 9811       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9812     }
 9813 
 9814     __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
 9815 
 9816     if (unroll > 1) {
 9817       // And restore the callee-saved SIMD registers
 9818       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9819       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9820     }
 9821 
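          // Any blocks left over by the unrolled loop are finished by the
          // single-block stub.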
 9822     __ cmp(blocks, (unsigned char)0);
 9823     __ br(__ GT, small);
 9824 
 9825     __ ret(lr);
 9826 
 9827     // bind label and generate polynomial data
 9828     __ align(wordSize * 2);
 9829     __ bind(polynomial);
 9830     __ emit_int64(0x87);  // The low-order bits of the field
 9831                           // polynomial (i.e. p = z^7+z^2+z+1)
 9832                           // repeated in the low and high parts of a
 9833                           // 128-bit vector
 9834     __ emit_int64(0x87);
 9835 
 9836     return start;
 9837 
 9838   }
 9839 
 9840   void generate_base64_encode_simdround(Register src, Register dst,
 9841         FloatRegister codec, u8 size) {
 9842 
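          // 'size' selects the vector arrangement (16B or 8B): each round consumes
          // 3 * size input bytes and produces 4 * size Base64 characters.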
 9843     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9844     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9845     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 9846 
 9847     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9848 
 9849     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 9850 
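          // Split each group of three input bytes into four 6-bit indices:
          //   ind0 = in0 >> 2
          //   ind1 = ((in0 & 0x3) << 4) | (in1 >> 4)
          //   ind2 = ((in1 & 0xf) << 2) | (in2 >> 6)
          //   ind3 = in2 & 0x3f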
 9851     __ ushr(ind0, arrangement, in0,  2);
 9852 
 9853     __ ushr(ind1, arrangement, in1,  2);
 9854     __ shl(in0,   arrangement, in0,  6);
 9855     __ orr(ind1,  arrangement, ind1, in0);
 9856     __ ushr(ind1, arrangement, ind1, 2);
 9857 
 9858     __ ushr(ind2, arrangement, in2,  4);
 9859     __ shl(in1,   arrangement, in1,  4);
 9860     __ orr(ind2,  arrangement, in1,  ind2);
 9861     __ ushr(ind2, arrangement, ind2, 2);
 9862 
 9863     __ shl(ind3,  arrangement, in2,  2);
 9864     __ ushr(ind3, arrangement, ind3, 2);
 9865 
 9866     __ tbl(out0,  arrangement, codec,  4, ind0);
 9867     __ tbl(out1,  arrangement, codec,  4, ind1);
 9868     __ tbl(out2,  arrangement, codec,  4, ind2);
 9869     __ tbl(out3,  arrangement, codec,  4, ind3);
 9870 
 9871     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9872   }
 9873 
 9874   /**
 9875    *  Arguments:
 9876    *
 9877    *  Input:
 9878    *  c_rarg0   - src_start
 9879    *  c_rarg1   - src_offset
 9880    *  c_rarg2   - src_length
 9881    *  c_rarg3   - dest_start
 9882    *  c_rarg4   - dest_offset
 9883    *  c_rarg5   - isURL
 9884    *
 9885    */
 9886   address generate_base64_encodeBlock() {
 9887 
 9888     static const char toBase64[64] = {
 9889       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9890       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9891       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9892       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9893       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9894     };
 9895 
 9896     static const char toBase64URL[64] = {
 9897       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9898       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9899       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9900       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9901       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9902     };
 9903 
 9904     __ align(CodeEntryAlignment);
 9905     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
 9906     StubCodeMark mark(this, stub_id);
 9907     address start = __ pc();
 9908 
 9909     Register src   = c_rarg0;  // source array
 9910     Register soff  = c_rarg1;  // source start offset
 9911     Register send  = c_rarg2;  // source end offset
 9912     Register dst   = c_rarg3;  // dest array
 9913     Register doff  = c_rarg4;  // position for writing to dest array
 9914     Register isURL = c_rarg5;  // Base64 or URL character set
 9915 
 9916     // c_rarg6 and c_rarg7 are free to use as temps
 9917     Register codec  = c_rarg6;
 9918     Register length = c_rarg7;
 9919 
 9920     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9921 
 9922     __ add(src, src, soff);
 9923     __ add(dst, dst, doff);
 9924     __ sub(length, send, soff);
 9925 
 9926     // load the codec base address
 9927     __ lea(codec, ExternalAddress((address) toBase64));
 9928     __ cbz(isURL, ProcessData);
 9929     __ lea(codec, ExternalAddress((address) toBase64URL));
 9930 
 9931     __ BIND(ProcessData);
 9932 
 9933     // too short to form a SIMD loop; fall back to the 3-byte scalar loop
 9934     __ cmp(length, (u1)24);
 9935     __ br(Assembler::LT, Process3B);
 9936 
 9937     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9938 
 9939     __ BIND(Process48B);
 9940     __ cmp(length, (u1)48);
 9941     __ br(Assembler::LT, Process24B);
 9942     generate_base64_encode_simdround(src, dst, v0, 16);
 9943     __ sub(length, length, 48);
 9944     __ b(Process48B);
 9945 
 9946     __ BIND(Process24B);
 9947     __ cmp(length, (u1)24);
 9948     __ br(Assembler::LT, SIMDExit);
 9949     generate_base64_encode_simdround(src, dst, v0, 8);
 9950     __ sub(length, length, 24);
 9951 
 9952     __ BIND(SIMDExit);
 9953     __ cbz(length, Exit);
 9954 
 9955     __ BIND(Process3B);
 9956     //  3 src bytes, 24 bits
 9957     __ ldrb(r10, __ post(src, 1));
 9958     __ ldrb(r11, __ post(src, 1));
 9959     __ ldrb(r12, __ post(src, 1));
 9960     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9961     __ orrw(r12, r12, r11, Assembler::LSL, 8);
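          // r12 = (b0 << 16) | (b1 << 8) | b2 -- the 24-bit group to split into
          // four 6-bit values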
 9962     // codec index
 9963     __ ubfmw(r15, r12, 18, 23);
 9964     __ ubfmw(r14, r12, 12, 17);
 9965     __ ubfmw(r13, r12, 6,  11);
 9966     __ andw(r12,  r12, 63);
 9967     // get the code based on the codec
 9968     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9969     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9970     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9971     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9972     __ strb(r15, __ post(dst, 1));
 9973     __ strb(r14, __ post(dst, 1));
 9974     __ strb(r13, __ post(dst, 1));
 9975     __ strb(r12, __ post(dst, 1));
 9976     __ sub(length, length, 3);
 9977     __ cbnz(length, Process3B);
 9978 
 9979     __ BIND(Exit);
 9980     __ ret(lr);
 9981 
 9982     return start;
 9983   }
 9984 
 9985   void generate_base64_decode_simdround(Register src, Register dst,
 9986         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9987 
 9988     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9989     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9990 
 9991     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9992     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9993 
 9994     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9995 
 9996     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9997 
 9998     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9999 
10000     // we need an unsigned saturating subtract to make sure all input values
10001     // in the range [0, 63] map to 0 in the higher-half lookup
10002     __ uqsubv(decH0, __ T16B, in0, v27);
10003     __ uqsubv(decH1, __ T16B, in1, v27);
10004     __ uqsubv(decH2, __ T16B, in2, v27);
10005     __ uqsubv(decH3, __ T16B, in3, v27);
10006 
10007     // lower half lookup
10008     __ tbl(decL0, arrangement, codecL, 4, in0);
10009     __ tbl(decL1, arrangement, codecL, 4, in1);
10010     __ tbl(decL2, arrangement, codecL, 4, in2);
10011     __ tbl(decL3, arrangement, codecL, 4, in3);
10012 
10013     // higher half lookup
10014     __ tbx(decH0, arrangement, codecH, 4, decH0);
10015     __ tbx(decH1, arrangement, codecH, 4, decH1);
10016     __ tbx(decH2, arrangement, codecH, 4, decH2);
10017     __ tbx(decH3, arrangement, codecH, 4, decH3);
10018 
10019     // combine lower and higher
10020     __ orr(decL0, arrangement, decL0, decH0);
10021     __ orr(decL1, arrangement, decL1, decH1);
10022     __ orr(decL2, arrangement, decL2, decH2);
10023     __ orr(decL3, arrangement, decL3, decH3);
10024 
10025     // check illegal inputs, value larger than 63 (maximum of 6 bits)
10026     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10027     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10028     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10029     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10030     __ orr(in0, arrangement, decH0, decH1);
10031     __ orr(in1, arrangement, decH2, decH3);
10032     __ orr(in2, arrangement, in0,   in1);
10033     __ umaxv(in3, arrangement, in2);
10034     __ umov(rscratch2, in3, __ B, 0);
10035 
10036     // get the data to output
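          //   out0 = (decL0 << 2) | (decL1 >> 4)
          //   out1 = (decL1 << 4) | (decL2 >> 2)
          //   out2 = (decL2 << 6) | decL3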
10037     __ shl(out0,  arrangement, decL0, 2);
10038     __ ushr(out1, arrangement, decL1, 4);
10039     __ orr(out0,  arrangement, out0,  out1);
10040     __ shl(out1,  arrangement, decL1, 4);
10041     __ ushr(out2, arrangement, decL2, 2);
10042     __ orr(out1,  arrangement, out1,  out2);
10043     __ shl(out2,  arrangement, decL2, 6);
10044     __ orr(out2,  arrangement, out2,  decL3);
10045 
10046     __ cbz(rscratch2, NoIllegalData);
10047 
10048     // handle illegal input
10049     __ umov(r10, in2, __ D, 0);
10050     if (size == 16) {
10051       __ cbnz(r10, ErrorInLowerHalf);
10052 
10053       // illegal input is in higher half, store the lower half now.
10054       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10055 
10056       __ umov(r10, in2,  __ D, 1);
10057       __ umov(r11, out0, __ D, 1);
10058       __ umov(r12, out1, __ D, 1);
10059       __ umov(r13, out2, __ D, 1);
10060       __ b(StoreLegalData);
10061 
10062       __ BIND(ErrorInLowerHalf);
10063     }
10064     __ umov(r11, out0, __ D, 0);
10065     __ umov(r12, out1, __ D, 0);
10066     __ umov(r13, out2, __ D, 0);
10067 
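          // Each byte lane of r10 flags a group of four input characters;
          // r11/r12/r13 hold the corresponding three decoded bytes. Store groups
          // one at a time, shifting all four registers in step, and exit at the
          // first flagged (0xff) group so dst ends just past the last legal group.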
10068     __ BIND(StoreLegalData);
10069     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10070     __ strb(r11, __ post(dst, 1));
10071     __ strb(r12, __ post(dst, 1));
10072     __ strb(r13, __ post(dst, 1));
10073     __ lsr(r10, r10, 8);
10074     __ lsr(r11, r11, 8);
10075     __ lsr(r12, r12, 8);
10076     __ lsr(r13, r13, 8);
10077     __ b(StoreLegalData);
10078 
10079     __ BIND(NoIllegalData);
10080     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10081   }
10082 
10083 
10084   /**
10085    *  Arguments:
10086    *
10087    *  Input:
10088    *  c_rarg0   - src_start
10089    *  c_rarg1   - src_offset
10090    *  c_rarg2   - src_length
10091    *  c_rarg3   - dest_start
10092    *  c_rarg4   - dest_offset
10093    *  c_rarg5   - isURL
10094    *  c_rarg6   - isMIME
10095    *
10096    */
10097   address generate_base64_decodeBlock() {
10098 
10099     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10100     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10101     // titled "Base64 decoding".
10102 
10103     // Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
10104     // except that the trailing character '=' is also treated as an illegal value in this intrinsic.
10105     // That is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
10106     static const uint8_t fromBase64ForNoSIMD[256] = {
10107       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10108       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10109       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10110        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10111       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10112        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
10113       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10114        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10115       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10116       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10117       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10118       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10119       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10120       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10121       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10122       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10123     };
10124 
10125     static const uint8_t fromBase64URLForNoSIMD[256] = {
10126       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10127       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10128       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10129        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10130       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10131        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
10132       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10133        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10134       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10135       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10136       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10137       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10138       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10139       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10140       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10141       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10142     };
10143 
10144     // A legal Base64 code value is in the range [0, 127]. We need two lookups
10145     // with tbl/tbx and combine them to get the decoded data. The first table
10146     // vector lookup uses tbl: out-of-range indices are set to 0 in the
10147     // destination. The second table vector lookup uses tbx: out-of-range
10148     // indices leave the destination unchanged. Input values [64, 126] map to
10149     // indices [65, 127] in the second lookup. The value at index 64 is set to
10150     // 0, so that we know the first lookup already produced the decoded data.
10151     static const uint8_t fromBase64ForSIMD[128] = {
10152       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10153       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10154       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10155        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10156         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10157        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10158       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10159        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10160     };
10161 
10162     static const uint8_t fromBase64URLForSIMD[128] = {
10163       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10164       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10165       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10166        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10167         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10168        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10169        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10170        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10171     };
10172 
10173     __ align(CodeEntryAlignment);
10174     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10175     StubCodeMark mark(this, stub_id);
10176     address start = __ pc();
10177 
10178     Register src    = c_rarg0;  // source array
10179     Register soff   = c_rarg1;  // source start offset
10180     Register send   = c_rarg2;  // source end offset
10181     Register dst    = c_rarg3;  // dest array
10182     Register doff   = c_rarg4;  // position for writing to dest array
10183     Register isURL  = c_rarg5;  // Base64 or URL character set
10184     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10185 
10186     Register length = send;    // reuse send as length of source data to process
10187 
10188     Register simd_codec   = c_rarg6;
10189     Register nosimd_codec = c_rarg7;
10190 
10191     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10192 
10193     __ enter();
10194 
10195     __ add(src, src, soff);
10196     __ add(dst, dst, doff);
10197 
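          // remember the original dst so that the number of bytes written can be
          // computed (and returned in r0) on exit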
10198     __ mov(doff, dst);
10199 
10200     __ sub(length, send, soff);
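          // round the length down to a multiple of 4 by clearing its two low bits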
10201     __ bfm(length, zr, 0, 1);
10202 
10203     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10204     __ cbz(isURL, ProcessData);
10205     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10206 
10207     __ BIND(ProcessData);
10208     __ mov(rscratch1, length);
10209     __ cmp(length, (u1)144); // 144 = 80 + 64
10210     __ br(Assembler::LT, Process4B);
10211 
10212     // In the MIME case, the line length cannot be more than 76
10213     // bytes (see RFC 2045). This is too short a block for SIMD
10214     // to be worthwhile, so we use non-SIMD here.
10215     __ movw(rscratch1, 79);
10216 
10217     __ BIND(Process4B);
10218     __ ldrw(r14, __ post(src, 4));
10219     __ ubfxw(r10, r14, 0,  8);
10220     __ ubfxw(r11, r14, 8,  8);
10221     __ ubfxw(r12, r14, 16, 8);
10222     __ ubfxw(r13, r14, 24, 8);
10223     // get the de-code
10224     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10225     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10226     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10227     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10228     // error detection, 255u indicates an illegal input
10229     __ orrw(r14, r10, r11);
10230     __ orrw(r15, r12, r13);
10231     __ orrw(r14, r14, r15);
10232     __ tbnz(r14, 7, Exit);
10233     // recover the data
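          // combine the four 6-bit values d0..d3 (r10..r13) into three output bytes:
          //   (d0 << 2) | (d1 >> 4), ((d1 & 0xf) << 4) | (d2 >> 2), ((d2 & 0x3) << 6) | d3
          // (the rev16w fixes the byte order before the halfword store)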
10234     __ lslw(r14, r10, 10);
10235     __ bfiw(r14, r11, 4, 6);
10236     __ bfmw(r14, r12, 2, 5);
10237     __ rev16w(r14, r14);
10238     __ bfiw(r13, r12, 6, 2);
10239     __ strh(r14, __ post(dst, 2));
10240     __ strb(r13, __ post(dst, 1));
10241     // non-simd loop
10242     __ subsw(rscratch1, rscratch1, 4);
10243     __ br(Assembler::GT, Process4B);
10244 
10245     // if we pre-processed the first 80 bytes above (rscratch1 was set to 79),
10246     // rscratch1 == -1 here; otherwise, rscratch1 == 0.
10247     __ cbzw(rscratch1, Exit);
10248     __ sub(length, length, 80);
10249 
10250     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10251     __ cbz(isURL, SIMDEnter);
10252     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10253 
10254     __ BIND(SIMDEnter);
10255     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10256     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10257     __ mov(rscratch1, 63);
10258     __ dup(v27, __ T16B, rscratch1);
10259 
10260     __ BIND(Process64B);
10261     __ cmp(length, (u1)64);
10262     __ br(Assembler::LT, Process32B);
10263     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10264     __ sub(length, length, 64);
10265     __ b(Process64B);
10266 
10267     __ BIND(Process32B);
10268     __ cmp(length, (u1)32);
10269     __ br(Assembler::LT, SIMDExit);
10270     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10271     __ sub(length, length, 32);
10272     __ b(Process32B);
10273 
10274     __ BIND(SIMDExit);
10275     __ cbz(length, Exit);
10276     __ movw(rscratch1, length);
10277     __ b(Process4B);
10278 
10279     __ BIND(Exit);
10280     __ sub(c_rarg0, dst, doff);
10281 
10282     __ leave();
10283     __ ret(lr);
10284 
10285     return start;
10286   }
10287 
10288   // Support for spin waits.
10289   address generate_spin_wait() {
10290     __ align(CodeEntryAlignment);
10291     StubId stub_id = StubId::stubgen_spin_wait_id;
10292     StubCodeMark mark(this, stub_id);
10293     address start = __ pc();
10294 
10295     __ spin_wait();
10296     __ ret(lr);
10297 
10298     return start;
10299   }
10300 
10301   void generate_lookup_secondary_supers_table_stub() {
10302     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10303     StubCodeMark mark(this, stub_id);
10304 
10305     const Register
10306       r_super_klass  = r0,
10307       r_array_base   = r1,
10308       r_array_length = r2,
10309       r_array_index  = r3,
10310       r_sub_klass    = r4,
10311       r_bitmap       = rscratch2,
10312       result         = r5;
10313     const FloatRegister
10314       vtemp          = v0;
10315 
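          // One entry point is generated per table slot; each calls the
          // constant-slot lookup with its slot number baked in.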
10316     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10317       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10318       Label L_success;
10319       __ enter();
10320       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10321                                              r_array_base, r_array_length, r_array_index,
10322                                              vtemp, result, slot,
10323                                              /*stub_is_near*/true);
10324       __ leave();
10325       __ ret(lr);
10326     }
10327   }
10328 
10329   // Slow path implementation for UseSecondarySupersTable.
10330   address generate_lookup_secondary_supers_table_slow_path_stub() {
10331     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10332     StubCodeMark mark(this, stub_id);
10333 
10334     address start = __ pc();
10335     const Register
10336       r_super_klass  = r0,        // argument
10337       r_array_base   = r1,        // argument
10338       temp1          = r2,        // temp
10339       r_array_index  = r3,        // argument
10340       r_bitmap       = rscratch2, // argument
10341       result         = r5;        // argument
10342 
10343     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10344     __ ret(lr);
10345 
10346     return start;
10347   }
10348 
10349 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10350 
10351   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10352   //
10353   // If LSE is in use, generate LSE versions of all the stubs. The
10354   // non-LSE versions are in atomic_aarch64.S.
10355 
10356   // class AtomicStubMark records the entry point of a stub and the
10357   // stub pointer which will point to it. The stub pointer is set to
10358   // the entry point when ~AtomicStubMark() is called, which must be
10359   // after ICache::invalidate_range. This ensures safe publication of
10360   // the generated code.
10361   class AtomicStubMark {
10362     address _entry_point;
10363     aarch64_atomic_stub_t *_stub;
10364     MacroAssembler *_masm;
10365   public:
10366     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10367       _masm = masm;
10368       __ align(32);
10369       _entry_point = __ pc();
10370       _stub = stub;
10371     }
10372     ~AtomicStubMark() {
10373       *_stub = (aarch64_atomic_stub_t)_entry_point;
10374     }
10375   };
10376 
10377   // NB: For memory_order_conservative we need a trailing membar after
10378   // LSE atomic operations but not a leading membar.
10379   //
10380   // We don't need a leading membar because a clause in the Arm ARM
10381   // says:
10382   //
10383   //   Barrier-ordered-before
10384   //
10385   //   Barrier instructions order prior Memory effects before subsequent
10386   //   Memory effects generated by the same Observer. A read or a write
10387   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
10388   //   Observer if and only if RW1 appears in program order before RW2
10389   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10390   //   instruction with both Acquire and Release semantics.
10391   //
10392   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10393   // and Release semantics, therefore we don't need a leading
10394   // barrier. However, there is no corresponding Barrier-ordered-after
10395   // relationship, therefore we need a trailing membar to prevent a
10396   // later store or load from being reordered with the store in an
10397   // atomic instruction.
10398   //
10399   // This was checked by using the herd7 consistency model simulator
10400   // (http://diy.inria.fr/) with this test case:
10401   //
10402   // AArch64 LseCas
10403   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10404   // P0 | P1;
10405   // LDR W4, [X2] | MOV W3, #0;
10406   // DMB LD       | MOV W4, #1;
10407   // LDR W3, [X1] | CASAL W3, W4, [X1];
10408   //              | DMB ISH;
10409   //              | STR W4, [X2];
10410   // exists
10411   // (0:X3=0 /\ 0:X4=1)
10412   //
10413   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10414   // with the store to x in P1. Without the DMB in P1 this may happen.
10415   //
10416   // At the time of writing we don't know of any AArch64 hardware that
10417   // reorders stores in this way, but the Reference Manual permits it.
10418 
10419   void gen_cas_entry(Assembler::operand_size size,
10420                      atomic_memory_order order) {
10421     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10422       exchange_val = c_rarg2;
10423     bool acquire, release;
10424     switch (order) {
10425       case memory_order_relaxed:
10426         acquire = false;
10427         release = false;
10428         break;
10429       case memory_order_release:
10430         acquire = false;
10431         release = true;
10432         break;
10433       default:
10434         acquire = true;
10435         release = true;
10436         break;
10437     }
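          // Note: both memory_order_conservative and memory_order_seq_cst take the
          // acquire+release (CASAL) path above; only conservative additionally
          // gets the trailing full barrier below.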
10438     __ mov(prev, compare_val);
10439     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10440     if (order == memory_order_conservative) {
10441       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10442     }
10443     if (size == Assembler::xword) {
10444       __ mov(r0, prev);
10445     } else {
10446       __ movw(r0, prev);
10447     }
10448     __ ret(lr);
10449   }
10450 
10451   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10452     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10453     // If not relaxed, then default to conservative.  Relaxed is the only
10454     // case we use enough to be worth specializing.
10455     if (order == memory_order_relaxed) {
10456       __ ldadd(size, incr, prev, addr);
10457     } else {
10458       __ ldaddal(size, incr, prev, addr);
10459       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10460     }
10461     if (size == Assembler::xword) {
10462       __ mov(r0, prev);
10463     } else {
10464       __ movw(r0, prev);
10465     }
10466     __ ret(lr);
10467   }
10468 
10469   void gen_swpal_entry(Assembler::operand_size size) {
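          // Only called for memory_order_conservative exchanges (see
          // generate_atomic_entry_points), hence the unconditional trailing barrier.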
10470     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10471     __ swpal(size, incr, prev, addr);
10472     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10473     if (size == Assembler::xword) {
10474       __ mov(r0, prev);
10475     } else {
10476       __ movw(r0, prev);
10477     }
10478     __ ret(lr);
10479   }
10480 
10481   void generate_atomic_entry_points() {
10482     if (! UseLSE) {
10483       return;
10484     }
10485     __ align(CodeEntryAlignment);
10486     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10487     StubCodeMark mark(this, stub_id);
10488     address first_entry = __ pc();
10489 
10490     // ADD, memory_order_conservative
10491     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10492     gen_ldadd_entry(Assembler::word, memory_order_conservative);
10493     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10494     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10495 
10496     // ADD, memory_order_relaxed
10497     AtomicStubMark mark_fetch_add_4_relaxed
10498       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10499     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10500     AtomicStubMark mark_fetch_add_8_relaxed
10501       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10502     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10503 
10504     // XCHG, memory_order_conservative
10505     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10506     gen_swpal_entry(Assembler::word);
10507     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10508     gen_swpal_entry(Assembler::xword);
10509 
10510     // CAS, memory_order_conservative
10511     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10512     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10513     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10514     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10515     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10516     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10517 
10518     // CAS, memory_order_relaxed
10519     AtomicStubMark mark_cmpxchg_1_relaxed
10520       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10521     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10522     AtomicStubMark mark_cmpxchg_4_relaxed
10523       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10524     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10525     AtomicStubMark mark_cmpxchg_8_relaxed
10526       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10527     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10528 
10529     AtomicStubMark mark_cmpxchg_4_release
10530       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10531     gen_cas_entry(MacroAssembler::word, memory_order_release);
10532     AtomicStubMark mark_cmpxchg_8_release
10533       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10534     gen_cas_entry(MacroAssembler::xword, memory_order_release);
10535 
10536     AtomicStubMark mark_cmpxchg_4_seq_cst
10537       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10538     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10539     AtomicStubMark mark_cmpxchg_8_seq_cst
10540       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10541     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10542 
10543     ICache::invalidate_range(first_entry, __ pc() - first_entry);
10544   }
10545 #endif // LINUX
10546 
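        // When inline types can be returned in multiple registers
        // (InlineTypeReturnedAsFields), the full return-register set r0-r7 and
        // v0-v7 must be preserved across the runtime call; otherwise only r0 and
        // v0 can hold a return value.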
10547   static void save_return_registers(MacroAssembler* masm) {
10548     if (InlineTypeReturnedAsFields) {
10549       masm->push(RegSet::range(r0, r7), sp);
10550       masm->sub(sp, sp, 4 * wordSize);
10551       masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp));
10552       masm->sub(sp, sp, 4 * wordSize);
10553       masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp));
10554     } else {
10555       masm->fmovd(rscratch1, v0);
10556       masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize)));
10557     }
10558   }
10559 
10560   static void restore_return_registers(MacroAssembler* masm) {
10561     if (InlineTypeReturnedAsFields) {
10562       masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
10563       masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
10564       masm->pop(RegSet::range(r0, r7), sp);
10565     } else {
10566       masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize)));
10567       masm->fmovd(v0, rscratch1);
10568     }
10569   }
10570 
10571   address generate_cont_thaw(Continuation::thaw_kind kind) {
10572     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10573     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10574 
10575     address start = __ pc();
10576 
10577     if (return_barrier) {
10578       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10579       __ mov(sp, rscratch1);
10580     }
10581     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10582 
10583     if (return_barrier) {
10584       // preserve possible return value from a method returning to the return barrier
10585       save_return_registers(_masm);
10586     }
10587 
10588     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10589     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10590     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10591 
10592     if (return_barrier) {
10593       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10594       restore_return_registers(_masm);
10595     }
10596     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10597 
10598 
10599     Label thaw_success;
10600     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10601     __ cbnz(rscratch2, thaw_success);
10602     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10603     __ br(rscratch1);
10604     __ bind(thaw_success);
10605 
10606     // make room for the thawed frames
10607     __ sub(rscratch1, sp, rscratch2);
10608     __ andr(rscratch1, rscratch1, -16); // align
10609     __ mov(sp, rscratch1);
10610 
10611     if (return_barrier) {
10612       // save original return value -- again
10613       save_return_registers(_masm);
10614     }
10615 
10616     // If we want, we can templatize thaw by kind, and have three different entries
10617     __ movw(c_rarg1, (uint32_t)kind);
10618 
10619     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10620     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10621 
10622     if (return_barrier) {
10623       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10624       restore_return_registers(_masm);
10625     } else {
10626       __ mov(r0, zr); // return 0 (success) from doYield
10627     }
10628 
10629     // we're now on the yield frame (which is at a higher address than us because sp has been moved down)
10630     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10631     __ mov(rfp, sp);
10632 
10633     if (return_barrier_exception) {
10634       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10635       __ authenticate_return_address(c_rarg1);
10636       __ verify_oop(r0);
10637       // save return value containing the exception oop in callee-saved R19
10638       __ mov(r19, r0);
10639 
10640       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10641 
10642       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10643       // __ reinitialize_ptrue();
10644 
10645       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10646 
10647       __ mov(r1, r0); // the exception handler
10648       __ mov(r0, r19); // restore return value containing the exception oop
10649       __ verify_oop(r0);
10650 
10651       __ leave();
10652       __ mov(r3, lr);
10653       __ br(r1); // the exception handler
10654     } else {
10655       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10656       __ leave();
10657       __ ret(lr);
10658     }
10659 
10660     return start;
10661   }
10662 
10663   address generate_cont_thaw() {
10664     if (!Continuations::enabled()) return nullptr;
10665 
10666     StubId stub_id = StubId::stubgen_cont_thaw_id;
10667     StubCodeMark mark(this, stub_id);
10668     address start = __ pc();
10669     generate_cont_thaw(Continuation::thaw_top);
10670     return start;
10671   }
10672 
10673   address generate_cont_returnBarrier() {
10674     if (!Continuations::enabled()) return nullptr;
10675 
10676     // TODO: will probably need multiple return barriers depending on return type
10677     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10678     StubCodeMark mark(this, stub_id);
10679     address start = __ pc();
10680 
10681     generate_cont_thaw(Continuation::thaw_return_barrier);
10682 
10683     return start;
10684   }
10685 
10686   address generate_cont_returnBarrier_exception() {
10687     if (!Continuations::enabled()) return nullptr;
10688 
10689     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10690     StubCodeMark mark(this, stub_id);
10691     address start = __ pc();
10692 
10693     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10694 
10695     return start;
10696   }
10697 
10698   address generate_cont_preempt_stub() {
10699     if (!Continuations::enabled()) return nullptr;
10700     StubId stub_id = StubId::stubgen_cont_preempt_id;
10701     StubCodeMark mark(this, stub_id);
10702     address start = __ pc();
10703 
10704     __ reset_last_Java_frame(true);
10705 
10706     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10707     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10708     __ mov(sp, rscratch2);
10709 
10710     Label preemption_cancelled;
10711     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10712     __ cbnz(rscratch1, preemption_cancelled);
10713 
10714     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10715     SharedRuntime::continuation_enter_cleanup(_masm);
10716     __ leave();
10717     __ ret(lr);
10718 
10719     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10720     __ bind(preemption_cancelled);
10721     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10722     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10723     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10724     __ ldr(rscratch1, Address(rscratch1));
10725     __ br(rscratch1);
10726 
10727     return start;
10728   }
10729 
10730   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10731   // are represented as long[5], with BITS_PER_LIMB = 26.
10732   // Pack five 26-bit limbs into three 64-bit registers.
10733   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10734     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10735     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10736     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10737     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10738 
10739     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10740     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10741     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10742     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10743 
10744     if (dest2->is_valid()) {
10745       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10746     } else {
10747 #ifdef ASSERT
10748       Label OK;
10749       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10750       __ br(__ EQ, OK);
10751       __ stop("high bits of Poly1305 integer should be zero");
10752       __ should_not_reach_here();
10753       __ bind(OK);
10754 #endif
10755     }
10756   }
10757 
10758   // As above, but return only a 128-bit integer, packed into two
10759   // 64-bit registers.
10760   void pack_26(Register dest0, Register dest1, Register src) {
10761     pack_26(dest0, dest1, noreg, src);
10762   }
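
        // A minimal C sketch of the packing done by pack_26 above
        // (illustrative only; "limb" is a hypothetical name for the five
        // 26-bit values loaded from src):
        //
        //   uint64_t lo  = limb[0] | (limb[1] << 26) | (limb[2] << 52);         // dest0
        //   uint64_t mid = (limb[2] >> 12) | (limb[3] << 14) | (limb[4] << 40); // dest1
        //   uint64_t hi  = limb[4] >> 24;                                       // dest2 (2 bits)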
10763 
10764   // Multiply and multiply-accumulate unsigned 64-bit registers.
10765   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10766     __ mul(prod_lo, n, m);
10767     __ umulh(prod_hi, n, m);
10768   }
10769   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10770     wide_mul(rscratch1, rscratch2, n, m);
10771     __ adds(sum_lo, sum_lo, rscratch1);
10772     __ adc(sum_hi, sum_hi, rscratch2);
10773   }
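
        // Equivalent C, approximately (a sketch using a 128-bit intermediate):
        //
        //   unsigned __int128 p = (unsigned __int128)n * m;
        //   prod_lo = (uint64_t)p;            // low 64 bits  (mul)
        //   prod_hi = (uint64_t)(p >> 64);    // high 64 bits (umulh)
        //
        // wide_madd then adds p into the 128-bit value sum_hi:sum_lo.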
10774 
10775   // Poly1305, RFC 7539
10776 
10777   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10778   // description of the tricks used to simplify and accelerate this
10779   // computation.
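
        // In pseudocode, the reference formulation is approximately:
        //
        //   p = 2^130 - 5; r = the clamped key; a = the accumulator
        //   for each 16-byte block m (read as a little-endian integer):
        //     a = ((a + m + 2^128) * r) mod p
        //
        // This stub only ever sees whole 16-byte blocks, hence the fixed
        // "+ 2^128" term (the appended 1 byte of the padded message).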
10780 
10781   address generate_poly1305_processBlocks() {
10782     __ align(CodeEntryAlignment);
10783     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10784     StubCodeMark mark(this, stub_id);
10785     address start = __ pc();
10786     Label here;
10787     __ enter();
10788     RegSet callee_saved = RegSet::range(r19, r28);
10789     __ push(callee_saved, sp);
10790 
10791     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10792 
10793     // Arguments
10794     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10795 
10796     // R_n is the 128-bit randomly-generated key, packed into two
10797     // registers.  The caller passes this key to us as long[5], with
10798     // BITS_PER_LIMB = 26.
10799     const Register R_0 = *++regs, R_1 = *++regs;
10800     pack_26(R_0, R_1, r_start);
10801 
10802     // RR_n is (R_n >> 2) * 5
10803     const Register RR_0 = *++regs, RR_1 = *++regs;
10804     __ lsr(RR_0, R_0, 2);
10805     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10806     __ lsr(RR_1, R_1, 2);
10807     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10808 
10809     // U_n is the current checksum
10810     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10811     pack_26(U_0, U_1, U_2, acc_start);
10812 
10813     static constexpr int BLOCK_LENGTH = 16;
10814     Label DONE, LOOP;
10815 
10816     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10817     __ br(Assembler::LT, DONE); {
10818       __ bind(LOOP);
10819 
10820       // S_n is to be the sum of U_n and the next block of data
10821       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10822       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10823       __ adds(S_0, U_0, S_0);
10824       __ adcs(S_1, U_1, S_1);
10825       __ adc(S_2, U_2, zr);
10826       __ add(S_2, S_2, 1);
10827 
10828       const Register U_0HI = *++regs, U_1HI = *++regs;
10829 
10830       // NB: this logic depends on some of the special properties of
10831       // Poly1305 keys. In particular, because we know that the top
10832       // four bits of R_0 and R_1 are zero, we can add together
10833       // partial products without any risk of needing to propagate a
10834       // carry out.
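            // (The RFC 7539 key clamp, r &= 0x0ffffffc0ffffffc0ffffffc0fffffff,
            // is what guarantees those top bits are clear.)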
10835       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10836       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10837       __ andr(U_2, R_0, 3);
10838       __ mul(U_2, S_2, U_2);
10839 
10840       // Recycle registers S_0, S_1, S_2
10841       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10842 
10843       // Partial reduction mod 2**130 - 5
10844       __ adds(U_1, U_0HI, U_1);
10845       __ adc(U_2, U_1HI, U_2);
10846       // Sum now in U_2:U_1:U_0.
10847       // Dead: U_0HI, U_1HI.
10848       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10849 
10850       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
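            //
            // (Why the factor of 5: U_2 holds the bits of the sum at weight
            // 2^128, so (U_2 >> 2) sits at weight 2^130, and
            // 2^130 == 5 (mod 2^130 - 5); the two bits kept in U_2 stay at
            // weight 2^128.)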
10851 
10852       // First, U_2:U_1:U_0 += (U_2 >> 2)
10853       __ lsr(rscratch1, U_2, 2);
10854       __ andr(U_2, U_2, (u8)3);
10855       __ adds(U_0, U_0, rscratch1);
10856       __ adcs(U_1, U_1, zr);
10857       __ adc(U_2, U_2, zr);
10858       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10859       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10860       __ adcs(U_1, U_1, zr);
10861       __ adc(U_2, U_2, zr);
10862 
10863       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10864       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10865       __ br(~ Assembler::LT, LOOP);
10866     }
10867 
10868     // Further reduce modulo 2^130 - 5
10869     __ lsr(rscratch1, U_2, 2);
10870     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10871     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10872     __ adcs(U_1, U_1, zr);
10873     __ andr(U_2, U_2, (u1)3);
10874     __ adc(U_2, U_2, zr);
10875 
10876     // Unpack the sum into five 26-bit limbs and write to memory.
10877     __ ubfiz(rscratch1, U_0, 0, 26);
10878     __ ubfx(rscratch2, U_0, 26, 26);
10879     __ stp(rscratch1, rscratch2, Address(acc_start));
10880     __ ubfx(rscratch1, U_0, 52, 12);
10881     __ bfi(rscratch1, U_1, 12, 14);
10882     __ ubfx(rscratch2, U_1, 14, 26);
10883     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10884     __ ubfx(rscratch1, U_1, 40, 24);
10885     __ bfi(rscratch1, U_2, 24, 3);
10886     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10887 
10888     __ bind(DONE);
10889     __ pop(callee_saved, sp);
10890     __ leave();
10891     __ ret(lr);
10892 
10893     return start;
10894   }
10895 
10896   // exception handler for upcall stubs
10897   address generate_upcall_stub_exception_handler() {
10898     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10899     StubCodeMark mark(this, stub_id);
10900     address start = __ pc();
10901 
10902     // Native caller has no idea how to handle exceptions,
10903     // so we just crash here. It is up to the callee to catch exceptions.
10904     __ verify_oop(r0);
10905     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10906     __ blr(rscratch1);
10907     __ should_not_reach_here();
10908 
10909     return start;
10910   }
10911 
10912   // load Method* target of MethodHandle
10913   // j_rarg0 = jobject receiver
10914   // rmethod = result
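        //
        // In Java-ish pseudocode the chain of loads below is approximately:
        //
        //   rmethod = receiver.form.vmentry.method.vmtarget;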
10915   address generate_upcall_stub_load_target() {
10916     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10917     StubCodeMark mark(this, stub_id);
10918     address start = __ pc();
10919 
10920     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10921     // Load target method from receiver
10922     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10923     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10924     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10925     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10926                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10927                       noreg, noreg);
10928     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10929 
10930     __ ret(lr);
10931 
10932     return start;
10933   }
10934 
10935 #undef __
10936 #define __ masm->
10937 
10938   class MontgomeryMultiplyGenerator : public MacroAssembler {
10939 
10940     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10941       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10942 
10943     RegSet _toSave;
10944     bool _squaring;
10945 
10946   public:
10947     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10948       : MacroAssembler(as->code()), _squaring(squaring) {
10949 
10950       // Register allocation
10951 
10952       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10953       Pa_base = *regs;       // Argument registers
10954       if (squaring)
10955         Pb_base = Pa_base;
10956       else
10957         Pb_base = *++regs;
10958       Pn_base = *++regs;
10959       Rlen= *++regs;
10960       inv = *++regs;
10961       Pm_base = *++regs;
10962 
10963                           // Working registers:
10964       Ra =  *++regs;        // The current digit of a, b, n, and m.
10965       Rb =  *++regs;
10966       Rm =  *++regs;
10967       Rn =  *++regs;
10968 
10969       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10970       Pb =  *++regs;
10971       Pm =  *++regs;
10972       Pn =  *++regs;
10973 
10974       t0 =  *++regs;        // Three registers which form a
10975       t1 =  *++regs;        // triple-precision accumulator.
10976       t2 =  *++regs;
10977 
10978       Ri =  *++regs;        // Inner and outer loop indexes.
10979       Rj =  *++regs;
10980 
10981       Rhi_ab = *++regs;     // Product registers: low and high parts
10982       Rlo_ab = *++regs;     // of a*b and m*n.
10983       Rhi_mn = *++regs;
10984       Rlo_mn = *++regs;
10985 
10986       // r19 and up are callee-saved.
10987       _toSave = RegSet::range(r19, *regs) + Pm_base;
10988     }
10989 
10990   private:
10991     void save_regs() {
10992       push(_toSave, sp);
10993     }
10994 
10995     void restore_regs() {
10996       pop(_toSave, sp);
10997     }
10998 
10999     template <typename T>
11000     void unroll_2(Register count, T block) {
11001       Label loop, end, odd;
11002       tbnz(count, 0, odd);
11003       cbz(count, end);
11004       align(16);
11005       bind(loop);
11006       (this->*block)();
11007       bind(odd);
11008       (this->*block)();
11009       subs(count, count, 2);
11010       br(Assembler::GT, loop);
11011       bind(end);
11012     }
11013 
11014     template <typename T>
11015     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
11016       Label loop, end, odd;
11017       tbnz(count, 0, odd);
11018       cbz(count, end);
11019       align(16);
11020       bind(loop);
11021       (this->*block)(d, s, tmp);
11022       bind(odd);
11023       (this->*block)(d, s, tmp);
11024       subs(count, count, 2);
11025       br(Assembler::GT, loop);
11026       bind(end);
11027     }
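
          // In C, approximately, both overloads amount to
          //
          //   while (count > 0) { block(); count--; }
          //
          // unrolled two at a time, entering mid-loop when count is odd.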
11028 
11029     void pre1(RegisterOrConstant i) {
11030       block_comment("pre1");
11031       // Pa = Pa_base;
11032       // Pb = Pb_base + i;
11033       // Pm = Pm_base;
11034       // Pn = Pn_base + i;
11035       // Ra = *Pa;
11036       // Rb = *Pb;
11037       // Rm = *Pm;
11038       // Rn = *Pn;
11039       ldr(Ra, Address(Pa_base));
11040       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11041       ldr(Rm, Address(Pm_base));
11042       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11043       lea(Pa, Address(Pa_base));
11044       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11045       lea(Pm, Address(Pm_base));
11046       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11047 
11048       // Zero the m*n result.
11049       mov(Rhi_mn, zr);
11050       mov(Rlo_mn, zr);
11051     }
11052 
11053     // The core multiply-accumulate step of a Montgomery
11054     // multiplication.  The idea is to schedule operations as a
11055     // pipeline so that instructions with long latencies (loads and
11056     // multiplies) have time to complete before their results are
11057     // used.  This mostly benefits in-order implementations of the
11058     // architecture, but out-of-order ones also benefit.
11059     void step() {
11060       block_comment("step");
11061       // MACC(Ra, Rb, t0, t1, t2);
11062       // Ra = *++Pa;
11063       // Rb = *--Pb;
11064       umulh(Rhi_ab, Ra, Rb);
11065       mul(Rlo_ab, Ra, Rb);
11066       ldr(Ra, pre(Pa, wordSize));
11067       ldr(Rb, pre(Pb, -wordSize));
11068       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11069                                        // previous iteration.
11070       // MACC(Rm, Rn, t0, t1, t2);
11071       // Rm = *++Pm;
11072       // Rn = *--Pn;
11073       umulh(Rhi_mn, Rm, Rn);
11074       mul(Rlo_mn, Rm, Rn);
11075       ldr(Rm, pre(Pm, wordSize));
11076       ldr(Rn, pre(Pn, -wordSize));
11077       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11078     }
11079 
11080     void post1() {
11081       block_comment("post1");
11082 
11083       // MACC(Ra, Rb, t0, t1, t2);
11084       // Ra = *++Pa;
11085       // Rb = *--Pb;
11086       umulh(Rhi_ab, Ra, Rb);
11087       mul(Rlo_ab, Ra, Rb);
11088       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11089       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11090 
11091       // *Pm = Rm = t0 * inv;
11092       mul(Rm, t0, inv);
11093       str(Rm, Address(Pm));
11094 
11095       // MACC(Rm, Rn, t0, t1, t2);
11096       // t0 = t1; t1 = t2; t2 = 0;
11097       umulh(Rhi_mn, Rm, Rn);
11098 
11099 #ifndef PRODUCT
11100       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11101       {
11102         mul(Rlo_mn, Rm, Rn);
11103         add(Rlo_mn, t0, Rlo_mn);
11104         Label ok;
11105         cbz(Rlo_mn, ok); {
11106           stop("broken Montgomery multiply");
11107         } bind(ok);
11108       }
11109 #endif
11110       // We have very carefully set things up so that
11111       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11112       // the lower half of Rm * Rn because we know the result already:
11113       // it must be -t0.  t0 + (-t0) must generate a carry iff
11114       // t0 != 0.  So, rather than do a mul and an adds we just set
11115       // the carry flag iff t0 is nonzero.
11116       //
11117       // mul(Rlo_mn, Rm, Rn);
11118       // adds(zr, t0, Rlo_mn);
11119       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11120       adcs(t0, t1, Rhi_mn);
11121       adc(t1, t2, zr);
11122       mov(t2, zr);
11123     }
11124 
11125     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11126       block_comment("pre2");
11127       // Pa = Pa_base + i-len;
11128       // Pb = Pb_base + len;
11129       // Pm = Pm_base + i-len;
11130       // Pn = Pn_base + len;
11131 
11132       if (i.is_register()) {
11133         sub(Rj, i.as_register(), len);
11134       } else {
11135         mov(Rj, i.as_constant());
11136         sub(Rj, Rj, len);
11137       }
11138       // Rj == i-len
11139 
11140       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11141       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11142       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11143       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11144 
11145       // Ra = *++Pa;
11146       // Rb = *--Pb;
11147       // Rm = *++Pm;
11148       // Rn = *--Pn;
11149       ldr(Ra, pre(Pa, wordSize));
11150       ldr(Rb, pre(Pb, -wordSize));
11151       ldr(Rm, pre(Pm, wordSize));
11152       ldr(Rn, pre(Pn, -wordSize));
11153 
11154       mov(Rhi_mn, zr);
11155       mov(Rlo_mn, zr);
11156     }
11157 
11158     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11159       block_comment("post2");
11160       if (i.is_constant()) {
11161         mov(Rj, i.as_constant()-len.as_constant());
11162       } else {
11163         sub(Rj, i.as_register(), len);
11164       }
11165 
11166       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11167 
11168       // As soon as we know the least significant digit of our result,
11169       // store it.
11170       // Pm_base[i-len] = t0;
11171       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11172 
11173       // t0 = t1; t1 = t2; t2 = 0;
11174       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11175       adc(t1, t2, zr);
11176       mov(t2, zr);
11177     }
11178 
11179     // A carry in t0 after Montgomery multiplication means that we
11180     // should subtract multiples of n from our result in m.  We'll
11181     // keep doing that until there is no carry.
11182     void normalize(RegisterOrConstant len) {
11183       block_comment("normalize");
11184       // while (t0)
11185       //   t0 = sub(Pm_base, Pn_base, t0, len);
11186       Label loop, post, again;
11187       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11188       cbz(t0, post); {
11189         bind(again); {
11190           mov(i, zr);
11191           mov(cnt, len);
11192           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11193           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11194           subs(zr, zr, zr); // set carry flag, i.e. no borrow
11195           align(16);
11196           bind(loop); {
11197             sbcs(Rm, Rm, Rn);
11198             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11199             add(i, i, 1);
11200             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11201             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11202             sub(cnt, cnt, 1);
11203           } cbnz(cnt, loop);
11204           sbc(t0, t0, zr);
11205         } cbnz(t0, again);
11206       } bind(post);
11207     }
11208 
11209     // Move memory at s to d, reversing words.
11210     //    Increments d to end of copied memory
11211     //    Destroys tmp1, tmp2
11212     //    Preserves len
11213     //    Leaves s pointing to the address which was in d at start
11214     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11215       assert(tmp1->encoding() < r19->encoding(), "register corruption");
11216       assert(tmp2->encoding() < r19->encoding(), "register corruption");
11217 
11218       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11219       mov(tmp1, len);
11220       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11221       sub(s, d, len, ext::uxtw, LogBytesPerWord);
11222     }
11223     // where
11224     void reverse1(Register d, Register s, Register tmp) {
11225       ldr(tmp, pre(s, -wordSize));
11226       ror(tmp, tmp, 32);
11227       str(tmp, post(d, wordSize));
11228     }
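
          // In C, approximately (len is in 64-bit words; ror32 is a
          // hypothetical helper that swaps the two 32-bit halves of a word):
          //
          //   for (int i = 0; i < len; i++)
          //     d[i] = ror32(s[len - 1 - i]);
          //   // afterwards d points just past the copied words and
          //   // s holds the original value of d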
11229 
11230     void step_squaring() {
11231       // An extra ACC
11232       step();
11233       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11234     }
11235 
11236     void last_squaring(RegisterOrConstant i) {
11237       Label dont;
11238       // if ((i & 1) == 0) {
11239       tbnz(i.as_register(), 0, dont); {
11240         // MACC(Ra, Rb, t0, t1, t2);
11241         // Ra = *++Pa;
11242         // Rb = *--Pb;
11243         umulh(Rhi_ab, Ra, Rb);
11244         mul(Rlo_ab, Ra, Rb);
11245         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11246       } bind(dont);
11247     }
11248 
11249     void extra_step_squaring() {
11250       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11251 
11252       // MACC(Rm, Rn, t0, t1, t2);
11253       // Rm = *++Pm;
11254       // Rn = *--Pn;
11255       umulh(Rhi_mn, Rm, Rn);
11256       mul(Rlo_mn, Rm, Rn);
11257       ldr(Rm, pre(Pm, wordSize));
11258       ldr(Rn, pre(Pn, -wordSize));
11259     }
11260 
11261     void post1_squaring() {
11262       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11263 
11264       // *Pm = Rm = t0 * inv;
11265       mul(Rm, t0, inv);
11266       str(Rm, Address(Pm));
11267 
11268       // MACC(Rm, Rn, t0, t1, t2);
11269       // t0 = t1; t1 = t2; t2 = 0;
11270       umulh(Rhi_mn, Rm, Rn);
11271 
11272 #ifndef PRODUCT
11273       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11274       {
11275         mul(Rlo_mn, Rm, Rn);
11276         add(Rlo_mn, t0, Rlo_mn);
11277         Label ok;
11278         cbz(Rlo_mn, ok); {
11279           stop("broken Montgomery multiply");
11280         } bind(ok);
11281       }
11282 #endif
11283       // We have very carefully set things up so that
11284       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11285       // the lower half of Rm * Rn because we know the result already:
11286       // it must be -t0.  t0 + (-t0) must generate a carry iff
11287       // t0 != 0.  So, rather than do a mul and an adds we just set
11288       // the carry flag iff t0 is nonzero.
11289       //
11290       // mul(Rlo_mn, Rm, Rn);
11291       // adds(zr, t0, Rlo_mn);
11292       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11293       adcs(t0, t1, Rhi_mn);
11294       adc(t1, t2, zr);
11295       mov(t2, zr);
11296     }
11297 
11298     void acc(Register Rhi, Register Rlo,
11299              Register t0, Register t1, Register t2) {
11300       adds(t0, t0, Rlo);
11301       adcs(t1, t1, Rhi);
11302       adc(t2, t2, zr);
11303     }
11304 
11305   public:
11306     /**
11307      * Fast Montgomery multiplication.  The derivation of the
11308      * algorithm is in A Cryptographic Library for the Motorola
11309      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11310      *
11311      * Arguments:
11312      *
11313      * Inputs for multiplication:
11314      *   c_rarg0   - int array elements a
11315      *   c_rarg1   - int array elements b
11316      *   c_rarg2   - int array elements n (the modulus)
11317      *   c_rarg3   - int length
11318      *   c_rarg4   - int inv
11319      *   c_rarg5   - int array elements m (the result)
11320      *
11321      * Inputs for squaring:
11322      *   c_rarg0   - int array elements a
11323      *   c_rarg1   - int array elements n (the modulus)
11324      *   c_rarg2   - int length
11325      *   c_rarg3   - int inv
11326      *   c_rarg4   - int array elements m (the result)
11327      *
11328      */
11329     address generate_multiply() {
11330       Label argh, nothing;
11331       bind(argh);
11332       stop("MontgomeryMultiply total_allocation must be <= 8192");
11333 
11334       align(CodeEntryAlignment);
11335       address entry = pc();
11336 
11337       cbzw(Rlen, nothing);
11338 
11339       enter();
11340 
11341       // Make room.
11342       cmpw(Rlen, 512);
11343       br(Assembler::HI, argh);
11344       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11345       andr(sp, Ra, -2 * wordSize);
11346 
11347       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11348 
11349       {
11350         // Copy input args, reversing as we go.  We use Ra as a
11351         // temporary variable.
11352         reverse(Ra, Pa_base, Rlen, t0, t1);
11353         if (!_squaring)
11354           reverse(Ra, Pb_base, Rlen, t0, t1);
11355         reverse(Ra, Pn_base, Rlen, t0, t1);
11356       }
11357 
11358       // Push all call-saved registers and also Pm_base which we'll need
11359       // at the end.
11360       save_regs();
11361 
11362 #ifndef PRODUCT
11363       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11364       {
11365         ldr(Rn, Address(Pn_base, 0));
11366         mul(Rlo_mn, Rn, inv);
11367         subs(zr, Rlo_mn, -1);
11368         Label ok;
11369         br(EQ, ok); {
11370           stop("broken inverse in Montgomery multiply");
11371         } bind(ok);
11372       }
11373 #endif
11374 
11375       mov(Pm_base, Ra);
11376 
11377       mov(t0, zr);
11378       mov(t1, zr);
11379       mov(t2, zr);
11380 
11381       block_comment("for (int i = 0; i < len; i++) {");
11382       mov(Ri, zr); {
11383         Label loop, end;
11384         cmpw(Ri, Rlen);
11385         br(Assembler::GE, end);
11386 
11387         bind(loop);
11388         pre1(Ri);
11389 
11390         block_comment("  for (j = i; j; j--) {"); {
11391           movw(Rj, Ri);
11392           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11393         } block_comment("  } // j");
11394 
11395         post1();
11396         addw(Ri, Ri, 1);
11397         cmpw(Ri, Rlen);
11398         br(Assembler::LT, loop);
11399         bind(end);
11400         block_comment("} // i");
11401       }
11402 
11403       block_comment("for (int i = len; i < 2*len; i++) {");
11404       mov(Ri, Rlen); {
11405         Label loop, end;
11406         cmpw(Ri, Rlen, Assembler::LSL, 1);
11407         br(Assembler::GE, end);
11408 
11409         bind(loop);
11410         pre2(Ri, Rlen);
11411 
11412         block_comment("  for (j = len*2-i-1; j; j--) {"); {
11413           lslw(Rj, Rlen, 1);
11414           subw(Rj, Rj, Ri);
11415           subw(Rj, Rj, 1);
11416           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11417         } block_comment("  } // j");
11418 
11419         post2(Ri, Rlen);
11420         addw(Ri, Ri, 1);
11421         cmpw(Ri, Rlen, Assembler::LSL, 1);
11422         br(Assembler::LT, loop);
11423         bind(end);
11424       }
11425       block_comment("} // i");
11426 
11427       normalize(Rlen);
11428 
11429       mov(Ra, Pm_base);  // Save Pm_base in Ra
11430       restore_regs();  // Restore caller's Pm_base
11431 
11432       // Copy our result into caller's Pm_base
11433       reverse(Pm_base, Ra, Rlen, t0, t1);
11434 
11435       leave();
11436       bind(nothing);
11437       ret(lr);
11438 
11439       return entry;
11440     }
11441     // In C, approximately:
11442 
11443     // void
11444     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11445     //                     julong Pn_base[], julong Pm_base[],
11446     //                     julong inv, int len) {
11447     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11448     //   julong *Pa, *Pb, *Pn, *Pm;
11449     //   julong Ra, Rb, Rn, Rm;
11450 
11451     //   int i;
11452 
11453     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11454 
11455     //   for (i = 0; i < len; i++) {
11456     //     int j;
11457 
11458     //     Pa = Pa_base;
11459     //     Pb = Pb_base + i;
11460     //     Pm = Pm_base;
11461     //     Pn = Pn_base + i;
11462 
11463     //     Ra = *Pa;
11464     //     Rb = *Pb;
11465     //     Rm = *Pm;
11466     //     Rn = *Pn;
11467 
11468     //     int iters = i;
11469     //     for (j = 0; iters--; j++) {
11470     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11471     //       MACC(Ra, Rb, t0, t1, t2);
11472     //       Ra = *++Pa;
11473     //       Rb = *--Pb;
11474     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11475     //       MACC(Rm, Rn, t0, t1, t2);
11476     //       Rm = *++Pm;
11477     //       Rn = *--Pn;
11478     //     }
11479 
11480     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11481     //     MACC(Ra, Rb, t0, t1, t2);
11482     //     *Pm = Rm = t0 * inv;
11483     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11484     //     MACC(Rm, Rn, t0, t1, t2);
11485 
11486     //     assert(t0 == 0, "broken Montgomery multiply");
11487 
11488     //     t0 = t1; t1 = t2; t2 = 0;
11489     //   }
11490 
11491     //   for (i = len; i < 2*len; i++) {
11492     //     int j;
11493 
11494     //     Pa = Pa_base + i-len;
11495     //     Pb = Pb_base + len;
11496     //     Pm = Pm_base + i-len;
11497     //     Pn = Pn_base + len;
11498 
11499     //     Ra = *++Pa;
11500     //     Rb = *--Pb;
11501     //     Rm = *++Pm;
11502     //     Rn = *--Pn;
11503 
11504     //     int iters = len*2-i-1;
11505     //     for (j = i-len+1; iters--; j++) {
11506     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11507     //       MACC(Ra, Rb, t0, t1, t2);
11508     //       Ra = *++Pa;
11509     //       Rb = *--Pb;
11510     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11511     //       MACC(Rm, Rn, t0, t1, t2);
11512     //       Rm = *++Pm;
11513     //       Rn = *--Pn;
11514     //     }
11515 
11516     //     Pm_base[i-len] = t0;
11517     //     t0 = t1; t1 = t2; t2 = 0;
11518     //   }
11519 
11520     //   while (t0)
11521     //     t0 = sub(Pm_base, Pn_base, t0, len);
11522     // }
11523 
11524     /**
11525      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
11526      * multiplies than Montgomery multiplication so it should be up to
11527      * 25% faster.  However, its loop control is more complex and it
11528      * may actually run slower on some machines.
11529      *
11530      * Arguments:
11531      *
11532      * Inputs:
11533      *   c_rarg0   - int array elements a
11534      *   c_rarg1   - int array elements n (the modulus)
11535      *   c_rarg2   - int length
11536      *   c_rarg3   - int inv
11537      *   c_rarg4   - int array elements m (the result)
11538      *
11539      */
11540     address generate_square() {
11541       Label argh;
11542       bind(argh);
11543       stop("MontgomeryMultiply total_allocation must be <= 8192");
11544 
11545       align(CodeEntryAlignment);
11546       address entry = pc();
11547 
11548       enter();
11549 
11550       // Make room.
11551       cmpw(Rlen, 512);
11552       br(Assembler::HI, argh);
11553       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11554       andr(sp, Ra, -2 * wordSize);
11555 
11556       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11557 
11558       {
11559         // Copy input args, reversing as we go.  We use Ra as a
11560         // temporary variable.
11561         reverse(Ra, Pa_base, Rlen, t0, t1);
11562         reverse(Ra, Pn_base, Rlen, t0, t1);
11563       }
11564 
11565       // Push all call-saved registers and also Pm_base which we'll need
11566       // at the end.
11567       save_regs();
11568 
11569       mov(Pm_base, Ra);
11570 
11571       mov(t0, zr);
11572       mov(t1, zr);
11573       mov(t2, zr);
11574 
11575       block_comment("for (int i = 0; i < len; i++) {");
11576       mov(Ri, zr); {
11577         Label loop, end;
11578         bind(loop);
11579         cmp(Ri, Rlen);
11580         br(Assembler::GE, end);
11581 
11582         pre1(Ri);
11583 
11584         block_comment("for (j = (i+1)/2; j; j--) {"); {
11585           add(Rj, Ri, 1);
11586           lsr(Rj, Rj, 1);
11587           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11588         } block_comment("  } // j");
11589 
11590         last_squaring(Ri);
11591 
11592         block_comment("  for (j = i/2; j; j--) {"); {
11593           lsr(Rj, Ri, 1);
11594           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11595         } block_comment("  } // j");
11596 
11597         post1_squaring();
11598         add(Ri, Ri, 1);
11599         cmp(Ri, Rlen);
11600         br(Assembler::LT, loop);
11601 
11602         bind(end);
11603         block_comment("} // i");
11604       }
11605 
11606       block_comment("for (int i = len; i < 2*len; i++) {");
11607       mov(Ri, Rlen); {
11608         Label loop, end;
11609         bind(loop);
11610         cmp(Ri, Rlen, Assembler::LSL, 1);
11611         br(Assembler::GE, end);
11612 
11613         pre2(Ri, Rlen);
11614 
11615         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11616           lsl(Rj, Rlen, 1);
11617           sub(Rj, Rj, Ri);
11618           sub(Rj, Rj, 1);
11619           lsr(Rj, Rj, 1);
11620           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11621         } block_comment("  } // j");
11622 
11623         last_squaring(Ri);
11624 
11625         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11626           lsl(Rj, Rlen, 1);
11627           sub(Rj, Rj, Ri);
11628           lsr(Rj, Rj, 1);
11629           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11630         } block_comment("  } // j");
11631 
11632         post2(Ri, Rlen);
11633         add(Ri, Ri, 1);
11634         cmp(Ri, Rlen, Assembler::LSL, 1);
11635 
11636         br(Assembler::LT, loop);
11637         bind(end);
11638         block_comment("} // i");
11639       }
11640 
11641       normalize(Rlen);
11642 
11643       mov(Ra, Pm_base);  // Save Pm_base in Ra
11644       restore_regs();  // Restore caller's Pm_base
11645 
11646       // Copy our result into caller's Pm_base
11647       reverse(Pm_base, Ra, Rlen, t0, t1);
11648 
11649       leave();
11650       ret(lr);
11651 
11652       return entry;
11653     }
11654     // In C, approximately:
11655 
11656     // void
11657     // montgomery_square(julong Pa_base[], julong Pn_base[],
11658     //                   julong Pm_base[], julong inv, int len) {
11659     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11660     //   julong *Pa, *Pb, *Pn, *Pm;
11661     //   julong Ra, Rb, Rn, Rm;
11662 
11663     //   int i;
11664 
11665     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11666 
11667     //   for (i = 0; i < len; i++) {
11668     //     int j;
11669 
11670     //     Pa = Pa_base;
11671     //     Pb = Pa_base + i;
11672     //     Pm = Pm_base;
11673     //     Pn = Pn_base + i;
11674 
11675     //     Ra = *Pa;
11676     //     Rb = *Pb;
11677     //     Rm = *Pm;
11678     //     Rn = *Pn;
11679 
11680     //     int iters = (i+1)/2;
11681     //     for (j = 0; iters--; j++) {
11682     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11683     //       MACC2(Ra, Rb, t0, t1, t2);
11684     //       Ra = *++Pa;
11685     //       Rb = *--Pb;
11686     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11687     //       MACC(Rm, Rn, t0, t1, t2);
11688     //       Rm = *++Pm;
11689     //       Rn = *--Pn;
11690     //     }
11691     //     if ((i & 1) == 0) {
11692     //       assert(Ra == Pa_base[j], "must be");
11693     //       MACC(Ra, Ra, t0, t1, t2);
11694     //     }
11695     //     iters = i/2;
11696     //     assert(iters == i-j, "must be");
11697     //     for (; iters--; j++) {
11698     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11699     //       MACC(Rm, Rn, t0, t1, t2);
11700     //       Rm = *++Pm;
11701     //       Rn = *--Pn;
11702     //     }
11703 
11704     //     *Pm = Rm = t0 * inv;
11705     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11706     //     MACC(Rm, Rn, t0, t1, t2);
11707 
11708     //     assert(t0 == 0, "broken Montgomery multiply");
11709 
11710     //     t0 = t1; t1 = t2; t2 = 0;
11711     //   }
11712 
11713     //   for (i = len; i < 2*len; i++) {
11714     //     int start = i-len+1;
11715     //     int end = start + (len - start)/2;
11716     //     int j;
11717 
11718     //     Pa = Pa_base + i-len;
11719     //     Pb = Pa_base + len;
11720     //     Pm = Pm_base + i-len;
11721     //     Pn = Pn_base + len;
11722 
11723     //     Ra = *++Pa;
11724     //     Rb = *--Pb;
11725     //     Rm = *++Pm;
11726     //     Rn = *--Pn;
11727 
11728     //     int iters = (2*len-i-1)/2;
11729     //     assert(iters == end-start, "must be");
11730     //     for (j = start; iters--; j++) {
11731     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11732     //       MACC2(Ra, Rb, t0, t1, t2);
11733     //       Ra = *++Pa;
11734     //       Rb = *--Pb;
11735     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11736     //       MACC(Rm, Rn, t0, t1, t2);
11737     //       Rm = *++Pm;
11738     //       Rn = *--Pn;
11739     //     }
11740     //     if ((i & 1) == 0) {
11741     //       assert(Ra == Pa_base[j], "must be");
11742     //       MACC(Ra, Ra, t0, t1, t2);
11743     //     }
11744     //     iters =  (2*len-i)/2;
11745     //     assert(iters == len-j, "must be");
11746     //     for (; iters--; j++) {
11747     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11748     //       MACC(Rm, Rn, t0, t1, t2);
11749     //       Rm = *++Pm;
11750     //       Rn = *--Pn;
11751     //     }
11752     //     Pm_base[i-len] = t0;
11753     //     t0 = t1; t1 = t2; t2 = 0;
11754     //   }
11755 
11756     //   while (t0)
11757     //     t0 = sub(Pm_base, Pn_base, t0, len);
11758     // }
11759   };
11760 
11761   // Call here from the interpreter or compiled code either to load
11762   // the multiple returned values of the inline type instance being
11763   // returned into registers, or to store the returned values into a
11764   // newly allocated inline type instance.
11765   address generate_return_value_stub(address destination, const char* name, bool has_res) {
11766     // We need to save all registers the calling convention may use so
11767     // that the runtime call can read or update those registers. This needs to
11768     // be in sync with SharedRuntime::java_return_convention().
11769     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
11770     enum layout {
11771       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
11772       j_rarg6_off, j_rarg6_2,
11773       j_rarg5_off, j_rarg5_2,
11774       j_rarg4_off, j_rarg4_2,
11775       j_rarg3_off, j_rarg3_2,
11776       j_rarg2_off, j_rarg2_2,
11777       j_rarg1_off, j_rarg1_2,
11778       j_rarg0_off, j_rarg0_2,
11779 
11780       j_farg7_off, j_farg7_2,
11781       j_farg6_off, j_farg6_2,
11782       j_farg5_off, j_farg5_2,
11783       j_farg4_off, j_farg4_2,
11784       j_farg3_off, j_farg3_2,
11785       j_farg2_off, j_farg2_2,
11786       j_farg1_off, j_farg1_2,
11787       j_farg0_off, j_farg0_2,
11788 
11789       rfp_off, rfp_off2,
11790       return_off, return_off2,
11791 
11792       framesize // inclusive of return address
11793     };
11794 
11795     CodeBuffer code(name, 512, 64);
11796     MacroAssembler* masm = new MacroAssembler(&code);
11797 
11798     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
11799     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
11800     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
11801     int frame_size_in_words = frame_size_in_bytes / wordSize;
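          // With 8 integer and 8 FP argument registers at two slots each, plus
          // rfp and the return address, framesize works out to 36 slots =
          // 144 bytes = 18 words: 16 words of saved arguments plus the fp/lr
          // pair pushed by enter().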
11802 
11803     OopMapSet* oop_maps = new OopMapSet();
11804     OopMap* map = new OopMap(frame_size_in_slots, 0);
11805 
11806     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
11807     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
11808     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
11809     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
11810     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
11811     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
11812     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
11813     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
11814 
11815     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
11816     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
11817     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
11818     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
11819     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
11820     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
11821     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
11822     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
11823 
11824     address start = __ pc();
11825 
11826     __ enter(); // Save FP and LR before call
11827 
11828     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
11829     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
11830     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
11831     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
11832 
11833     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
11834     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
11835     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
11836     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
11837 
11838     int frame_complete = __ offset();
11839 
11840     // Set up last_Java_sp and last_Java_fp
11841     address the_pc = __ pc();
11842     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
11843 
11844     // Call runtime
11845     __ mov(c_rarg1, r0);
11846     __ mov(c_rarg0, rthread);
11847 
11848     __ mov(rscratch1, destination);
11849     __ blr(rscratch1);
11850 
11851     oop_maps->add_gc_map(the_pc - start, map);
11852 
11853     __ reset_last_Java_frame(false);
11854 
11855     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
11856     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
11857     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
11858     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
11859 
11860     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
11861     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
11862     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
11863     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
11864 
11865     __ leave();
11866 
11867     // check for pending exceptions
11868     Label pending;
11869     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
11870     __ cbnz(rscratch1, pending);
11871 
11872     if (has_res) {
11873       __ get_vm_result_oop(r0, rthread);
11874     }
11875 
11876     __ ret(lr);
11877 
11878     __ bind(pending);
11879     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
11880 
11881     // -------------
11882     // make sure all code is generated
11883     masm->flush();
11884 
11885     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
11886     return stub->entry_point();
11887   }
11888 
11889   // Initialization
11890   void generate_preuniverse_stubs() {
11891     // preuniverse stubs are not needed for aarch64
11892   }
11893 
11894   void generate_initial_stubs() {
11895     // Generate initial stubs and initialize the entry points
11896 
11897     // Entry points that exist on all platforms. Note: this is code
11898     // that could be shared among different platforms - however the
11899     // benefit seems to be smaller than the disadvantage of having a
11900     // much more complicated generator structure. See also comment in
11901     // stubRoutines.hpp.
11902 
11903     StubRoutines::_forward_exception_entry = generate_forward_exception();
11904 
11905     StubRoutines::_call_stub_entry =
11906       generate_call_stub(StubRoutines::_call_stub_return_address);
11907 
11908     // is referenced by megamorphic call
11909     StubRoutines::_catch_exception_entry = generate_catch_exception();
11910 
11911     // Initialize table for copy memory (arraycopy) check.
11912     if (UnsafeMemoryAccess::_table == nullptr) {
11913       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11914     }
11915 
11916     if (UseCRC32Intrinsics) {
11917       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11918     }
11919 
11920     if (UseCRC32CIntrinsics) {
11921       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11922     }
11923 
11924     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11925       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11926     }
11927 
11928     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11929       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11930     }
11931 
11932     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11933         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11934       StubRoutines::_hf2f = generate_float16ToFloat();
11935       StubRoutines::_f2hf = generate_floatToFloat16();
11936     }
11937 
11938     if (InlineTypeReturnedAsFields) {
11939       StubRoutines::_load_inline_type_fields_in_regs =
11940          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
11941       StubRoutines::_store_inline_type_fields_to_buf =
11942          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
11943     }
11944 
11945   }
11946 
11947   void generate_continuation_stubs() {
11948     // Continuation stubs:
11949     StubRoutines::_cont_thaw          = generate_cont_thaw();
11950     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11951     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11952     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11953   }
11954 
11955   void generate_final_stubs() {
11956     // support for verify_oop (must happen after universe_init)
11957     if (VerifyOops) {
11958       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11959     }
11960 
11961     // arraycopy stubs used by compilers
11962     generate_arraycopy_stubs();
11963 
11964     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11965 
11966     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11967 
11968     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11969     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11970 
11971 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11972 
11973     generate_atomic_entry_points();
11974 
11975 #endif // LINUX && !__ARM_FEATURE_ATOMICS
11976 
11977 #ifdef COMPILER2
11978     if (UseSecondarySupersTable) {
11979       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11980       if (! InlineSecondarySupersTest) {
11981         generate_lookup_secondary_supers_table_stub();
11982       }
11983     }
11984 #endif
11985 
11986     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11987 
11988     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11989   }
11990 
11991   void generate_compiler_stubs() {
11992 #if COMPILER2_OR_JVMCI
11993 
11994     if (UseSVE == 0) {
11995       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11996     }
11997 
11998     // array equals stub for large arrays.
11999     if (!UseSimpleArrayEquals) {
12000       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
12001     }
12002 
12003     // arrays_hascode stub for large arrays.
12004     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
12005     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
12006     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
12007     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
12008     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
12009 
12010     // byte_array_inflate stub for large arrays.
12011     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
12012 
12013     // countPositives stub for large arrays.
12014     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
12015 
12016     generate_compare_long_strings();
12017 
12018     generate_string_indexof_stubs();
12019 
12020 #ifdef COMPILER2
12021     if (UseMultiplyToLenIntrinsic) {
12022       StubRoutines::_multiplyToLen = generate_multiplyToLen();
12023     }
12024 
12025     if (UseSquareToLenIntrinsic) {
12026       StubRoutines::_squareToLen = generate_squareToLen();
12027     }
12028 
12029     if (UseMulAddIntrinsic) {
12030       StubRoutines::_mulAdd = generate_mulAdd();
12031     }
12032 
12033     if (UseSIMDForBigIntegerShiftIntrinsics) {
12034       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
12035       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
12036     }
12037 
12038     if (UseMontgomeryMultiplyIntrinsic) {
12039       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
12040       StubCodeMark mark(this, stub_id);
12041       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
12042       StubRoutines::_montgomeryMultiply = g.generate_multiply();
12043     }
12044 
12045     if (UseMontgomerySquareIntrinsic) {
12046       StubId stub_id = StubId::stubgen_montgomerySquare_id;
12047       StubCodeMark mark(this, stub_id);
12048       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
12049       // We use generate_multiply() rather than generate_square()
12050       // because it's faster for the sizes of modulus we care about.
12051       StubRoutines::_montgomerySquare = g.generate_multiply();
12052     }
12053 
12054 #endif // COMPILER2
12055 
12056     if (UseChaCha20Intrinsics) {
12057       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
12058     }
12059 
12060     if (UseKyberIntrinsics) {
12061       StubRoutines::_kyberNtt = generate_kyberNtt();
12062       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
12063       StubRoutines::_kyberNttMult = generate_kyberNttMult();
12064       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
12065       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
12066       StubRoutines::_kyber12To16 = generate_kyber12To16();
12067       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
12068     }
12069 
12070     if (UseDilithiumIntrinsics) {
12071       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
12072       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
12073       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
12074       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
12075       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
12076     }
12077 
12078     if (UseBASE64Intrinsics) {
12079         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
12080         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
12081     }
12082 
12083     // data cache line writeback
12084     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
12085     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
12086 
12087     if (UseAESIntrinsics) {
12088       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
12089       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
12090       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
12091       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
12092       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
12093     }
12094     if (UseGHASHIntrinsics) {
12095       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
12096       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
12097     }
12098     if (UseAESIntrinsics && UseGHASHIntrinsics) {
12099       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
12100     }
12101 
12102     if (UseMD5Intrinsics) {
12103       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
12104       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
12105     }
12106     if (UseSHA1Intrinsics) {
12107       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
12108       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
12109     }
12110     if (UseSHA256Intrinsics) {
12111       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
12112       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
12113     }
12114     if (UseSHA512Intrinsics) {
12115       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
12116       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
12117     }
12118     if (UseSHA3Intrinsics) {
12119 
12120       StubRoutines::_double_keccak         = generate_double_keccak();
12121       if (UseSIMDForSHA3Intrinsic) {
12122          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
12123          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
12124       } else {
12125          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
12126          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
12127       }
12128     }
12129 
12130     if (UsePoly1305Intrinsics) {
12131       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
12132     }
12133 
12134     // generate Adler32 intrinsics code
12135     if (UseAdler32Intrinsics) {
12136       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
12137     }
12138 
12139 #endif // COMPILER2_OR_JVMCI
12140   }
12141 
12142  public:
12143   StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
12144     switch(blob_id) {
12145     case BlobId::stubgen_preuniverse_id:
12146       generate_preuniverse_stubs();
12147       break;
12148     case BlobId::stubgen_initial_id:
12149       generate_initial_stubs();
12150       break;
12151      case BlobId::stubgen_continuation_id:
12152       generate_continuation_stubs();
12153       break;
12154     case BlobId::stubgen_compiler_id:
12155       generate_compiler_stubs();
12156       break;
12157     case BlobId::stubgen_final_id:
12158       generate_final_stubs();
12159       break;
12160     default:
12161       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12162       break;
12163     };
12164   }
12165 }; // end class declaration
12166 
12167 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
12168   StubGenerator g(code, blob_id);
12169 }
12170 
12171 
12172 #if defined (LINUX)
12173 
12174 // Define pointers to atomic stubs and initialize them to point to the
12175 // code in atomic_aarch64.S.
12176 
12177 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
12178   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12179     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
12180   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12181     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
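
      // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands, approximately, to
      //
      //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
      //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
      //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
      //     = aarch64_atomic_fetch_add_4_default_impl;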
12182 
12183 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12184 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12185 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12186 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12187 DEFAULT_ATOMIC_OP(xchg, 4, )
12188 DEFAULT_ATOMIC_OP(xchg, 8, )
12189 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12190 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12191 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12192 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12193 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12194 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12195 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12196 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12197 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12198 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12199 
12200 #undef DEFAULT_ATOMIC_OP
12201 
12202 #endif // LINUX