1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomicAccess.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  // Debug-build helper: bump a 32-bit counter located at a fixed
  // external address (used via the inc_counter_np macro below).
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
  112   //
  113   // we save r30 (lr) as the return PC at the base of the frame and
  114   // link r29 (fp) below it as the frame pointer installing sp (r31)
  115   // into fp.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  124   // we don't need to save r8 which C uses as an indirect result location
  125   // return register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
  130   // we don't need to save r16-18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
  144   // -28 [ saved Floating-point Control Register ]
  145   // -26 [ saved v15            ] <--- sp_after_call
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
  173 
  174   // Call stub stack layout word offsets from fp
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
  199 
  // Generate the call stub used to enter Java from C (see the
  // argument list and frame diagram in the comment block above).
  // On return, return_address holds the pc saved for use by the
  // exception handling code (the address a Java callee returns to).
  address generate_call_stub(address& return_address) {
    // keep the enum offsets in sync with the platform frame constants
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubId stub_id = StubId::stubgen_call_stub_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // fp-relative addresses for every slot in the save area
    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);   // also fills the -3 (parameters) slot
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    // callee-saved GPRs r19-r28, stored pairwise
    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    // bottom 64 bits of callee-saved FP registers v8-v15
    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);  // keep sp 16-byte aligned

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    // copy the c_rarg6 parameter words from c_rarg5 onto the stack
    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method*, and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    // All of j_rargN may be used to return inline type fields so be careful
    // not to clobber those.
    // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
    // assignment of Rresult below.
    Register Rresult = r14, Rresult_type = r15;
    __ ldr(Rresult, result);
    Label is_long, is_float, is_double, check_prim, exit;
    __ ldr(Rresult_type, result_type);
    __ cmp(Rresult_type, (u1)T_OBJECT);
    __ br(Assembler::EQ, check_prim);
    __ cmp(Rresult_type, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(Rresult_type, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(Rresult_type, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case: store only the low 32 bits of r0
    __ strw(r0, Address(Rresult));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,  d15_save);
    __ ldpd(v13, v12,  d13_save);
    __ ldpd(v11, v10,  d11_save);
    __ ldpd(v9,  v8,   d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    // restore fpcr
    __ ldr(rscratch1,  fpcr_save);
    __ set_fpcr(rscratch1);

    // reload the original C argument registers from the save area
    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);  // pair also reloads the thread slot

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT
    __ BIND(check_prim);
    if (InlineTypeReturnedAsFields) {
      // Check for scalarized return value (tagged in bit 0 of r0)
      __ tbz(r0, 0, is_long);
      // Load pack handler address (clear the tag bit first)
      __ andr(rscratch1, r0, -2);
      __ ldr(rscratch1, Address(rscratch1, InlineKlass::adr_members_offset()));
      __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
      __ blr(rscratch1);
      __ b(exit);
    }

    __ BIND(is_long);
    __ str(r0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    return start;
  }
  427 
  428   // Return point for a Java call if there's an exception thrown in
  429   // Java code.  The exception is caught and transformed into a
  430   // pending exception stored in JavaThread that can be tested from
  431   // within the VM.
  432   //
  433   // Note: Usually the parameters are removed by the callee. In case
  434   // of an exception crossing an activation frame boundary, that is
  435   // not the case if the callee is compiled code => need to setup the
  436   // rsp.
  437   //
  438   // r0: exception oop
  439 
  // Generate the return point used when a Java call made through the
  // call stub completes with an exception (see contract comment
  // above): record the exception oop as the thread's pending
  // exception, then resume in the call stub at its saved return
  // address so the normal epilogue runs.
  address generate_catch_exception() {
    StubId stub_id = StubId::stubgen_catch_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    // record where the exception was raised (debug aid)
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }
  481 
  482   // Continuation point for runtime calls returning with a pending
  483   // exception.  The pending exception check happened in the runtime
  484   // or native call stub.  The pending exception in Thread is
  485   // converted into a Java-level exception.
  486   //
  487   // Contract with Java-level exception handlers:
  488   // r0: exception
  489   // r3: throwing pc
  490   //
  491   // NOTE: At entry of this stub, exception-pc must be in LR !!
  492 
  493   // NOTE: this is always used as a jump target within generated code
  494   // so it just needs to be generated code with no x86 prolog
  495 
  // Generate the continuation used after a runtime call that left a
  // pending exception in the thread: look up the Java-level handler
  // for the throwing pc (which is in LR on entry), load the pending
  // exception into r0, the throwing pc into r3, and jump to the
  // handler. See the contract comment above.
  address generate_forward_exception() {
    StubId stub_id = StubId::stubgen_forward_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);            // r3 <- throwing pc
    __ mov(r19, r0);            // r19 <- handler address (VM call result)
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }
  572 
  573   // Non-destructive plausibility checks for oops
  574   //
  575   // Arguments:
  576   //    r0: oop to verify
  577   //    rscratch1: error message
  578   //
  579   // Stack after saving c_rarg3:
  580   //    [tos + 0]: saved c_rarg3
  581   //    [tos + 1]: saved c_rarg2
  582   //    [tos + 2]: saved lr
  583   //    [tos + 3]: saved rscratch2
  584   //    [tos + 4]: saved r0
  585   //    [tos + 5]: saved rscratch1
  // Generate the oop plausibility-check stub used by verify_oop (see
  // the argument and stack-layout comment above). On success it
  // returns to the caller; on failure it dumps all registers and
  // calls MacroAssembler::debug64, then halts.
  address generate_verify_oop() {
    StubId stub_id = StubId::stubgen_verify_oop_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // bump the global verify_oop counter
    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    // delegate the actual checks to the active GC barrier set
    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message (in rscratch1 on entry)
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }
  634 
  635   // Generate indices for iota vector.
  // Generate indices for iota vector.
  // Emits a constant table of ascending lane indices, one 16-byte
  // entry per element size (B/H/S/D integral, then S/D floating
  // point). Data is emitted low-order lane first (little-endian).
  address generate_iota_indices(StubId stub_id) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    // B: bytes 0..15
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H: halfwords 0..7
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S: words 0..3
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D: doublewords 0..1
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }
  660 
  661   // The inner part of zero_words().  This is the bulk operation,
  662   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  663   // caller is responsible for zeroing the last few words.
  664   //
  665   // Inputs:
  666   // r10: the HeapWord-aligned base address of an array to zero.
  667   // r11: the count in HeapWords, r11 > 0.
  668   //
  669   // Returns r10 and r11, adjusted for the caller to clear.
  670   // r10: the base address of the tail of words left to clear.
  671   // r11: the number of words in the tail.
  672   //      r11 < MacroAssembler::zero_words_block_size.
  673 
  // Generate the bulk inner loop of zero_words() (see input/output
  // contract in the comment above): zero whole blocks, using DC ZVA
  // when block zeroing is enabled, and leave the sub-block tail in
  // r10/r11 for the caller to finish.
  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_zero_blocks_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      // align base to 16 bytes: if bit 3 is set, store one zero word
      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);   // limit is in bytes; cnt in words
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      // undo the last subtraction so cnt holds the true remaining word count
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }
  728 
  729 
  // Direction multiplier for the arraycopy stubs: +1 steps addresses
  // forwards, -1 backwards (scaled by wordSize to form the stride).
  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;
  734 
  735   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  736   // for arraycopy stubs.
  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs. Captures the barrier-set assembler, the
  // decorators/type of the copy, and the GC temp registers once, so
  // call sites only name the data registers and address.
  class ArrayCopyBarrierSetHelper : StackObj {
    BarrierSetAssembler* _bs_asm;   // active GC barrier-set assembler
    MacroAssembler* _masm;
    DecoratorSet _decorators;       // copy decorators (e.g. uninitialized)
    BasicType _type;                // element type of the copy
    Register _gct1;                 // GC temp registers handed to the barrier
    Register _gct2;
    Register _gct3;
    FloatRegister _gcvt1;           // GC temp vector registers
    FloatRegister _gcvt2;
    FloatRegister _gcvt3;

  public:
    ArrayCopyBarrierSetHelper(MacroAssembler* masm,
                              DecoratorSet decorators,
                              BasicType type,
                              Register gct1,
                              Register gct2,
                              Register gct3,
                              FloatRegister gcvt1,
                              FloatRegister gcvt2,
                              FloatRegister gcvt3)
      : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
        _masm(masm),
        _decorators(decorators),
        _type(type),
        _gct1(gct1),
        _gct2(gct2),
        _gct3(gct3),
        _gcvt1(gcvt1),
        _gcvt2(gcvt2),
        _gcvt3(gcvt3) {
    }

    // 32-byte load through the barrier into two vector registers.
    void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
                            dst1, dst2, src,
                            _gct1, _gct2, _gcvt1);
    }

    // 32-byte store through the barrier from two vector registers.
    void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
    }

    // 16-byte load through the barrier into a register pair.
    void copy_load_at_16(Register dst1, Register dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
                            dst1, dst2, src,
                            _gct1);
    }

    // 16-byte store through the barrier from a register pair.
    void copy_store_at_16(Address dst, Register src1, Register src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3);
    }

    // 8-byte load through the barrier into a single register.
    void copy_load_at_8(Register dst, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
                            dst, noreg, src,
                            _gct1);
    }

    // 8-byte store through the barrier from a single register.
    void copy_store_at_8(Address dst, Register src) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
                             dst, src, noreg,
                             _gct1, _gct2, _gct3);
    }
  };
  807 
  808   // Bulk copy of blocks of 8 words.
  809   //
  810   // count is a count of words.
  811   //
  812   // Precondition: count >= 8
  813   //
  814   // Postconditions:
  815   //
  816   // The least significant bit of count contains the remaining count
  817   // of words to copy.  The rest of count is trash.
  818   //
  819   // s and d are adjusted to point to the remaining words to copy
  820   //
  821   address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
  822     BasicType type;
  823     copy_direction direction;
  824 
  825     switch (stub_id) {
  826     case StubId::stubgen_copy_byte_f_id:
  827       direction = copy_forwards;
  828       type = T_BYTE;
  829       break;
  830     case StubId::stubgen_copy_byte_b_id:
  831       direction = copy_backwards;
  832       type = T_BYTE;
  833       break;
  834     case StubId::stubgen_copy_oop_f_id:
  835       direction = copy_forwards;
  836       type = T_OBJECT;
  837       break;
  838     case StubId::stubgen_copy_oop_b_id:
  839       direction = copy_backwards;
  840       type = T_OBJECT;
  841       break;
  842     case StubId::stubgen_copy_oop_uninit_f_id:
  843       direction = copy_forwards;
  844       type = T_OBJECT;
  845       break;
  846     case StubId::stubgen_copy_oop_uninit_b_id:
  847       direction = copy_backwards;
  848       type = T_OBJECT;
  849       break;
  850     default:
  851       ShouldNotReachHere();
  852     }
  853 
  854     int unit = wordSize * direction;
  855     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
  856 
  857     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  858       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  859     const Register stride = r14;
  860     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  861     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  862     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  863 
  864     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  865     assert_different_registers(s, d, count, rscratch1, rscratch2);
  866 
  867     Label again, drain;
  868 
  869     __ align(CodeEntryAlignment);
  870 
  871     StubCodeMark mark(this, stub_id);
  872 
  873     address start = __ pc();
  874 
  875     Label unaligned_copy_long;
  876     if (AvoidUnalignedAccesses) {
  877       __ tbnz(d, 3, unaligned_copy_long);
  878     }
  879 
  880     if (direction == copy_forwards) {
  881       __ sub(s, s, bias);
  882       __ sub(d, d, bias);
  883     }
  884 
  885 #ifdef ASSERT
  886     // Make sure we are never given < 8 words
  887     {
  888       Label L;
  889       __ cmp(count, (u1)8);
  890       __ br(Assembler::GE, L);
  891       __ stop("genrate_copy_longs called with < 8 words");
  892       __ bind(L);
  893     }
  894 #endif
  895 
  896     // Fill 8 registers
  897     if (UseSIMDForMemoryOps) {
  898       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  899       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  900     } else {
  901       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  902       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  903       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  904       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  905     }
  906 
  907     __ subs(count, count, 16);
  908     __ br(Assembler::LO, drain);
  909 
  910     int prefetch = PrefetchCopyIntervalInBytes;
  911     bool use_stride = false;
  912     if (direction == copy_backwards) {
  913       use_stride = prefetch > 256;
  914       prefetch = -prefetch;
  915       if (use_stride) __ mov(stride, prefetch);
  916     }
  917 
  918     __ bind(again);
  919 
  920     if (PrefetchCopyIntervalInBytes > 0)
  921       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  922 
  923     if (UseSIMDForMemoryOps) {
  924       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  925       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  926       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  927       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  928     } else {
  929       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  930       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  931       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  932       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  933       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  934       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  937     }
  938 
  939     __ subs(count, count, 8);
  940     __ br(Assembler::HS, again);
  941 
  942     // Drain
  943     __ bind(drain);
  944     if (UseSIMDForMemoryOps) {
  945       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  946       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  947     } else {
  948       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  949       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  950       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  951       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  952     }
  953 
  954     {
  955       Label L1, L2;
  956       __ tbz(count, exact_log2(4), L1);
  957       if (UseSIMDForMemoryOps) {
  958         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  959         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  960       } else {
  961         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  962         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  963         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  964         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  965       }
  966       __ bind(L1);
  967 
  968       if (direction == copy_forwards) {
  969         __ add(s, s, bias);
  970         __ add(d, d, bias);
  971       }
  972 
  973       __ tbz(count, 1, L2);
  974       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  975       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  976       __ bind(L2);
  977     }
  978 
  979     __ ret(lr);
  980 
  981     if (AvoidUnalignedAccesses) {
  982       Label drain, again;
  983       // Register order for storing. Order is different for backward copy.
  984 
  985       __ bind(unaligned_copy_long);
  986 
  987       // source address is even aligned, target odd aligned
  988       //
  989       // when forward copying word pairs we read long pairs at offsets
  990       // {0, 2, 4, 6} (in long words). when backwards copying we read
  991       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  992       // address by -2 in the forwards case so we can compute the
  993       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  994       // or -1.
  995       //
  996       // when forward copying we need to store 1 word, 3 pairs and
  997       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
  998       // zero offset We adjust the destination by -1 which means we
  999       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 1000       //
 1001       // When backwards copyng we need to store 1 word, 3 pairs and
 1002       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 1003       // offsets {1, 3, 5, 7, 8} * unit.
 1004 
 1005       if (direction == copy_forwards) {
 1006         __ sub(s, s, 16);
 1007         __ sub(d, d, 8);
 1008       }
 1009 
 1010       // Fill 8 registers
 1011       //
 1012       // for forwards copy s was offset by -16 from the original input
 1013       // value of s so the register contents are at these offsets
 1014       // relative to the 64 bit block addressed by that original input
 1015       // and so on for each successive 64 byte block when s is updated
 1016       //
 1017       // t0 at offset 0,  t1 at offset 8
 1018       // t2 at offset 16, t3 at offset 24
 1019       // t4 at offset 32, t5 at offset 40
 1020       // t6 at offset 48, t7 at offset 56
 1021 
 1022       // for backwards copy s was not offset so the register contents
 1023       // are at these offsets into the preceding 64 byte block
 1024       // relative to that original input and so on for each successive
 1025       // preceding 64 byte block when s is updated. this explains the
 1026       // slightly counter-intuitive looking pattern of register usage
 1027       // in the stp instructions for backwards copy.
 1028       //
 1029       // t0 at offset -16, t1 at offset -8
 1030       // t2 at offset -32, t3 at offset -24
 1031       // t4 at offset -48, t5 at offset -40
 1032       // t6 at offset -64, t7 at offset -56
 1033 
 1034       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1035       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1036       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1037       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1038 
 1039       __ subs(count, count, 16);
 1040       __ br(Assembler::LO, drain);
 1041 
 1042       int prefetch = PrefetchCopyIntervalInBytes;
 1043       bool use_stride = false;
 1044       if (direction == copy_backwards) {
 1045         use_stride = prefetch > 256;
 1046         prefetch = -prefetch;
 1047         if (use_stride) __ mov(stride, prefetch);
 1048       }
 1049 
 1050       __ bind(again);
 1051 
 1052       if (PrefetchCopyIntervalInBytes > 0)
 1053         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1054 
 1055       if (direction == copy_forwards) {
 1056         // allowing for the offset of -8 the store instructions place
 1057         // registers into the target 64 bit block at the following
 1058         // offsets
 1059         //
 1060         // t0 at offset 0
 1061         // t1 at offset 8,  t2 at offset 16
 1062         // t3 at offset 24, t4 at offset 32
 1063         // t5 at offset 40, t6 at offset 48
 1064         // t7 at offset 56
 1065 
 1066         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1067         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1068         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1069         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1070         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1071         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1072         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1073         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1074         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1075       } else {
 1076         // d was not offset when we started so the registers are
 1077         // written into the 64 bit block preceding d with the following
 1078         // offsets
 1079         //
 1080         // t1 at offset -8
 1081         // t3 at offset -24, t0 at offset -16
 1082         // t5 at offset -48, t2 at offset -32
 1083         // t7 at offset -56, t4 at offset -48
 1084         //                   t6 at offset -64
 1085         //
 1086         // note that this matches the offsets previously noted for the
 1087         // loads
 1088 
 1089         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1090         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1091         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1092         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1093         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1094         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1095         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1096         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1097         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1098       }
 1099 
 1100       __ subs(count, count, 8);
 1101       __ br(Assembler::HS, again);
 1102 
 1103       // Drain
 1104       //
 1105       // this uses the same pattern of offsets and register arguments
 1106       // as above
 1107       __ bind(drain);
 1108       if (direction == copy_forwards) {
 1109         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1110         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1111         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1112         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1113         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1114       } else {
 1115         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1116         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1117         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1118         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1119         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1120       }
 1121       // now we need to copy any remaining part block which may
 1122       // include a 4 word block subblock and/or a 2 word subblock.
 1123       // bits 2 and 1 in the count are the tell-tale for whether we
 1124       // have each such subblock
 1125       {
 1126         Label L1, L2;
 1127         __ tbz(count, exact_log2(4), L1);
 1128         // this is the same as above but copying only 4 longs hence
 1129         // with only one intervening stp between the str instructions
 1130         // but note that the offsets and registers still follow the
 1131         // same pattern
 1132         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1133         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1134         if (direction == copy_forwards) {
 1135           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1136           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1137           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1141           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1142         }
 1143         __ bind(L1);
 1144 
 1145         __ tbz(count, 1, L2);
 1146         // this is the same as above but copying only 2 longs hence
 1147         // there is no intervening stp between the str instructions
 1148         // but note that the offset and register patterns are still
 1149         // the same
 1150         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1151         if (direction == copy_forwards) {
 1152           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1153           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1154         } else {
 1155           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1156           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1157         }
 1158         __ bind(L2);
 1159 
 1160         // for forwards copy we need to re-adjust the offsets we
 1161         // applied so that s and d are follow the last words written
 1162 
 1163         if (direction == copy_forwards) {
 1164           __ add(s, s, 16);
 1165           __ add(d, d, 8);
 1166         }
 1167 
 1168       }
 1169 
 1170       __ ret(lr);
 1171     }
 1172 
 1173     return start;
 1174   }
 1175 
  // Small copy: less than 16 bytes.
  //
  // Copies up to 15 bytes by testing bits of count (a count of units
  // of size granularity) and emitting at most one 8-, 4-, 2- and
  // 1-byte move each.  step is signed: its magnitude is the element
  // size and its sign gives the copy direction (negative => backwards).
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = g_uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    // No vector temps needed: nothing here moves more than 8 bytes.
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    // 8-byte chunk: taken when count * granularity has bit 3 set.
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ bind(Lword);

    // 4-byte chunk, only possible when elements are <= 4 bytes.
    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    // 2-byte chunk, only possible when elements are <= 2 bytes.
    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    // Final odd byte, only possible for byte-element copies.
    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }
 1226 
  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //
  // Small copies (up to 80 bytes, or 96 with SIMD) are emitted
  // inline; larger copies align s and call out to the pre-generated
  // copy_longs stubs, with copy_memory_small handling the tail.

  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = g_uabs(step); // element size in bytes
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    const Register send = r17, dend = r16; // one-past-the-end source/dest addresses
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    // Anything above the inline threshold goes to the bulk copy path.
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes: the head is copied from s/d, the (possibly
    // overlapping) tail from send/dend, so any length in the range is
    // covered without branching.
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does means in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      // For a backwards copy start from the far (high) end.
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // can not be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
      bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(r15, s, 2 * wordSize - 1);
      } else {
        __ neg(r15, s);
        __ andr(r15, r15, 2 * wordSize - 1);
      }
      // r15 is the byte adjustment needed to align s.
      __ cbz(r15, aligned);
      int shift = exact_log2(granularity);
      if (shift > 0) {
        // Convert the byte adjustment into a unit count.
        __ lsr(r15, r15, shift);
      }
      __ sub(count, count, r15);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, r15);
        __ sub(d, d, r15);
      } else {
        __ add(s, s, r15);
        __ add(d, d, r15);
      }
#else
      copy_memory_small(decorators, type, s, d, r15, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes. Adjust the
    // count and do a bulk copy of words. If the shift is zero
    // perform a move instead to benefit from zero latency moves.
    int shift = exact_log2(wordSize/granularity);
    if (shift > 0) {
      __ lsr(r15, count, shift);
    } else {
      __ mov(r15, count);
    }
    // Call the pre-generated bulk copy stub matching direction,
    // element type and (for oops) dest-uninitialized state.
    if (direction == copy_forwards) {
      if (type != T_OBJECT) {
        __ bl(StubRoutines::aarch64::copy_byte_f());
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
      } else {
        __ bl(StubRoutines::aarch64::copy_oop_f());
      }
    } else {
      if (type != T_OBJECT) {
        __ bl(StubRoutines::aarch64::copy_byte_b());
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
      } else {
        __ bl(StubRoutines::aarch64::copy_oop_b());
      }
    }

    // And the tail.
    copy_memory_small(decorators, type, s, d, count, step);

    // copy8/copy4 were never branched to when the granularity is wide
    // enough; bind them here so the labels are always resolved.
    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }
 1487 
 1488 
 1489   void clobber_registers() {
 1490 #ifdef ASSERT
 1491     RegSet clobbered
 1492       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1493     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1494     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1495     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1496       __ mov(*it, rscratch1);
 1497     }
 1498 #endif
 1499 
 1500   }
 1501 
 1502   // Scan over array at a for count oops, verifying each one.
 1503   // Preserves a and count, clobbers rscratch1 and rscratch2.
 1504   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1505     Label loop, end;
 1506     __ mov(rscratch1, a);
 1507     __ mov(rscratch2, zr);
 1508     __ bind(loop);
 1509     __ cmp(rscratch2, count);
 1510     __ br(Assembler::HS, end);
 1511     if (size == wordSize) {
 1512       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1513       __ verify_oop(temp);
 1514     } else {
 1515       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1516       __ decode_heap_oop(temp); // calls verify_oop
 1517     }
 1518     __ add(rscratch2, rscratch2, 1);
 1519     __ b(loop);
 1520     __ bind(end);
 1521   }
 1522 
 1523   // Arguments:
 1524   //   stub_id - is used to name the stub and identify all details of
 1525   //             how to perform the copy.
 1526   //
 1527   //   entry - is assigned to the stub's post push entry point unless
 1528   //           it is null
 1529   //
 1530   // Inputs:
 1531   //   c_rarg0   - source array address
 1532   //   c_rarg1   - destination array address
 1533   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1534   //
 1535   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1536   // the hardware handle it.  The two dwords within qwords that span
 1537   // cache line boundaries will still be loaded and stored atomically.
 1538   //
 1539   // Side Effects: nopush_entry is set to the (post push) entry point
 1540   //               so it can be used by the corresponding conjoint
 1541   //               copy method
 1542   //
  address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    // Copy parameters, all decoded from stub_id by the switch below:
    int size;                // element size in bytes
    bool aligned;            // caller guarantees suitably aligned addresses
    bool is_oop;             // copying oops, so GC barriers are required
    bool dest_uninitialized; // destination holds no live oops yet
    switch (stub_id) {
    case StubId::stubgen_jbyte_disjoint_arraycopy_id:
      size = sizeof(jbyte);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
      size = sizeof(jbyte);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_jshort_disjoint_arraycopy_id:
      size = sizeof(jshort);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
      size = sizeof(jshort);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_jint_disjoint_arraycopy_id:
      size = sizeof(jint);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
      size = sizeof(jint);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_jlong_disjoint_arraycopy_id:
      // since this is always aligned we can (should!) use the same
      // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
      ShouldNotReachHere();
      break;
    case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
      size = sizeof(jlong);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_oop_disjoint_arraycopy_id:
      // oop element size depends on heap oop compression
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
      break;
    }

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    // Record the post-frame-push entry point so a caller that has
    // already pushed its own frame (e.g. the conjoint stub's
    // no-overlap branch) can jump straight here.
    if (nopush_entry != nullptr) {
      *nopush_entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory; d and count are needed again by
      // the epilogue and the optional verification pass below
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      // (recovery entry is added only for primitive, non-oop copies)
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }
 1673 
 1674   // Arguments:
 1675   //   stub_id - is used to name the stub and identify all details of
 1676   //             how to perform the copy.
 1677   //
  //   nooverlap_target - identifies the (post push) entry for the
 1679   //             corresponding disjoint copy routine which can be
 1680   //             jumped to if the ranges do not actually overlap
 1681   //
  //   nopush_entry - is assigned to the stub's post push entry point
  //           unless it is null
 1684   //
 1685   //
 1686   // Inputs:
 1687   //   c_rarg0   - source array address
 1688   //   c_rarg1   - destination array address
 1689   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1690   //
 1691   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1692   // the hardware handle it.  The two dwords within qwords that span
 1693   // cache line boundaries will still be loaded and stored atomically.
 1694   //
 1695   // Side Effects:
 1696   //   nopush_entry is set to the no-overlap entry point so it can be
 1697   //   used by some other conjoint copy method
 1698   //
 1699   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
 1700     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1701     RegSet saved_regs = RegSet::of(s, d, count);
 1702     int size;
 1703     bool aligned;
 1704     bool is_oop;
 1705     bool dest_uninitialized;
 1706     switch (stub_id) {
 1707     case StubId::stubgen_jbyte_arraycopy_id:
 1708       size = sizeof(jbyte);
 1709       aligned = false;
 1710       is_oop = false;
 1711       dest_uninitialized = false;
 1712       break;
 1713     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 1714       size = sizeof(jbyte);
 1715       aligned = true;
 1716       is_oop = false;
 1717       dest_uninitialized = false;
 1718       break;
 1719     case StubId::stubgen_jshort_arraycopy_id:
 1720       size = sizeof(jshort);
 1721       aligned = false;
 1722       is_oop = false;
 1723       dest_uninitialized = false;
 1724       break;
 1725     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 1726       size = sizeof(jshort);
 1727       aligned = true;
 1728       is_oop = false;
 1729       dest_uninitialized = false;
 1730       break;
 1731     case StubId::stubgen_jint_arraycopy_id:
 1732       size = sizeof(jint);
 1733       aligned = false;
 1734       is_oop = false;
 1735       dest_uninitialized = false;
 1736       break;
 1737     case StubId::stubgen_arrayof_jint_arraycopy_id:
 1738       size = sizeof(jint);
 1739       aligned = true;
 1740       is_oop = false;
 1741       dest_uninitialized = false;
 1742       break;
 1743     case StubId::stubgen_jlong_arraycopy_id:
 1744       // since this is always aligned we can (should!) use the same
 1745       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1746       ShouldNotReachHere();
 1747       break;
 1748     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 1749       size = sizeof(jlong);
 1750       aligned = true;
 1751       is_oop = false;
 1752       dest_uninitialized = false;
 1753       break;
 1754     case StubId::stubgen_oop_arraycopy_id:
 1755       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1756       aligned = !UseCompressedOops;
 1757       is_oop = true;
 1758       dest_uninitialized = false;
 1759       break;
 1760     case StubId::stubgen_arrayof_oop_arraycopy_id:
 1761       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1762       aligned = !UseCompressedOops;
 1763       is_oop = true;
 1764       dest_uninitialized = false;
 1765       break;
 1766     case StubId::stubgen_oop_arraycopy_uninit_id:
 1767       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1768       aligned = !UseCompressedOops;
 1769       is_oop = true;
 1770       dest_uninitialized = true;
 1771       break;
 1772     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 1773       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1774       aligned = !UseCompressedOops;
 1775       is_oop = true;
 1776       dest_uninitialized = true;
 1777       break;
 1778     default:
 1779       ShouldNotReachHere();
 1780     }
 1781 
 1782     StubCodeMark mark(this, stub_id);
 1783     address start = __ pc();
 1784     __ enter();
 1785 
 1786     if (nopush_entry != nullptr) {
 1787       *nopush_entry = __ pc();
 1788       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1789       BLOCK_COMMENT("Entry:");
 1790     }
 1791 
 1792     // use fwd copy when (d-s) above_equal (count*size)
 1793     Label L_overlapping;
 1794     __ sub(rscratch1, d, s);
 1795     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1796     __ br(Assembler::LO, L_overlapping);
 1797     __ b(RuntimeAddress(nooverlap_target));
 1798     __ bind(L_overlapping);
 1799 
 1800     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1801     if (dest_uninitialized) {
 1802       decorators |= IS_DEST_UNINITIALIZED;
 1803     }
 1804     if (aligned) {
 1805       decorators |= ARRAYCOPY_ALIGNED;
 1806     }
 1807 
 1808     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1809     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1810 
 1811     if (is_oop) {
 1812       // save regs before copy_memory
 1813       __ push(RegSet::of(d, count), sp);
 1814     }
 1815     {
 1816       // UnsafeMemoryAccess page error: continue after unsafe access
 1817       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1818       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1819       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1820     }
 1821     if (is_oop) {
 1822       __ pop(RegSet::of(d, count), sp);
 1823       if (VerifyOops)
 1824         verify_oop_array(size, d, count, r16);
 1825     }
 1826     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1827     __ leave();
 1828     __ mov(r0, zr); // return 0
 1829     __ ret(lr);
 1830     return start;
 1831   }
 1832 
 1833   // Helper for generating a dynamic type check.
 1834   // Smashes rscratch1, rscratch2.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Register temp1,
                           Register temp2,
                           Register result,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    // Fast path: branches to L_success on a hit and to L_miss on a
    // definite miss; otherwise falls through to the slow path.
    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
                                     super_check_offset);
    // Slow path: full subtype check using temp1/temp2 as scratch.
    // NOTE(review): 'result' is not referenced in this body —
    // presumably kept for signature compatibility; confirm before use.
    __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);

    // Fall through on failure!
    __ BIND(L_miss);
  }
 1855 
 1856   //
 1857   //  Generate checkcasting array copy stub
 1858   //
 1859   //  Input:
 1860   //    c_rarg0   - source array address
 1861   //    c_rarg1   - destination array address
 1862   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1863   //    c_rarg3   - size_t ckoff (super_check_offset)
 1864   //    c_rarg4   - oop ckval (super_klass)
 1865   //
 1866   //  Output:
 1867   //    r0 ==  0  -  success
 1868   //    r0 == -1^K - failure, where K is partial transfer count
 1869   //
  address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
    // Whether the destination holds no live oops (decoded from stub_id).
    bool dest_uninitialized;
    switch (stub_id) {
    case StubId::stubgen_checkcast_arraycopy_id:
      dest_uninitialized = false;
      break;
    case StubId::stubgen_checkcast_arraycopy_uninit_id:
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
    }

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);

    // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
    const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register r19_klass   = r19;       // oop._klass

    // Registers used as gc temps (r5, r6, r7 are save-on-call)
    const Register gct1 = r5, gct2 = r6, gct3 = r7;

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      __ b(L);                  // conjoint check not yet implemented
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (nopush_entry != nullptr) {
      *nopush_entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

     // Empty array:  Nothing to do.
    __ cbz(count, L_done);
    // Preserve the callee-saved temps used by the copy loop.
    __ push(RegSet::of(r19, r20, r21, r22), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
    bool is_oop = true;
    int element_size = UseCompressedOops ? 4 : 8;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    // Store the checked oop with post-increment of 'to'.
    bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
                      __ post(to, element_size), copied_oop, noreg,
                      gct1, gct2, gct3);
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    // Load the next oop with post-increment of 'from'.
    bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
                     copied_oop, noreg, __ post(from, element_size),
                     gct1);
    // nulls need no type check; store directly.
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop);// query the object klass

    BLOCK_COMMENT("type_check:");
    generate_type_check(/*sub_klass*/r19_klass,
                        /*super_check_offset*/ckoff,
                        /*super_klass*/ckval,
                        /*r_array_base*/gct1,
                        /*temp2*/gct2,
                        /*result*/r10, L_store_element);

    // Fall through on failure!

    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller
    // Flags are still from the subs: EQ means K == 0, i.e. nothing was
    // copied, so no card marks are needed.
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r19, r20, r21, r22), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    // r0 is zr (0) on the empty-array path, otherwise 0 or -1^K as set above.
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }
 2031 
 2032   // Perform range checks on the proposed arraycopy.
 2033   // Kills temp, but nothing else.
 2034   // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    // unsigned compare: also fails on 32-bit wraparound of the sum
    __ br(Assembler::HI, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    // A 32-bit mov zero-extends into the upper half of the register.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }
 2064 
 2065   // These stubs get called from some dumb test routine.
 2066   // I'll write them properly when they're called from
 2067   // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    // Placeholder stub: only ever legal to call with a zero element count.
    assert(count == 0, "huh?");
  }
 2071 
 2072 
 2073   //
 2074   //  Generate 'unsafe' array copy stub
 2075   //  Though just as safe as the other stubs, it takes an unscaled
 2076   //  size_t argument instead of an element count.
 2077   //
 2078   //  Input:
 2079   //    c_rarg0   - source array address
 2080   //    c_rarg1   - destination array address
 2081   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2082   //
 2083   // Examines the alignment of the operands and dispatches
 2084   // to a long, int, short, or byte copy loop.
 2085   //
  address generate_unsafe_copy(address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {
    StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;

    Label L_long_aligned, L_int_aligned, L_short_aligned;
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    // OR together source, destination and byte count: an alignment bit
    // is clear in the result only if it is clear in all three.
    __ orr(rscratch1, s, d);
    __ orr(rscratch1, rscratch1, count);

    // Dispatch on the widest common alignment, widest first.
    __ andr(rscratch1, rscratch1, BytesPerLong-1);
    __ cbz(rscratch1, L_long_aligned);
    __ andr(rscratch1, rscratch1, BytesPerInt-1);
    __ cbz(rscratch1, L_int_aligned);
    __ tbz(rscratch1, 0, L_short_aligned);
    // Odd alignment: fall back to the byte copy (count is already bytes).
    __ b(RuntimeAddress(byte_copy_entry));

    __ BIND(L_short_aligned);
    __ lsr(count, count, LogBytesPerShort);  // size => short_count
    __ b(RuntimeAddress(short_copy_entry));
    __ BIND(L_int_aligned);
    __ lsr(count, count, LogBytesPerInt);    // size => int_count
    __ b(RuntimeAddress(int_copy_entry));
    __ BIND(L_long_aligned);
    __ lsr(count, count, LogBytesPerLong);   // size => long_count
    __ b(RuntimeAddress(long_copy_entry));

    return start;
  }
 2125 
 2126   //
 2127   //  Generate generic array copy stubs
 2128   //
 2129   //  Input:
 2130   //    c_rarg0    -  src oop
 2131   //    c_rarg1    -  src_pos (32-bits)
 2132   //    c_rarg2    -  dst oop
 2133   //    c_rarg3    -  dst_pos (32-bits)
 2134   //    c_rarg4    -  element count (32-bits)
 2135   //
 2136   //  Output:
 2137   //    r0 ==  0  -  success
 2138   //    r0 == -1^K - failure, where K is partial transfer count
 2139   //
 2140   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2141                                 address int_copy_entry, address oop_copy_entry,
 2142                                 address long_copy_entry, address checkcast_copy_entry) {
 2143     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2144 
 2145     Label L_failed, L_objArray;
 2146     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2147 
 2148     // Input registers
 2149     const Register src        = c_rarg0;  // source array oop
 2150     const Register src_pos    = c_rarg1;  // source position
 2151     const Register dst        = c_rarg2;  // destination array oop
 2152     const Register dst_pos    = c_rarg3;  // destination position
 2153     const Register length     = c_rarg4;
 2154 
 2155 
 2156     // Registers used as temps
 2157     const Register dst_klass  = c_rarg5;
 2158 
 2159     __ align(CodeEntryAlignment);
 2160 
 2161     StubCodeMark mark(this, stub_id);
 2162 
 2163     address start = __ pc();
 2164 
 2165     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2166 
 2167     // bump this on entry, not on exit:
 2168     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2169 
 2170     //-----------------------------------------------------------------------
 2171     // Assembler stub will be used for this call to arraycopy
 2172     // if the following conditions are met:
 2173     //
 2174     // (1) src and dst must not be null.
 2175     // (2) src_pos must not be negative.
 2176     // (3) dst_pos must not be negative.
 2177     // (4) length  must not be negative.
 2178     // (5) src klass and dst klass should be the same and not null.
 2179     // (6) src and dst should be arrays.
 2180     // (7) src_pos + length must not exceed length of src.
 2181     // (8) dst_pos + length must not exceed length of dst.
 2182     //
 2183 
 2184     //  if (src == nullptr) return -1;
 2185     __ cbz(src, L_failed);
 2186 
 2187     //  if (src_pos < 0) return -1;
 2188     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2189 
 2190     //  if (dst == nullptr) return -1;
 2191     __ cbz(dst, L_failed);
 2192 
 2193     //  if (dst_pos < 0) return -1;
 2194     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2195 
 2196     // registers used as temp
 2197     const Register scratch_length    = r16; // elements count to copy
 2198     const Register scratch_src_klass = r17; // array klass
 2199     const Register lh                = r15; // layout helper
 2200 
 2201     //  if (length < 0) return -1;
 2202     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2203     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2204 
 2205     __ load_klass(scratch_src_klass, src);
 2206 #ifdef ASSERT
 2207     //  assert(src->klass() != nullptr);
 2208     {
 2209       BLOCK_COMMENT("assert klasses not null {");
 2210       Label L1, L2;
 2211       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2212       __ bind(L1);
 2213       __ stop("broken null klass");
 2214       __ bind(L2);
 2215       __ load_klass(rscratch1, dst);
 2216       __ cbz(rscratch1, L1);     // this would be broken also
 2217       BLOCK_COMMENT("} assert klasses not null done");
 2218     }
 2219 #endif
 2220 
 2221     // Load layout helper (32-bits)
 2222     //
 2223     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2224     // 32        30    24            16              8     2                 0
 2225     //
 2226     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2227     //
 2228 
 2229     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2230 
 2231     // Handle objArrays completely differently...
 2232     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2233     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2234     __ movw(rscratch1, objArray_lh);
 2235     __ eorw(rscratch2, lh, rscratch1);
 2236     __ cbzw(rscratch2, L_objArray);
 2237 
 2238     //  if (src->klass() != dst->klass()) return -1;
 2239     __ load_klass(rscratch2, dst);
 2240     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2241     __ cbnz(rscratch2, L_failed);
 2242 
 2243     // Check for flat inline type array -> return -1
 2244     __ test_flat_array_oop(src, rscratch2, L_failed);
 2245 
 2246     // Check for null-free (non-flat) inline type array -> handle as object array
 2247     __ test_null_free_array_oop(src, rscratch2, L_objArray);
 2248 
 2249     //  if (!src->is_Array()) return -1;
 2250     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2251 
 2252     // At this point, it is known to be a typeArray (array_tag 0x3).
 2253 #ifdef ASSERT
 2254     {
 2255       BLOCK_COMMENT("assert primitive array {");
 2256       Label L;
 2257       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2258       __ cmpw(lh, rscratch2);
 2259       __ br(Assembler::GE, L);
 2260       __ stop("must be a primitive array");
 2261       __ bind(L);
 2262       BLOCK_COMMENT("} assert primitive array done");
 2263     }
 2264 #endif
 2265 
 2266     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2267                            rscratch2, L_failed);
 2268 
 2269     // TypeArrayKlass
 2270     //
 2271     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2272     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2273     //
 2274 
 2275     const Register rscratch1_offset = rscratch1;    // array offset
 2276     const Register r15_elsize = lh; // element size
 2277 
 2278     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2279            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2280     __ add(src, src, rscratch1_offset);           // src array offset
 2281     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2282     BLOCK_COMMENT("choose copy loop based on element size");
 2283 
 2284     // next registers should be set before the jump to corresponding stub
 2285     const Register from     = c_rarg0;  // source array address
 2286     const Register to       = c_rarg1;  // destination array address
 2287     const Register count    = c_rarg2;  // elements count
 2288 
 2289     // 'from', 'to', 'count' registers should be set in such order
 2290     // since they are the same as 'src', 'src_pos', 'dst'.
 2291 
 2292     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2293 
 2294     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2295     // size in bytes).  We do a simple bitwise binary search.
 2296   __ BIND(L_copy_bytes);
 2297     __ tbnz(r15_elsize, 1, L_copy_ints);
 2298     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2299     __ lea(from, Address(src, src_pos));// src_addr
 2300     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2301     __ movw(count, scratch_length); // length
 2302     __ b(RuntimeAddress(byte_copy_entry));
 2303 
 2304   __ BIND(L_copy_shorts);
 2305     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2306     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2307     __ movw(count, scratch_length); // length
 2308     __ b(RuntimeAddress(short_copy_entry));
 2309 
 2310   __ BIND(L_copy_ints);
 2311     __ tbnz(r15_elsize, 0, L_copy_longs);
 2312     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2313     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2314     __ movw(count, scratch_length); // length
 2315     __ b(RuntimeAddress(int_copy_entry));
 2316 
 2317   __ BIND(L_copy_longs);
 2318 #ifdef ASSERT
 2319     {
 2320       BLOCK_COMMENT("assert long copy {");
 2321       Label L;
 2322       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2323       __ cmpw(r15_elsize, LogBytesPerLong);
 2324       __ br(Assembler::EQ, L);
 2325       __ stop("must be long copy, but elsize is wrong");
 2326       __ bind(L);
 2327       BLOCK_COMMENT("} assert long copy done");
 2328     }
 2329 #endif
 2330     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2331     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2332     __ movw(count, scratch_length); // length
 2333     __ b(RuntimeAddress(long_copy_entry));
 2334 
 2335     // ObjArrayKlass
 2336   __ BIND(L_objArray);
 2337     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2338 
 2339     Label L_plain_copy, L_checkcast_copy;
 2340     //  test array classes for subtyping
 2341     __ load_klass(r15, dst);
 2342     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2343     __ br(Assembler::NE, L_checkcast_copy);
 2344 
 2345     // Identically typed arrays can be copied without element-wise checks.
 2346     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2347                            rscratch2, L_failed);
 2348 
 2349     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2350     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2351     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2352     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2353     __ movw(count, scratch_length); // length
 2354   __ BIND(L_plain_copy);
 2355     __ b(RuntimeAddress(oop_copy_entry));
 2356 
 2357   __ BIND(L_checkcast_copy);
 2358     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2359     {
 2360       // Before looking at dst.length, make sure dst is also an objArray.
 2361       __ ldrw(rscratch1, Address(r15, lh_offset));
 2362       __ movw(rscratch2, objArray_lh);
 2363       __ eorw(rscratch1, rscratch1, rscratch2);
 2364       __ cbnzw(rscratch1, L_failed);
 2365 
 2366       // It is safe to examine both src.length and dst.length.
 2367       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2368                              r15, L_failed);
 2369 
 2370       __ load_klass(dst_klass, dst); // reload
 2371 
 2372       // Marshal the base address arguments now, freeing registers.
 2373       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2374       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2375       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2376       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2377       __ movw(count, length);           // length (reloaded)
 2378       Register sco_temp = c_rarg3;      // this register is free now
 2379       assert_different_registers(from, to, count, sco_temp,
 2380                                  dst_klass, scratch_src_klass);
 2381       // assert_clean_int(count, sco_temp);
 2382 
 2383       // Generate the type check.
 2384       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2385       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2386 
 2387       // Smashes rscratch1, rscratch2
 2388       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2389                           L_plain_copy);
 2390 
 2391       // Fetch destination element klass from the ObjArrayKlass header.
 2392       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2393       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2394       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2395 
 2396       // the checkcast_copy loop needs two extra arguments:
 2397       assert(c_rarg3 == sco_temp, "#3 already in place");
 2398       // Set up arguments for checkcast_copy_entry.
 2399       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2400       __ b(RuntimeAddress(checkcast_copy_entry));
 2401     }
 2402 
 2403   __ BIND(L_failed);
 2404     __ mov(r0, -1);
 2405     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2406     __ ret(lr);
 2407 
 2408     return start;
 2409   }
 2410 
 2411   //
 2412   // Generate stub for array fill. If "aligned" is true, the
 2413   // "to" address is assumed to be heapword aligned.
 2414   //
 2415   // Arguments for generated stub:
 2416   //   to:    c_rarg0
 2417   //   value: c_rarg1
 2418   //   count: c_rarg2 treated as signed
 2419   //
 2420   address generate_fill(StubId stub_id) {
 2421     BasicType t;
 2422     bool aligned;
 2423 
 2424     switch (stub_id) {
 2425     case StubId::stubgen_jbyte_fill_id:
 2426       t = T_BYTE;
 2427       aligned = false;
 2428       break;
 2429     case StubId::stubgen_jshort_fill_id:
 2430       t = T_SHORT;
 2431       aligned = false;
 2432       break;
 2433     case StubId::stubgen_jint_fill_id:
 2434       t = T_INT;
 2435       aligned = false;
 2436       break;
 2437     case StubId::stubgen_arrayof_jbyte_fill_id:
 2438       t = T_BYTE;
 2439       aligned = true;
 2440       break;
 2441     case StubId::stubgen_arrayof_jshort_fill_id:
 2442       t = T_SHORT;
 2443       aligned = true;
 2444       break;
 2445     case StubId::stubgen_arrayof_jint_fill_id:
 2446       t = T_INT;
 2447       aligned = true;
 2448       break;
 2449     default:
 2450       ShouldNotReachHere();
 2451     };
 2452 
 2453     __ align(CodeEntryAlignment);
 2454     StubCodeMark mark(this, stub_id);
 2455     address start = __ pc();
 2456 
 2457     BLOCK_COMMENT("Entry:");
 2458 
 2459     const Register to        = c_rarg0;  // source array address
 2460     const Register value     = c_rarg1;  // value
 2461     const Register count     = c_rarg2;  // elements count
 2462 
 2463     const Register bz_base = r10;        // base for block_zero routine
 2464     const Register cnt_words = r11;      // temp register
 2465 
 2466     __ enter();
 2467 
 2468     Label L_fill_elements, L_exit1;
 2469 
 2470     int shift = -1;
 2471     switch (t) {
 2472       case T_BYTE:
 2473         shift = 0;
 2474         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2475         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2476         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2477         __ br(Assembler::LO, L_fill_elements);
 2478         break;
 2479       case T_SHORT:
 2480         shift = 1;
 2481         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2482         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2483         __ br(Assembler::LO, L_fill_elements);
 2484         break;
 2485       case T_INT:
 2486         shift = 2;
 2487         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2488         __ br(Assembler::LO, L_fill_elements);
 2489         break;
 2490       default: ShouldNotReachHere();
 2491     }
 2492 
 2493     // Align source address at 8 bytes address boundary.
 2494     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2495     if (!aligned) {
 2496       switch (t) {
 2497         case T_BYTE:
 2498           // One byte misalignment happens only for byte arrays.
 2499           __ tbz(to, 0, L_skip_align1);
 2500           __ strb(value, Address(__ post(to, 1)));
 2501           __ subw(count, count, 1);
 2502           __ bind(L_skip_align1);
 2503           // Fallthrough
 2504         case T_SHORT:
 2505           // Two bytes misalignment happens only for byte and short (char) arrays.
 2506           __ tbz(to, 1, L_skip_align2);
 2507           __ strh(value, Address(__ post(to, 2)));
 2508           __ subw(count, count, 2 >> shift);
 2509           __ bind(L_skip_align2);
 2510           // Fallthrough
 2511         case T_INT:
 2512           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2513           __ tbz(to, 2, L_skip_align4);
 2514           __ strw(value, Address(__ post(to, 4)));
 2515           __ subw(count, count, 4 >> shift);
 2516           __ bind(L_skip_align4);
 2517           break;
 2518         default: ShouldNotReachHere();
 2519       }
 2520     }
 2521 
 2522     //
 2523     //  Fill large chunks
 2524     //
 2525     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2526     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2527     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2528     if (UseBlockZeroing) {
 2529       Label non_block_zeroing, rest;
 2530       // If the fill value is zero we can use the fast zero_words().
 2531       __ cbnz(value, non_block_zeroing);
 2532       __ mov(bz_base, to);
 2533       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2534       address tpc = __ zero_words(bz_base, cnt_words);
 2535       if (tpc == nullptr) {
 2536         fatal("CodeCache is full at generate_fill");
 2537       }
 2538       __ b(rest);
 2539       __ bind(non_block_zeroing);
 2540       __ fill_words(to, cnt_words, value);
 2541       __ bind(rest);
 2542     } else {
 2543       __ fill_words(to, cnt_words, value);
 2544     }
 2545 
 2546     // Remaining count is less than 8 bytes. Fill it by a single store.
 2547     // Note that the total length is no less than 8 bytes.
 2548     if (t == T_BYTE || t == T_SHORT) {
 2549       Label L_exit1;
 2550       __ cbzw(count, L_exit1);
 2551       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2552       __ str(value, Address(to, -8));    // overwrite some elements
 2553       __ bind(L_exit1);
 2554       __ leave();
 2555       __ ret(lr);
 2556     }
 2557 
 2558     // Handle copies less than 8 bytes.
 2559     Label L_fill_2, L_fill_4, L_exit2;
 2560     __ bind(L_fill_elements);
 2561     switch (t) {
 2562       case T_BYTE:
 2563         __ tbz(count, 0, L_fill_2);
 2564         __ strb(value, Address(__ post(to, 1)));
 2565         __ bind(L_fill_2);
 2566         __ tbz(count, 1, L_fill_4);
 2567         __ strh(value, Address(__ post(to, 2)));
 2568         __ bind(L_fill_4);
 2569         __ tbz(count, 2, L_exit2);
 2570         __ strw(value, Address(to));
 2571         break;
 2572       case T_SHORT:
 2573         __ tbz(count, 0, L_fill_4);
 2574         __ strh(value, Address(__ post(to, 2)));
 2575         __ bind(L_fill_4);
 2576         __ tbz(count, 1, L_exit2);
 2577         __ strw(value, Address(to));
 2578         break;
 2579       case T_INT:
 2580         __ cbzw(count, L_exit2);
 2581         __ strw(value, Address(to));
 2582         break;
 2583       default: ShouldNotReachHere();
 2584     }
 2585     __ bind(L_exit2);
 2586     __ leave();
 2587     __ ret(lr);
 2588     return start;
 2589   }
 2590 
 2591   address generate_unsafecopy_common_error_exit() {
 2592     address start_pc = __ pc();
 2593       __ leave();
 2594       __ mov(r0, 0);
 2595       __ ret(lr);
 2596     return start_pc;
 2597   }
 2598 
 2599   //
 2600   //  Generate 'unsafe' set memory stub
 2601   //  Though just as safe as the other stubs, it takes an unscaled
 2602   //  size_t (# bytes) argument instead of an element count.
 2603   //
 2604   //  This fill operation is atomicity preserving: as long as the
 2605   //  address supplied is sufficiently aligned, all writes of up to 64
 2606   //  bits in size are single-copy atomic.
 2607   //
 2608   //  Input:
 2609   //    c_rarg0   - destination array address
 2610   //    c_rarg1   - byte count (size_t)
 2611   //    c_rarg2   - byte value
 2612   //
  address generate_unsafe_setmemory() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
    address start = __ pc();

    Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
    Label tail;

    // NOTE(review): marks this whole stub as an unsafe memory access
    // region -- faults presumably route to the registered
    // UnsafeMemoryAccess common exit; confirm the (true, false) flag
    // semantics against the UnsafeMemoryAccessMark declaration.
    UnsafeMemoryAccessMark umam(this, true, false);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // Replicate the fill byte into all 16 lanes of v0.
    __ dup(v0, __ T16B, value);

    if (AvoidUnalignedAccesses) {
      // Fewer than 16 bytes: go straight to the power-of-two tail.
      __ cmp(count, (u1)16);
      __ br(__ LO, tail);

      // Store one (possibly unaligned) 16-byte chunk at dest, then step
      // dest up to the next 16-byte boundary; the following aligned
      // stores simply overlap the bytes already written. If dest was
      // already aligned this consumes exactly the first 16 bytes.
      __ mov(rscratch1, 16);
      __ andr(rscratch2, dest, 15);
      __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
      __ strq(v0, Address(dest));
      __ sub(count, count, rscratch1);
      __ add(dest, dest, rscratch1);
    }

    // Main loop fills 64 bytes per iteration; note count is biased by
    // -64 from here on (see comment at 'tail').
    __ subs(count, count, (u1)64);
    __ br(__ LO, tail);
    {
      Label again;
      __ bind(again);
      __ stpq(v0, v0, Address(dest));
      __ stpq(v0, v0, Address(dest, 32));

      __ subs(count, count, 64);
      __ add(dest, dest, 64);
      __ br(__ HS, again);
    }

    __ bind(tail);
    // The count of bytes is off by 64, but we don't need to correct
    // it because we're only going to use the least-significant few
    // count bits from here on.
    // __ add(count, count, 64);

    // Fill the remainder (< 64 bytes) by testing each bit of count and
    // emitting one store of the matching width, largest first.
    {
      Label dont;
      __ tbz(count, exact_log2(32), dont);
      __ stpq(v0, v0, __ post(dest, 32));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(16), dont);
      __ strq(v0, __ post(dest, 16));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(8), dont);
      __ strd(v0, __ post(dest, 8));
      __ bind(dont);
    }

    // Done if there is no sub-8-byte remainder.
    Label finished;
    __ tst(count, 7);
    __ br(__ EQ, finished);

    {
      Label dont;
      __ tbz(count, exact_log2(4), dont);
      __ strs(v0, __ post(dest, 4));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(2), dont);
      // Widen the byte value to 16 bits for the halfword store.
      __ bfi(value, value, 8, 8);
      __ strh(value, __ post(dest, 2));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(1), dont);
      __ strb(value, Address(dest));
      __ bind(dont);
    }

    __ bind(finished);
    __ leave();
    __ ret(lr);

    return start;
  }
 2707 
 2708   address generate_data_cache_writeback() {
 2709     const Register line        = c_rarg0;  // address of line to write back
 2710 
 2711     __ align(CodeEntryAlignment);
 2712 
 2713     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 2714     StubCodeMark mark(this, stub_id);
 2715 
 2716     address start = __ pc();
 2717     __ enter();
 2718     __ cache_wb(Address(line, 0));
 2719     __ leave();
 2720     __ ret(lr);
 2721 
 2722     return start;
 2723   }
 2724 
  address generate_data_cache_writeback_sync() {
    const Register is_pre     = c_rarg0;  // pre or post sync (non-zero = pre)

    __ align(CodeEntryAlignment);

    StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
    StubCodeMark mark(this, stub_id);

    // pre wbsync is a no-op
    // post wbsync translates to an sfence

    Label skip;
    address start = __ pc();
    __ enter();
    // A non-zero argument requests a pre-sync, which needs no barrier.
    __ cbnz(is_pre, skip);
    __ cache_wbsync(false);
    __ bind(skip);
    __ leave();
    __ ret(lr);

    return start;
  }
 2747 
  void generate_arraycopy_stubs() {
    // Some copy stubs publish a normal entry and then a 2nd 'fallback'
    // entry immediately following their stack push. This can be used
    // as a post-push branch target for compatible stubs when they
    // identify a special case that can be handled by the fallback
    // stub, e.g. a disjoint copy stub may be used as a special case
    // fallback for its compatible conjoint copy stub.
    //
    // A no push entry is always returned in the following local and
    // then published by assigning to the appropriate entry field in
    // class StubRoutines. The entry value is then passed to the
    // generator for the compatible stub. That means the entry must be
    // listed when saving to/restoring from the AOT cache, ensuring
    // that the inter-stub jumps are noted at AOT-cache save and
    // relocated at AOT cache load.
    address nopush_entry;

    // generate the common exit first so later stubs can rely on it if
    // they want an UnsafeMemoryAccess exit non-local to the stub
    StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
    // register the stub as the default exit with class UnsafeMemoryAccess
    UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);

    // generate and publish aarch64-specific bulk copy routines first
    // so we can call them from other copy stubs
    StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
    StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);

    StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
    StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);

    StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
    StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);

    StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();

    //*** jbyte
    // Always need aligned and unaligned versions
    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
    // disjoint arrayof nopush entry is needed by conjoint copy
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);

    //*** jshort
    // Always need aligned and unaligned versions
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is used by generic/unsafe copy
    StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
    // disjoint arrayof nopush entry is needed by conjoint copy
    StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);

    //*** jint
    // Aligned versions
    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
    // disjoint arrayof nopush entry is needed by conjoint copy
    StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
    // jint_arraycopy_nopush always points to the unaligned version
    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
    StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_jint_arraycopy_nopush = nopush_entry;

    //*** jlong
    // It is always aligned
    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
    // disjoint arrayof nopush entry is needed by conjoint copy
    StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
    // disjoint normal/nopush and conjoint normal entries are not
    // generated since the arrayof versions are the same
    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;

    //*** oops
    {
      StubRoutines::_arrayof_oop_disjoint_arraycopy
        = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
      // disjoint arrayof nopush entry is needed by conjoint copy
      StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
      StubRoutines::_arrayof_oop_arraycopy
        = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
      // conjoint arrayof nopush entry is needed by generic/unsafe copy
      StubRoutines::_oop_arraycopy_nopush = nopush_entry;
      // Aligned versions without pre-barriers
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
        = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
      // disjoint arrayof+uninit nopush entry is needed by conjoint copy
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
      // note that we don't need a returned nopush entry because the
      // generic/unsafe copy does not cater for uninit arrays.
      StubRoutines::_arrayof_oop_arraycopy_uninit
        = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
    }

    // for oop copies reuse arrayof entries for non-arrayof cases
    StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
    StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
    StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
    // checkcast nopush entry is needed by generic copy
    StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
    // note that we don't need a returned nopush entry because the
    // generic copy does not cater for uninit arrays.
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);

    // unsafe arraycopy may fallback on conjoint stubs
    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                              StubRoutines::_jshort_arraycopy_nopush,
                                                              StubRoutines::_jint_arraycopy_nopush,
                                                              StubRoutines::_jlong_arraycopy_nopush);

    // generic arraycopy may fallback on conjoint stubs
    StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                               StubRoutines::_jshort_arraycopy_nopush,
                                                               StubRoutines::_jint_arraycopy_nopush,
                                                               StubRoutines::_oop_arraycopy_nopush,
                                                               StubRoutines::_jlong_arraycopy_nopush,
                                                               StubRoutines::_checkcast_arraycopy_nopush);

    StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
    StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
    StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
    StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
    StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
    StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
  }
 2896 
  // No platform-specific math stubs are generated here; calling this is
  // an error (Unimplemented() triggers a fatal error).
  void generate_math_stubs() { Unimplemented(); }
 2898 
 2899   // Arguments:
 2900   //
 2901   // Inputs:
 2902   //   c_rarg0   - source byte array address
 2903   //   c_rarg1   - destination byte array address
 2904   //   c_rarg2   - sessionKe (key) in little endian int array
 2905   //
 2906   address generate_aescrypt_encryptBlock() {
 2907     __ align(CodeEntryAlignment);
 2908     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 2909     StubCodeMark mark(this, stub_id);
 2910 
 2911     const Register from        = c_rarg0;  // source array address
 2912     const Register to          = c_rarg1;  // destination array address
 2913     const Register key         = c_rarg2;  // key array address
 2914     const Register keylen      = rscratch1;
 2915 
 2916     address start = __ pc();
 2917     __ enter();
 2918 
 2919     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2920 
 2921     __ aesenc_loadkeys(key, keylen);
 2922     __ aesecb_encrypt(from, to, keylen);
 2923 
 2924     __ mov(r0, 0);
 2925 
 2926     __ leave();
 2927     __ ret(lr);
 2928 
 2929     return start;
 2930   }
 2931 
 2932   // Arguments:
 2933   //
 2934   // Inputs:
 2935   //   c_rarg0   - source byte array address
 2936   //   c_rarg1   - destination byte array address
 2937   //   c_rarg2   - sessionKd (key) in little endian int array
 2938   //
 2939   address generate_aescrypt_decryptBlock() {
 2940     assert(UseAES, "need AES cryptographic extension support");
 2941     __ align(CodeEntryAlignment);
 2942     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 2943     StubCodeMark mark(this, stub_id);
 2944     Label L_doLast;
 2945 
 2946     const Register from        = c_rarg0;  // source array address
 2947     const Register to          = c_rarg1;  // destination array address
 2948     const Register key         = c_rarg2;  // key array address
 2949     const Register keylen      = rscratch1;
 2950 
 2951     address start = __ pc();
 2952     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2953 
 2954     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2955 
 2956     __ aesecb_decrypt(from, to, key, keylen);
 2957 
 2958     __ mov(r0, 0);
 2959 
 2960     __ leave();
 2961     __ ret(lr);
 2962 
 2963     return start;
 2964   }
 2965 
 2966   // Arguments:
 2967   //
 2968   // Inputs:
 2969   //   c_rarg0   - source byte array address
 2970   //   c_rarg1   - destination byte array address
 2971   //   c_rarg2   - sessionKe (key) in little endian int array
 2972   //   c_rarg3   - r vector byte array address
 2973   //   c_rarg4   - input length
 2974   //
 2975   // Output:
 2976   //   x0        - input length
 2977   //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
    StubCodeMark mark(this, stub_id);

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    address start = __ pc();

      __ enter();

      // Preserve the input length; it is returned in r0 at the end.
      __ movw(rscratch2, len_reg);

      // key points at the payload of the key int[]; step back to read the
      // arrayOop length, i.e. the number of ints in the expanded key.
      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

      // v0 holds the chaining value, seeded from the r vector.
      __ ld1(v0, __ T16B, rvec);

      // Dispatch on expanded key length (44/52/60 ints for AES-128/192/256).
      // NOTE: the condition flags set here are also reused by the branches
      // inside L_aes_loop below -- no intervening instruction modifies
      // flags (subw and cbnzw are not flag-setting).
      __ cmpw(keylen, 52);
      __ br(Assembler::CC, L_loadkeys_44);
      __ br(Assembler::EQ, L_loadkeys_52);

      // Load round keys into v17..v31; rev32 byte-swaps within each
      // 32-bit word (the key is supplied as a little-endian int array).
      __ ld1(v17, v18, __ T16B, __ post(key, 32));
      __ rev32(v17, __ T16B, v17);
      __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
      __ ld1(v19, v20, __ T16B, __ post(key, 32));
      __ rev32(v19, __ T16B, v19);
      __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
      __ rev32(v21, __ T16B, v21);
      __ rev32(v22, __ T16B, v22);
      __ rev32(v23, __ T16B, v23);
      __ rev32(v24, __ T16B, v24);
      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
      __ rev32(v25, __ T16B, v25);
      __ rev32(v26, __ T16B, v26);
      __ rev32(v27, __ T16B, v27);
      __ rev32(v28, __ T16B, v28);
      __ ld1(v29, v30, v31, __ T16B, key);
      __ rev32(v29, __ T16B, v29);
      __ rev32(v30, __ T16B, v30);
      __ rev32(v31, __ T16B, v31);

    __ BIND(L_aes_loop);
      // Load the next plaintext block and XOR it with the chaining value.
      __ ld1(v1, __ T16B, __ post(from, 16));
      __ eor(v0, __ T16B, v0, v1);

      // Flags still reflect cmpw(keylen, 52) above: skip the extra
      // rounds for shorter keys.
      __ br(Assembler::CC, L_rounds_44);
      __ br(Assembler::EQ, L_rounds_52);

      __ aese(v0, v17); __ aesmc(v0, v0);
      __ aese(v0, v18); __ aesmc(v0, v0);
    __ BIND(L_rounds_52);
      __ aese(v0, v19); __ aesmc(v0, v0);
      __ aese(v0, v20); __ aesmc(v0, v0);
    __ BIND(L_rounds_44);
      __ aese(v0, v21); __ aesmc(v0, v0);
      __ aese(v0, v22); __ aesmc(v0, v0);
      __ aese(v0, v23); __ aesmc(v0, v0);
      __ aese(v0, v24); __ aesmc(v0, v0);
      __ aese(v0, v25); __ aesmc(v0, v0);
      __ aese(v0, v26); __ aesmc(v0, v0);
      __ aese(v0, v27); __ aesmc(v0, v0);
      __ aese(v0, v28); __ aesmc(v0, v0);
      __ aese(v0, v29); __ aesmc(v0, v0);
      // Final round: no MixColumns, then add the last round key.
      __ aese(v0, v30);
      __ eor(v0, __ T16B, v0, v31);

      // Store the ciphertext block; it is also the next chaining value.
      __ st1(v0, __ T16B, __ post(to, 16));

      __ subw(len_reg, len_reg, 16);
      __ cbnzw(len_reg, L_aes_loop);

      // Leave the last ciphertext block in the r vector for the caller.
      __ st1(v0, __ T16B, rvec);

      // Return the original input length.
      __ mov(r0, rscratch2);

      __ leave();
      __ ret(lr);

      return start;
  }
 3070 
 3071   // Arguments:
 3072   //
 3073   // Inputs:
 3074   //   c_rarg0   - source byte array address
 3075   //   c_rarg1   - destination byte array address
 3076   //   c_rarg2   - sessionKd (key) in little endian int array
 3077   //   c_rarg3   - r vector byte array address
 3078   //   c_rarg4   - input length
 3079   //
 3080   // Output:
 3081   //   r0        - input length
 3082   //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
    StubCodeMark mark(this, stub_id);

    // Entry points into the key-load and round sequences for the three AES
    // key sizes: a 44/52/60-word expanded key selects 10/12/14 rounds.
    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    address start = __ pc();

      __ enter();

      // Save the input length: it is returned in r0 on exit.
      __ movw(rscratch2, len_reg);

      // Number of int words in the expanded key array.
      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

      // v2 carries the CBC chaining value (IV, then previous ciphertext block).
      __ ld1(v2, __ T16B, rvec);

      // The first round key (applied last when decrypting) goes in v31.
      __ ld1(v31, __ T16B, __ post(key, 16));
      __ rev32(v31, __ T16B, v31);

      // Load only as many round keys (into v17..v30) as the key size needs.
      // NOTE: the condition flags set here are deliberately kept live all the
      // way into L_aes_loop — nothing emitted below writes the flags (subw is
      // the non-flag-setting form), so the same CC/EQ tests select the round
      // count on every loop iteration.
      __ cmpw(keylen, 52);
      __ br(Assembler::CC, L_loadkeys_44);
      __ br(Assembler::EQ, L_loadkeys_52);

      __ ld1(v17, v18, __ T16B, __ post(key, 32));
      __ rev32(v17, __ T16B, v17);
      __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
      __ ld1(v19, v20, __ T16B, __ post(key, 32));
      __ rev32(v19, __ T16B, v19);
      __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
      __ rev32(v21, __ T16B, v21);
      __ rev32(v22, __ T16B, v22);
      __ rev32(v23, __ T16B, v23);
      __ rev32(v24, __ T16B, v24);
      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
      __ rev32(v25, __ T16B, v25);
      __ rev32(v26, __ T16B, v26);
      __ rev32(v27, __ T16B, v27);
      __ rev32(v28, __ T16B, v28);
      __ ld1(v29, v30, __ T16B, key);
      __ rev32(v29, __ T16B, v29);
      __ rev32(v30, __ T16B, v30);

    // Main loop: decrypt one 16-byte block per iteration.
    __ BIND(L_aes_loop);
      __ ld1(v0, __ T16B, __ post(from, 16));
      // Keep an untouched copy of the ciphertext block in v1; it becomes the
      // chaining value for the next block after v0 is decrypted in place.
      __ orr(v1, __ T16B, v0, v0);

      // Dispatch on the key size using the flags from cmpw(keylen, 52) above.
      __ br(Assembler::CC, L_rounds_44);
      __ br(Assembler::EQ, L_rounds_52);

      __ aesd(v0, v17); __ aesimc(v0, v0);
      __ aesd(v0, v18); __ aesimc(v0, v0);
    __ BIND(L_rounds_52);
      __ aesd(v0, v19); __ aesimc(v0, v0);
      __ aesd(v0, v20); __ aesimc(v0, v0);
    __ BIND(L_rounds_44);
      __ aesd(v0, v21); __ aesimc(v0, v0);
      __ aesd(v0, v22); __ aesimc(v0, v0);
      __ aesd(v0, v23); __ aesimc(v0, v0);
      __ aesd(v0, v24); __ aesimc(v0, v0);
      __ aesd(v0, v25); __ aesimc(v0, v0);
      __ aesd(v0, v26); __ aesimc(v0, v0);
      __ aesd(v0, v27); __ aesimc(v0, v0);
      __ aesd(v0, v28); __ aesimc(v0, v0);
      __ aesd(v0, v29); __ aesimc(v0, v0);
      __ aesd(v0, v30);
      // Final round key, then XOR with the previous ciphertext block (CBC).
      __ eor(v0, __ T16B, v0, v31);
      __ eor(v0, __ T16B, v0, v2);

      __ st1(v0, __ T16B, __ post(to, 16));
      // This block's ciphertext (saved in v1) chains into the next iteration.
      __ orr(v2, __ T16B, v1, v1);

      __ subw(len_reg, len_reg, 16);
      __ cbnzw(len_reg, L_aes_loop);

      // Persist the final chaining value for a subsequent call.
      __ st1(v2, __ T16B, rvec);

      // Return the original input length.
      __ mov(r0, rscratch2);

      __ leave();
      __ ret(lr);

    return start;
  }
 3179 
 3180   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3181   // Inputs: 128-bits. in is preserved.
 3182   // The least-significant 64-bit word is in the upper dword of each vector.
 3183   // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
 3184   // Output: result
  // Emits code for: result = in + inc, as a 128-bit big-endian addition of a
  // 64-bit increment, operating on the byte-reversed (rev64'd) representation
  // where the least-significant 64-bit word sits in the upper dword.
  // Carry from the low dword into the high dword is propagated via an
  // unsigned compare: cm(HI) yields an all-ones (-1) lane exactly when the
  // low-dword sum wrapped, and subtracting -1 from the MSD adds the carry.
  // Note: in MAY alias result (callers pass the same register for both);
  // only result/tmp/inc must be distinct.
  void be_add_128_64(FloatRegister result, FloatRegister in,
                     FloatRegister inc, FloatRegister tmp) {
    assert_different_registers(result, tmp, inc);

    __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
                                           // input
    __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
    __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
                                           // MSD == 0 (must be!) to LSD
    __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
  }
 3196 
 3197   // CTR AES crypt.
 3198   // Arguments:
 3199   //
 3200   // Inputs:
 3201   //   c_rarg0   - source byte array address
 3202   //   c_rarg1   - destination byte array address
 3203   //   c_rarg2   - sessionKe (key) in little endian int array
 3204   //   c_rarg3   - counter vector byte array address
 3205   //   c_rarg4   - input length
 3206   //   c_rarg5   - saved encryptedCounter start
 3207   //   c_rarg6   - saved used length
 3208   //
 3209   // Output:
 3210   //   r0       - input length
 3211   //
  address generate_counterMode_AESCrypt() {
    const Register in = c_rarg0;
    const Register out = c_rarg1;
    const Register key = c_rarg2;
    const Register counter = c_rarg3;
    const Register saved_len = c_rarg4, len = r10;
    const Register saved_encrypted_ctr = c_rarg5;
    const Register used_ptr = c_rarg6, used = r12;

    const Register offset = r7;
    const Register keylen = r11;

    const unsigned char block_size = 16;
    const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8. 8 gives slightly faster
    // performance with larger data sizes, but it also means that the
    // fast path isn't used until you have at least 8 blocks, and up
    // to 127 bytes of data will be executed on the slow path. For
    // that reason, and also so as not to blow away too much icache, 4
    // blocks seems like a sensible compromise.

    // Algorithm:
    //
    //    if (len == 0) {
    //        goto DONE;
    //    }
    //    int result = len;
    //    do {
    //        if (used >= blockSize) {
    //            if (len >= bulk_width * blockSize) {
    //                CTR_large_block();
    //                if (len == 0)
    //                    goto DONE;
    //            }
    //            for (;;) {
    //                16ByteVector v0 = counter;
    //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
    //                used = 0;
    //                if (len < blockSize)
    //                    break;    /* goto NEXT */
    //                16ByteVector v1 = load16Bytes(in, offset);
    //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(out, offset);
    //                used = blockSize;
    //                offset += blockSize;
    //                len -= blockSize;
    //                if (len == 0)
    //                    goto DONE;
    //            }
    //        }
    //      NEXT:
    //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
    //        len--;
    //    } while (len != 0);
    //  DONE:
    //    return result;
    //
    // CTR_large_block()
    //    Wide bulk encryption of whole blocks.

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
    StubCodeMark mark(this, stub_id);
    const address start = __ pc();
    __ enter();

    Label DONE, CTR_large_block, large_block_return;
    // 'used' counts how many bytes of the current encrypted counter block
    // have already been consumed by a previous partial call.
    __ ldrw(used, Address(used_ptr));
    __ cbzw(saved_len, DONE);

    __ mov(len, saved_len);
    __ mov(offset, 0);

    // Compute #rounds for AES based on the length of the key array
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesenc_loadkeys(key, keylen);

    {
      Label L_CTR_loop, NEXT;

      __ bind(L_CTR_loop);

      // If bytes remain in the current encrypted counter, consume them
      // one at a time at NEXT before generating a fresh counter block.
      __ cmp(used, block_size);
      __ br(__ LO, NEXT);

      // Maybe we have a lot of data
      __ subsw(rscratch1, len, bulk_width * block_size);
      __ br(__ HS, CTR_large_block);
      __ BIND(large_block_return);
      __ cbzw(len, DONE);

      // Setup the counter
      __ movi(v4, __ T4S, 0);
      __ movi(v5, __ T4S, 1);
      __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }

      // 128-bit big-endian increment
      __ ld1(v0, __ T16B, counter);
      __ rev64(v16, __ T16B, v0);
      be_add_128_64(v16, v16, v4, /*tmp*/v5);
      __ rev64(v16, __ T16B, v16);
      __ st1(v16, __ T16B, counter);
      // Previous counter value is in v0
      // v4 contains { 0, 1 }

      {
        // We have fewer than bulk_width blocks of data left. Encrypt
        // them one by one until there is less than a full block
        // remaining, being careful to save both the encrypted counter
        // and the counter.

        Label inner_loop;
        __ bind(inner_loop);
        // Counter to encrypt is in v0
        __ aesecb_encrypt(noreg, noreg, keylen);
        // Save the encrypted counter so a later partial call can keep
        // consuming it byte by byte.
        __ st1(v0, __ T16B, saved_encrypted_ctr);

        // Do we have a remaining full block?

        __ mov(used, 0);
        __ cmp(len, block_size);
        __ br(__ LO, NEXT);

        // Yes, we have a full block
        __ ldrq(v1, Address(in, offset));
        __ eor(v1, __ T16B, v1, v0);
        __ strq(v1, Address(out, offset));
        __ mov(used, block_size);
        __ add(offset, offset, block_size);

        __ subw(len, len, block_size);
        __ cbzw(len, DONE);

        // Increment the counter, store it back
        __ orr(v0, __ T16B, v16, v16);
        __ rev64(v16, __ T16B, v16);
        be_add_128_64(v16, v16, v4, /*tmp*/v5);
        __ rev64(v16, __ T16B, v16);
        __ st1(v16, __ T16B, counter); // Save the incremented counter back

        __ b(inner_loop);
      }

      __ BIND(NEXT);

      // Encrypt a single byte, and loop.
      // We expect this to be a rare event.
      __ ldrb(rscratch1, Address(in, offset));
      __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
      __ eor(rscratch1, rscratch1, rscratch2);
      __ strb(rscratch1, Address(out, offset));
      __ add(offset, offset, 1);
      __ add(used, used, 1);
      __ subw(len, len,1);
      __ cbnzw(len, L_CTR_loop);
    }

    __ bind(DONE);
    // Publish how much of the last encrypted counter block was consumed,
    // and return the original input length.
    __ strw(used, Address(used_ptr));
    __ mov(r0, saved_len);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    // Bulk encryption

    __ BIND (CTR_large_block);
    assert(bulk_width == 4 || bulk_width == 8, "must be");

    // v8..v15 are callee-saved on AArch64 (bottom 64 bits); spill the ones
    // this path clobbers. bulk_width == 4 only needs v8..v11.
    if (bulk_width == 8) {
      __ sub(sp, sp, 4 * 16);
      __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    }
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
    RegSet saved_regs = (RegSet::of(in, out, offset)
                         + RegSet::of(saved_encrypted_ctr, used_ptr, len));
    __ push(saved_regs, sp);
    // Round len down to a multiple of the bulk width; the remainder is
    // handled by the caller after large_block_return.
    __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
    __ add(in, in, offset);
    __ add(out, out, offset);

    // Keys should already be loaded into the correct registers

    __ ld1(v0, __ T16B, counter); // v0 contains the first counter
    __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter

    // AES/CTR loop
    {
      Label L_CTR_loop;
      __ BIND(L_CTR_loop);

      // Setup the counters
      __ movi(v8, __ T4S, 0);
      __ movi(v9, __ T4S, 1);
      __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }

      // Materialize bulk_width consecutive counter values in v0..v(bw-1).
      for (int i = 0; i < bulk_width; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        __ rev64(v0_ofs, __ T16B, v16);
        be_add_128_64(v16, v16, v8, /*tmp*/v9);
      }

      __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

      // Encrypt the counters
      __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);

      if (bulk_width == 8) {
        __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
      }

      // XOR the encrypted counters with the inputs
      for (int i = 0; i < bulk_width; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
        __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
      }

      // Write the encrypted data
      __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
      if (bulk_width == 8) {
        __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
      }

      __ subw(len, len, 16 * bulk_width);
      __ cbnzw(len, L_CTR_loop);
    }

    // Save the counter back where it goes
    __ rev64(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);

    __ pop(saved_regs, sp);

    // Restore the callee-saved vector registers spilled above.
    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    if (bulk_width == 8) {
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
    }

    // len was restored by the pop; account for the bytes processed here:
    // rscratch1 = bytes handled by the bulk loop.
    __ andr(rscratch1, len, -16 * bulk_width);
    __ sub(len, len, rscratch1);
    __ add(offset, offset, rscratch1);
    // The bulk loop leaves a fully-consumed encrypted counter behind.
    __ mov(used, 16);
    __ strw(used, Address(used_ptr));
    __ b(large_block_return);

    return start;
  }
 3462 
 3463   // Vector AES Galois Counter Mode implementation. Parameters:
 3464   //
 3465   // in = c_rarg0
 3466   // len = c_rarg1
 3467   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3468   // out = c_rarg3
 3469   // key = c_rarg4
 3470   // state = c_rarg5 - GHASH.state
 3471   // subkeyHtbl = c_rarg6 - powers of H
 3472   // counter = c_rarg7 - 16 bytes of CTR
 3473   // return - number of processed bytes
  address generate_galoisCounterMode_AESCrypt() {
    Label ghash_polynomial; // local data generated after code

   __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register in = c_rarg0;
    const Register len = c_rarg1;
    const Register ct = c_rarg2;
    const Register out = c_rarg3;
    // and updated with the incremented counter in the end

    const Register key = c_rarg4;
    const Register state = c_rarg5;

    const Register subkeyHtbl = c_rarg6;

    const Register counter = c_rarg7;

    const Register keylen = r10;
    // Save state before entering routine
    // v8..v15 are callee-saved (bottom 64 bits); this stub uses all of them.
    __ sub(sp, sp, 4 * 16);
    __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));

    // __ andr(len, len, -512);
    // Only process a multiple of 8 blocks; the caller handles the tail.
    __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
    // Stash the (rounded) byte count; it is reloaded for the GHASH pass and
    // popped into r0 as the return value.
    __ str(len, __ pre(sp, -2 * wordSize));

    Label DONE;
    __ cbz(len, DONE);

    // Compute #rounds for AES based on the length of the key array
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesenc_loadkeys(key, keylen);
    __ ld1(v0, __ T16B, counter); // v0 contains the first counter
    __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter

    // AES/CTR loop
    {
      Label L_CTR_loop;
      __ BIND(L_CTR_loop);

      // Setup the counters
      __ movi(v8, __ T4S, 0);
      __ movi(v9, __ T4S, 1);
      __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }

      // Materialize 8 consecutive counter values in v0..v7; the increment
      // (addv of { 0, 0, 0, 1 } on the rev32'd value) bumps the last
      // 32-bit word, as GCM's counter is incremented mod 2^32.
      assert(v0->encoding() < v8->encoding(), "");
      for (int i = v0->encoding(); i < v8->encoding(); i++) {
        FloatRegister f = as_FloatRegister(i);
        __ rev32(f, __ T16B, v16);
        __ addv(v16, __ T4S, v16, v8);
      }

      __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

      // Encrypt the counters
      __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);

      __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));

      // XOR the encrypted counters with the inputs
      for (int i = 0; i < 8; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
        __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
      }
      __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
      __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));

      __ subw(len, len, 16 * 8);
      __ cbnzw(len, L_CTR_loop);
    }

    // Write the incremented counter back for the next call.
    __ rev32(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);

    __ ldr(len, Address(sp));
    __ lsr(len, len, exact_log2(16));  // We want the count of blocks

    // GHASH/CTR loop
    __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
                                len, /*unrolls*/4);

#ifdef ASSERT
    // ghash_processBlocks_wide should have consumed every block.
    { Label L;
      __ cmp(len, (unsigned char)0);
      __ br(Assembler::EQ, L);
      __ stop("stubGenerator: abort");
      __ bind(L);
  }
#endif

  __ bind(DONE);
    // Return the number of bytes processed
    __ ldr(r0, __ post(sp, 2 * wordSize));

    // Restore the callee-saved vector registers spilled on entry.
    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    // bind label and generate polynomial data
    __ align(wordSize * 2);
    __ bind(ghash_polynomial);
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    return start;
  }
 3594 
 3595   class Cached64Bytes {
 3596   private:
 3597     MacroAssembler *_masm;
 3598     Register _regs[8];
 3599 
 3600   public:
 3601     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 3602       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size());
 3603       auto it = rs.begin();
 3604       for (auto &r: _regs) {
 3605         r = *it;
 3606         ++it;
 3607       }
 3608     }
 3609 
 3610     void gen_loads(Register base) {
 3611       for (int i = 0; i < 8; i += 2) {
 3612         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3613       }
 3614     }
 3615 
 3616     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3617     void extract_u32(Register dest, int i) {
 3618       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3619     }
 3620   };
 3621 
 3622   // Utility routines for md5.
 3623   // Clobbers r10 and r11.
  // Emits one MD5 round-1 step:
  //   r1 = r2 + rotl32(r1 + F(r2, r3, r4) + x[k] + t, s)
  // where F(b, c, d) = (b & c) | (~b & d), computed here with the
  // two-instruction identity F = ((c ^ d) & b) ^ d.
  // The independent moves/adds are interleaved for instruction-level
  // parallelism. rorw by (32 - s) implements the left rotate by s.
  void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ eorw(rscratch3, r3, r4);                  // c ^ d
    __ movw(rscratch2, t);                       // round constant
    __ andw(rscratch3, rscratch3, r2);           // (c ^ d) & b
    __ addw(rscratch4, r1, rscratch2);           // a + t
    reg_cache.extract_u32(rscratch1, k);         // x[k] from the cached block
    __ eorw(rscratch3, rscratch3, r4);           // F(b, c, d)
    __ addw(rscratch4, rscratch4, rscratch1);    // a + t + x[k]
    __ addw(rscratch3, rscratch3, rscratch4);    // a + F + x[k] + t
    __ rorw(rscratch2, rscratch3, 32 - s);       // rotl by s
    __ addw(r1, rscratch2, r2);                  // a = b + rotl(...)
  }
 3640 
  // Emits one MD5 round-2 step:
  //   r1 = r2 + rotl32(r1 + G(r2, r3, r4) + x[k] + t, s)
  // where G(b, c, d) = (b & d) | (c & ~d). The two masked terms select
  // disjoint bits (one under d, the other under ~d), so plain addition
  // implements the OR here.
  void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    reg_cache.extract_u32(rscratch1, k);         // x[k] from the cached block
    __ movw(rscratch2, t);                       // round constant
    __ addw(rscratch4, r1, rscratch2);           // a + t
    __ addw(rscratch4, rscratch4, rscratch1);    // a + t + x[k]
    __ bicw(rscratch2, r3, r4);                  // c & ~d
    __ andw(rscratch3, r2, r4);                  // b & d
    __ addw(rscratch2, rscratch2, rscratch4);    // (c & ~d) + a + t + x[k]
    __ addw(rscratch2, rscratch2, rscratch3);    // ... + (b & d)  == a + G + x[k] + t
    __ rorw(rscratch2, rscratch2, 32 - s);       // rotl by s
    __ addw(r1, rscratch2, r2);                  // a = b + rotl(...)
  }
 3657 
  // Emits one MD5 round-3 step:
  //   r1 = r2 + rotl32(r1 + H(r2, r3, r4) + x[k] + t, s)
  // where H(b, c, d) = b ^ c ^ d.
  void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ eorw(rscratch3, r3, r4);                  // c ^ d
    __ movw(rscratch2, t);                       // round constant
    __ addw(rscratch4, r1, rscratch2);           // a + t
    reg_cache.extract_u32(rscratch1, k);         // x[k] from the cached block
    __ eorw(rscratch3, rscratch3, r2);           // H(b, c, d) = b ^ c ^ d
    __ addw(rscratch4, rscratch4, rscratch1);    // a + t + x[k]
    __ addw(rscratch3, rscratch3, rscratch4);    // a + H + x[k] + t
    __ rorw(rscratch2, rscratch3, 32 - s);       // rotl by s
    __ addw(r1, rscratch2, r2);                  // a = b + rotl(...)
  }
 3673 
  // Emits one MD5 round-4 step:
  //   r1 = r2 + rotl32(r1 + I(r2, r3, r4) + x[k] + t, s)
  // where I(b, c, d) = c ^ (b | ~d), computed with ornw (OR-NOT) + eorw.
  void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ movw(rscratch3, t);                       // round constant
    __ ornw(rscratch2, r2, r4);                  // b | ~d
    __ addw(rscratch4, r1, rscratch3);           // a + t
    reg_cache.extract_u32(rscratch1, k);         // x[k] from the cached block
    __ eorw(rscratch3, rscratch2, r3);           // I(b, c, d) = c ^ (b | ~d)
    __ addw(rscratch4, rscratch4, rscratch1);    // a + t + x[k]
    __ addw(rscratch3, rscratch3, rscratch4);    // a + I + x[k] + t
    __ rorw(rscratch2, rscratch3, 32 - s);       // rotl by s
    __ addw(r1, rscratch2, r2);                  // a = b + rotl(...)
  }
 3689 
 3690   // Arguments:
 3691   //
 3692   // Inputs:
 3693   //   c_rarg0   - byte[]  source+offset
 3694   //   c_rarg1   - int[]   SHA.state
 3695   //   c_rarg2   - int     offset
 3696   //   c_rarg3   - int     limit
 3697   //
  // Generates the MD5 compression stub. The single-block variant compresses
  // one 64-byte block; the MB variant loops, advancing 'ofs' by 64 per block
  // until it passes 'limit', and returns the final ofs.
  address generate_md5_implCompress(StubId stub_id) {
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_md5_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_md5_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }
    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register buf       = c_rarg0;
    Register state     = c_rarg1;
    Register ofs       = c_rarg2;
    Register limit     = c_rarg3;
    Register a         = r4;
    Register b         = r5;
    Register c         = r6;
    Register d         = r7;
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    // The four 32-bit state words are kept packed in two 64-bit registers:
    // state_regs[0] = {a, b}, state_regs[1] = {c, d} (low word first).
    Register state_regs[2] = { r12, r13 };
    // r18 is the platform register (TLS) on AArch64 and must not be used.
    RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
    Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers

    __ push(saved_regs, sp);

    // Unpack a..d from the packed state words.
    __ ldp(state_regs[0], state_regs[1], Address(state));
    __ ubfx(a, state_regs[0],  0, 32);
    __ ubfx(b, state_regs[0], 32, 32);
    __ ubfx(c, state_regs[1],  0, 32);
    __ ubfx(d, state_regs[1], 32, 32);

    Label md5_loop;
    __ BIND(md5_loop);

    // Cache the whole 64-byte message block in GPRs for the rounds below.
    reg_cache.gen_loads(buf);

    // Round 1
    md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
    md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
    md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
    md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
    md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
    md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
    md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
    md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
    md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
    md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
    md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
    md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
    md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
    md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
    md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
    md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);

    // Round 2
    md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
    md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
    md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
    md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
    md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
    md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
    md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
    md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
    md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
    md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
    md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
    md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
    md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
    md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
    md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
    md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);

    // Round 3
    md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
    md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
    md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
    md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
    md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
    md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
    md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
    md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
    md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
    md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
    md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
    md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
    md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
    md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
    md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
    md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);

    // Round 4
    md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
    md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
    md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
    md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
    md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
    md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
    md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
    md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
    md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
    md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
    md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
    md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
    md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
    md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
    md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
    md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);

    // Add this block's result into the previous state. The w-form adds
    // read the low 32 bits of the packed registers; ubfx extracts the
    // high words for b and d.
    __ addw(a, state_regs[0], a);
    __ ubfx(rscratch2, state_regs[0], 32, 32);
    __ addw(b, rscratch2, b);
    __ addw(c, state_regs[1], c);
    __ ubfx(rscratch4, state_regs[1], 32, 32);
    __ addw(d, rscratch4, d);

    // Re-pack a..d into the two state words.
    __ orr(state_regs[0], a, b, Assembler::LSL, 32);
    __ orr(state_regs[1], c, d, Assembler::LSL, 32);

    if (multi_block) {
      __ add(buf, buf, 64);
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, md5_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // write hash values back in the correct order
    __ stp(state_regs[0], state_regs[1], Address(state));

    __ pop(saved_regs, sp);

    __ ret(lr);

    return start;
  }
 3842 
  // SHA-1 block compression using the ARMv8 SHA1 crypto extension
  // (sha1c/sha1p/sha1m/sha1h/sha1su0/sha1su1).
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  // The multi-block variant loops over 64-byte chunks while ofs <= limit
  // and returns the updated offset in c_rarg0.
  //
  address generate_sha1_implCompress(StubId stub_id) {
    // Decode the stub id into the single-block / multi-block flavour.
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha1_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha1_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label keys;
    Label sha1_loop;

    // load the keys into v0..v3
    // (the four SHA-1 round constants, each replicated across all 4 lanes;
    // they are emitted inline after the code, at the `keys` label)
    __ adr(rscratch1, keys);
    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load 5 words state into v6, v7
    // (a,b,c,d in v6; e in lane 0 of v7)
    __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));


    __ BIND(sha1_loop);
    // load 64 bytes of data into v16..v19
    // (post-increment buf only in the multi-block variant)
    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
    // byte-swap each 32-bit word: SHA-1 input is big-endian
    __ rev32(v16, __ T16B, v16);
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ rev32(v19, __ T16B, v19);

    // do the sha1
    // pre-add the key for the first quad-round; copy working state to v20
    __ addv(v4, __ T4S, v16, v0);
    __ orr(v20, __ T16B, v6, v6);

    FloatRegister d0 = v16;
    FloatRegister d1 = v17;
    FloatRegister d2 = v18;
    FloatRegister d3 = v19;

    // 20 iterations, each performing 4 of the 80 SHA-1 rounds.
    for (int round = 0; round < 20; round++) {
      // ping-pong temporaries between even and odd iterations
      FloatRegister tmp1 = (round & 1) ? v4 : v5;
      FloatRegister tmp2 = (round & 1) ? v21 : v22;
      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
      FloatRegister tmp4 = (round & 1) ? v5 : v4;
      // key boundaries are 4/9/14 (not 5/10/15) because the key-add for the
      // next quad-round is issued one iteration ahead (see addv below and
      // the pre-loop addv above)
      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));

      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);   // message schedule expansion
      if (round < 19) __ addv(tmp1, __ T4S, d1, key);   // key-add for next iteration
      __ sha1h(tmp2, __ T4S, v20);
      // choose/parity/majority functions per the SHA-1 round schedule
      if (round < 5)
        __ sha1c(v20, __ T4S, tmp3, tmp4);
      else if (round < 10 || round >= 15)
        __ sha1p(v20, __ T4S, tmp3, tmp4);
      else
        __ sha1m(v20, __ T4S, tmp3, tmp4);
      if (round < 16) __ sha1su1(d0, __ T4S, d3);       // finish schedule expansion

      // rotate the schedule registers
      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    // add this block's result into the running state
    __ addv(v7, __ T2S, v7, v21);
    __ addv(v6, __ T4S, v6, v20);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // store the 5-word state back
    __ strq(v6, Address(state, 0));
    __ strs(v7, Address(state, 16));

    __ ret(lr);

    // round constants, loaded via ld4r above
    __ bind(keys);
    __ emit_int32(0x5a827999);
    __ emit_int32(0x6ed9eba1);
    __ emit_int32(0x8f1bbcdc);
    __ emit_int32(0xca62c1d6);

    return start;
  }
 3946 
 3947 
  // SHA-256 block compression using the ARMv8 SHA2 crypto extension
  // (sha256h/sha256h2/sha256su0/sha256su1).
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  // The multi-block variant loops over 64-byte chunks while ofs <= limit
  // and returns the updated offset in c_rarg0.
  //
  address generate_sha256_implCompress(StubId stub_id) {
    // Decode the stub id into the single-block / multi-block flavour.
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha256_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha256_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }

    // The 64 SHA-256 round constants (K), FIPS 180-4.
    static const uint32_t round_consts[64] = {
      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    // NOTE: label name kept from the sha1 stub; this is the sha256 block loop.
    Label sha1_loop;

    // v8..v11 are callee-saved; spill them before use as data registers
    __ stpd(v8, v9, __ pre(sp, -32));
    __ stpd(v10, v11, Address(sp, 16));

// dga == v0
// dgb == v1
// dg0 == v2
// dg1 == v3
// dg2 == v4
// t0 == v6
// t1 == v7

    // load 16 keys to v16..v31
    __ lea(rscratch1, ExternalAddress((address)round_consts));
    __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
    __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
    __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
    __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);

    // load 8 words (256 bits) state
    __ ldpq(v0, v1, state);

    __ BIND(sha1_loop);
    // load 64 bytes of data into v8..v11
    // (post-increment buf only in the multi-block variant)
    __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
    // byte-swap each 32-bit word: SHA-256 input is big-endian
    __ rev32(v8, __ T16B, v8);
    __ rev32(v9, __ T16B, v9);
    __ rev32(v10, __ T16B, v10);
    __ rev32(v11, __ T16B, v11);

    // pre-add the first key block; copy state into working regs v2/v3
    __ addv(v6, __ T4S, v8, v16);
    __ orr(v2, __ T16B, v0, v0);
    __ orr(v3, __ T16B, v1, v1);

    FloatRegister d0 = v8;
    FloatRegister d1 = v9;
    FloatRegister d2 = v10;
    FloatRegister d3 = v11;


    // 16 iterations, each performing 4 of the 64 SHA-256 rounds.
    for (int round = 0; round < 16; round++) {
      // ping-pong temporaries between even and odd iterations
      FloatRegister tmp1 = (round & 1) ? v6 : v7;
      FloatRegister tmp2 = (round & 1) ? v7 : v6;
      FloatRegister tmp3 = (round & 1) ? v2 : v4;
      FloatRegister tmp4 = (round & 1) ? v4 : v2;

      if (round < 12) __ sha256su0(d0, __ T4S, d1);      // message schedule expansion
       __ orr(v4, __ T16B, v2, v2);                      // snapshot v2 before sha256h updates it
      if (round < 15)
        // key-add for the next iteration; keys live in v16..v31,
        // so round+17 selects the (round+1)-th key register
        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
      __ sha256h(v2, __ T4S, v3, tmp2);
      __ sha256h2(v3, __ T4S, v4, tmp2);
      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);  // finish schedule expansion

      // rotate the schedule registers
      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    // add this block's result into the running state
    __ addv(v0, __ T4S, v0, v2);
    __ addv(v1, __ T4S, v1, v3);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // restore callee-saved registers
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 32));

    // store the 8-word state back
    __ stpq(v0, v1, state);

    __ ret(lr);

    return start;
  }
 4075 
  // Double rounds for sha512.
  //
  // Emits one "double round" (two of the 80 SHA-512 rounds) using the
  // ARMv8.2 SHA512 crypto extension.
  //
  //   dr          - double-round index, 0..39
  //   vi0..vi4    - working-state registers for this round (callers rotate
  //                 the register assignment between successive calls)
  //   vrc0        - register holding this round's pre-loaded round constants
  //   vrc1        - register the next round constants are streamed into
  //                 (loaded from [rscratch2] while dr < 36; the last four
  //                 double rounds reuse constants already in registers)
  //   vin0..vin4  - message-schedule registers; while dr < 32 the schedule
  //                 is still being extended via sha512su0/sha512su1
  //
  // Clobbers v5, v6, v7 as scratch and post-increments rscratch2.
  void sha512_dround(int dr,
                     FloatRegister vi0, FloatRegister vi1,
                     FloatRegister vi2, FloatRegister vi3,
                     FloatRegister vi4, FloatRegister vrc0,
                     FloatRegister vrc1, FloatRegister vin0,
                     FloatRegister vin1, FloatRegister vin2,
                     FloatRegister vin3, FloatRegister vin4) {
      if (dr < 36) {
        // stream in the next pair of round constants
        __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
      }
      __ addv(v5, __ T2D, vrc0, vin0);          // w + k for this double round
      __ ext(v6, __ T16B, vi2, vi3, 8);
      __ ext(v5, __ T16B, v5, v5, 8);
      __ ext(v7, __ T16B, vi1, vi2, 8);
      __ addv(vi3, __ T2D, vi3, v5);
      if (dr < 32) {
        // extend the message schedule (not needed for the last 8 double rounds)
        __ ext(v5, __ T16B, vin3, vin4, 8);
        __ sha512su0(vin0, __ T2D, vin1);
      }
      __ sha512h(vi3, __ T2D, v6, v7);
      if (dr < 32) {
        __ sha512su1(vin0, __ T2D, vin2, v5);
      }
      __ addv(vi4, __ T2D, vi1, vi3);
      __ sha512h2(vi3, __ T2D, vi1, vi0);
  }
 4103 
  // SHA-512 block compression using the ARMv8.2 SHA512 crypto extension
  // (via sha512_dround above).
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  // The multi-block variant loops over 128-byte chunks while ofs <= limit
  // and returns the updated offset in c_rarg0. Note that buf is advanced
  // by the post-incrementing loads in both variants.
  //
  address generate_sha512_implCompress(StubId stub_id) {
    // Decode the stub id into the single-block / multi-block flavour.
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha512_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha512_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }

    // The 80 SHA-512 round constants (K), FIPS 180-4.
    static const uint64_t round_consts[80] = {
      0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
      0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
      0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
      0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
      0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
      0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
      0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
      0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
      0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
      0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
      0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
      0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
      0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
      0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
      0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
      0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
      0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
      0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
      0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
      0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
      0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
      0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
      0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
      0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
      0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
      0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
      0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
    };

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    // v8..v15 are callee-saved; spill before using them for data
    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    Label sha512_loop;

    // load state (8 x 64-bit words into v8..v11)
    __ ld1(v8, v9, v10, v11, __ T2D, state);

    // load first 4 round constants
    __ lea(rscratch1, ExternalAddress((address)round_consts));
    __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));

    __ BIND(sha512_loop);
    // load 128B of data into v12..v19
    __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
    __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
    // byte-swap each 64-bit word: SHA-512 input is big-endian
    __ rev64(v12, __ T16B, v12);
    __ rev64(v13, __ T16B, v13);
    __ rev64(v14, __ T16B, v14);
    __ rev64(v15, __ T16B, v15);
    __ rev64(v16, __ T16B, v16);
    __ rev64(v17, __ T16B, v17);
    __ rev64(v18, __ T16B, v18);
    __ rev64(v19, __ T16B, v19);

    // rscratch2 walks the remaining round constants inside sha512_dround;
    // rscratch1 stays at round_consts[8] for the next block iteration
    __ mov(rscratch2, rscratch1);

    // copy state into the working registers v0..v3
    __ mov(v0, __ T16B, v8);
    __ mov(v1, __ T16B, v9);
    __ mov(v2, __ T16B, v10);
    __ mov(v3, __ T16B, v11);

    // 40 double rounds == 80 SHA-512 rounds. The working-state and
    // message-schedule registers are rotated between calls; from double
    // round 32 on the schedule is complete, so the vin1..vin4 slots are
    // don't-cares (passed as v0).
    sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
    sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
    sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
    sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
    sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
    sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
    sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
    sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
    sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
    sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
    sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
    sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);

    // add this block's result into the running state
    __ addv(v8, __ T2D, v8, v0);
    __ addv(v9, __ T2D, v9, v1);
    __ addv(v10, __ T2D, v10, v2);
    __ addv(v11, __ T2D, v11, v3);

    if (multi_block) {
      __ add(ofs, ofs, 128);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha512_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // store the 8-word state back
    __ st1(v8, v9, v10, v11, __ T2D, state);

    // restore callee-saved registers
    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    return start;
  }
 4263 
  // Execute one round of keccak of two computations in parallel.
  // One of the states should be loaded into the lower halves of
  // the vector registers v0-v24, the other should be loaded into
  // the upper halves of those registers. The ld1r instruction loads
  // the round constant into both halves of register v31.
  // Intermediate results c0...c5 and d0...d5 are computed
  // in registers v25...v30.
  // All vector instructions that are used operate on both register
  // halves in parallel.
  // If only a single computation is needed, one can only load the lower halves.
  //
  // rscratch1 must point at the next round constant on entry; it is
  // post-incremented by 8 so consecutive calls walk the constant table.
  void keccak_round(Register rscratch1) {
  // Theta: column parities c0..c4
  __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
  __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
  __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
  __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
  __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
  __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
  __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
  __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22

  // Theta: d values
  __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
  __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
  __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
  __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
  __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)

  // Theta + Rho + Pi: xor in the d values and rotate; primed names (a10'
  // etc.) are lanes held in a temporary register until Chi below
  __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
  __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol(a6^d1), 44)
  __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
  __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
  __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
  __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
  __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
  __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
  __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
  __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
  __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
  __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
  __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
  __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
  __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
  __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
  __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
  __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
  __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
  __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
  __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
  __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
  __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
  __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)

  // Chi: a ^= ~b & c, row by row
  __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
  __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
  __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
  __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
  __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')

  // load the round constant early, interleaved with the Chi computations
  __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]

  __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
  __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
  __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
  __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
  __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')

  __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
  __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
  __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
  __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
  __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')

  __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
  __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
  __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
  __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
  __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')

  __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
  __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
  __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
  __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
  __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')

  // Iota: fold the round constant into lane a0
  __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
  }
 4352 
  // SHA3 / SHAKE block absorption using the ARMv8.2 SHA3 crypto extension
  // (eor3/rax1/xar/bcax via keccak_round above). The variant (SHA3-224/256/
  // 384/512, SHAKE128/256) is selected at runtime from block_size.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - byte[]  SHA.state
  //   c_rarg2   - int     block_size
  //   c_rarg3   - int     offset
  //   c_rarg4   - int     limit
  //
  // The multi-block variant loops over block_size-byte chunks while
  // ofs <= limit and returns the updated offset in c_rarg0.
  //
  address generate_sha3_implCompress(StubId stub_id) {
    // Decode the stub id into the single-block / multi-block flavour.
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha3_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha3_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }

    // The 24 Keccak-f[1600] round constants, FIPS 202.
    static const uint64_t round_consts[24] = {
      0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
      0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
      0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
      0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
      0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
      0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
      0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
      0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
    };

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register buf           = c_rarg0;
    Register state         = c_rarg1;
    Register block_size    = c_rarg2;
    Register ofs           = c_rarg3;
    Register limit         = c_rarg4;

    Label sha3_loop, rounds24_loop;
    Label sha3_512_or_sha3_384, shake128;

    // v8..v15 are callee-saved; spill before using them for state lanes
    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    // load state: 25 x 64-bit lanes into the low halves of v0..v24
    __ add(rscratch1, state, 32);
    __ ld1(v0, v1, v2,  v3,  __ T1D, state);
    __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
    __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
    __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
    __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
    __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
    __ ld1(v24, __ T1D, rscratch1);

    __ BIND(sha3_loop);

    // 24 keccak rounds
    __ movw(rscratch2, 24);

    // load round_constants base
    __ lea(rscratch1, ExternalAddress((address) round_consts));

    // load input: absorb (xor) the first 56 bytes into lanes a0..a6;
    // every supported block size covers at least this much
    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
    __ eor(v0, __ T8B, v0, v25);
    __ eor(v1, __ T8B, v1, v26);
    __ eor(v2, __ T8B, v2, v27);
    __ eor(v3, __ T8B, v3, v28);
    __ eor(v4, __ T8B, v4, v29);
    __ eor(v5, __ T8B, v5, v30);
    __ eor(v6, __ T8B, v6, v31);

    // block_size == 72, SHA3-512; block_size == 104, SHA3-384
    // (bit 7 clear means block_size < 128)
    __ tbz(block_size, 7, sha3_512_or_sha3_384);

    // block_size >= 128: absorb bytes 56..135 into lanes a7..a16
    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
    __ eor(v7, __ T8B, v7, v25);
    __ eor(v8, __ T8B, v8, v26);
    __ eor(v9, __ T8B, v9, v27);
    __ eor(v10, __ T8B, v10, v28);
    __ eor(v11, __ T8B, v11, v29);
    __ eor(v12, __ T8B, v12, v30);
    __ eor(v13, __ T8B, v13, v31);

    __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
    __ eor(v14, __ T8B, v14, v25);
    __ eor(v15, __ T8B, v15, v26);
    __ eor(v16, __ T8B, v16, v27);

    // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
    __ andw(c_rarg5, block_size, 48);
    __ cbzw(c_rarg5, rounds24_loop);

    __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224: one more lane (a17)
    __ ldrd(v28, __ post(buf, 8));
    __ eor(v17, __ T8B, v17, v28);
    __ b(rounds24_loop);

    __ BIND(shake128);
    // four more lanes (a17..a20)
    __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
    __ eor(v17, __ T8B, v17, v28);
    __ eor(v18, __ T8B, v18, v29);
    __ eor(v19, __ T8B, v19, v30);
    __ eor(v20, __ T8B, v20, v31);
    __ b(rounds24_loop); // block_size == 168, SHAKE128

    __ BIND(sha3_512_or_sha3_384);
    // absorb bytes 56..71 into lanes a7, a8
    __ ld1(v25, v26, __ T8B, __ post(buf, 16));
    __ eor(v7, __ T8B, v7, v25);
    __ eor(v8, __ T8B, v8, v26);
    __ tbz(block_size, 5, rounds24_loop); // SHA3-512

    // SHA3-384: absorb bytes 72..103 into lanes a9..a12
    __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
    __ eor(v9,  __ T8B, v9,  v27);
    __ eor(v10, __ T8B, v10, v28);
    __ eor(v11, __ T8B, v11, v29);
    __ eor(v12, __ T8B, v12, v30);

    // permutation: 24 Keccak rounds, counted down in rscratch2
    __ BIND(rounds24_loop);
    __ subw(rscratch2, rscratch2, 1);

    keccak_round(rscratch1);

    __ cbnzw(rscratch2, rounds24_loop);

    if (multi_block) {
      __ add(ofs, ofs, block_size);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha3_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // store the 25 state lanes back
    __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
    __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
    __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
    __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
    __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
    __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
    __ st1(v24, __ T1D, state);

    // restore callee-saved registers
    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    return start;
  }
 4515 
  // Run the Keccak-f[1600] permutation over two independent states in
  // parallel: state0 occupies the low 64-bit halves of v0..v24, state1
  // the high halves (see keccak_round).
  //
  // Inputs:
  //   c_rarg0   - long[]  state0
  //   c_rarg1   - long[]  state1
  //
  // Returns 0 in r0.
  address generate_double_keccak() {
    // The 24 Keccak-f[1600] round constants, FIPS 202.
    static const uint64_t round_consts[24] = {
      0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
      0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
      0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
      0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
      0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
      0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
      0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
      0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
    };

    // Implements the double_keccak() method of the
    // sun.security.provider.SHA3Parallel class
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "double_keccak");
    address start = __ pc();
    __ enter();

    Register state0        = c_rarg0;
    Register state1        = c_rarg1;

    Label rounds24_loop;

    // save callee-saved registers
    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    // load states: state0 into lane 0 (low half) of v0..v24,
    // state1 into lane 1 (high half)
    __ add(rscratch1, state0, 32);
    __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
    __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
    __ ld1(v24, __ D, 0, rscratch1);
    __ add(rscratch1, state1, 32);
    __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
    __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
    __ ld1(v24, __ D, 1, rscratch1);

    // 24 keccak rounds
    __ movw(rscratch2, 24);

    // load round_constants base
    __ lea(rscratch1, ExternalAddress((address) round_consts));

    // permutation: both states are transformed by each call in parallel
    __ BIND(rounds24_loop);
    __ subw(rscratch2, rscratch2, 1);
    keccak_round(rscratch1);
    __ cbnzw(rscratch2, rounds24_loop);

    // store both states back, lane 0 to state0 and lane 1 to state1
    __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
    __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
    __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
    __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
    __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
    __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
    __ st1(v24, __ D, 0, state0);
    __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
    __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
    __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
    __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
    __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
    __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
    __ st1(v24, __ D, 1, state1);

    // restore callee-saved vector registers
    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
 4605 
 4606   // ChaCha20 block function.  This version parallelizes the 32-bit
 4607   // state elements on each of 16 vectors, producing 4 blocks of
 4608   // keystream at a time.
 4609   //
 4610   // state (int[16]) = c_rarg0
 4611   // keystream (byte[256]) = c_rarg1
 4612   // return - number of bytes of produced keystream (always 256)
 4613   //
 4614   // This implementation takes each 32-bit integer from the state
 4615   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4616   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4617   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4618   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4619   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4620   // operations represented by one QUARTERROUND function, we instead stack all
 4621   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4622   // and then do the same for the second set of 4 quarter rounds.  This removes
 4623   // some latency that would otherwise be incurred by waiting for an add to
 4624   // complete before performing an xor (which depends on the result of the
 4625   // add), etc. An adjustment happens between the first and second groups of 4
 4626   // quarter rounds, but this is done only in the inputs to the macro functions
 4627   // that generate the assembly instructions - these adjustments themselves are
 4628   // not part of the resulting assembly.
 4629   // The 4 registers v0-v3 are used during the quarter round operations as
 4630   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4631   // registers become the vectors involved in adding the start state back onto
 4632   // the post-QR working state.  After the adds are complete, each of the 16
 4633   // vectors write their first lane back to the keystream buffer, followed
 4634   // by the second lane from all vectors and so on.
  address generate_chacha20Block_blockpar() {
    Label L_twoRounds, L_cc20_const;
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_chacha20Block_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    int i, j;
    const Register state = c_rarg0;
    const Register keystream = c_rarg1;
    const Register loopCtr = r10;
    const Register tmpAddr = r11;
    const FloatRegister ctrAddOverlay = v28;
    const FloatRegister lrot8Tbl = v29;

    // Organize SIMD registers in an array that facilitates
    // putting repetitive opcodes into loop structures.  It is
    // important that each grouping of 4 registers is monotonically
    // increasing to support the requirements of multi-register
    // instructions (e.g. ld4r, st4, etc.)
    const FloatRegister workSt[16] = {
         v4,  v5,  v6,  v7, v16, v17, v18, v19,
        v20, v21, v22, v23, v24, v25, v26, v27
    };

    // Pull in constant data.  The first 16 bytes are the add overlay
    // which is applied to the vector holding the counter (state[12]).
    // The second 16 bytes is the index register for the 8-bit left
    // rotation tbl instruction.
    __ adr(tmpAddr, L_cc20_const);
    __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));

    // Load from memory and interlace across 16 SIMD registers,
    // With each word from memory being broadcast to all lanes of
    // each successive SIMD register.
    //      Addr(0) -> All lanes in workSt[i]
    //      Addr(4) -> All lanes workSt[i + 1], etc.
    __ mov(tmpAddr, state);
    for (i = 0; i < 16; i += 4) {
      __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
          __ post(tmpAddr, 16));
    }
    __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay

    // Before entering the loop, create 5 4-register arrays.  These
    // will hold the 4 registers that represent the a/b/c/d fields
    // in the quarter round operation.  For instance the "b" field
    // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
    // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
    // since it is part of a diagonal organization.  The aSet and scratch
    // register sets are defined at declaration time because they do not change
    // organization at any point during the 20-round processing.
    FloatRegister aSet[4] = { v4, v5, v6, v7 };
    FloatRegister bSet[4];
    FloatRegister cSet[4];
    FloatRegister dSet[4];
    FloatRegister scratch[4] = { v0, v1, v2, v3 };

    // Set up the 10 iteration loop and perform all 8 quarter round ops
    __ mov(loopCtr, 10);
    __ BIND(L_twoRounds);

    // Set to columnar organization and do the following 4 quarter-rounds:
    // QUARTERROUND(0, 4, 8, 12)
    // QUARTERROUND(1, 5, 9, 13)
    // QUARTERROUND(2, 6, 10, 14)
    // QUARTERROUND(3, 7, 11, 15)
    __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
    __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
    __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7

    // Set to diagonal organization and do the next 4 quarter-rounds:
    // QUARTERROUND(0, 5, 10, 15)
    // QUARTERROUND(1, 6, 11, 12)
    // QUARTERROUND(2, 7, 8, 13)
    // QUARTERROUND(3, 4, 9, 14)
    __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
    __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
    __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7

    // Decrement and iterate
    __ sub(loopCtr, loopCtr, 1);
    __ cbnz(loopCtr, L_twoRounds);

    __ mov(tmpAddr, state);

    // Add the starting state back to the post-loop keystream
    // state.  We read/interlace the state array from memory into
    // 4 registers similar to what we did in the beginning.  Then
    // add the counter overlay onto workSt[12] at the end.
    for (i = 0; i < 16; i += 4) {
      __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
      __ addv(workSt[i], __ T4S, workSt[i], v0);
      __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
      __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
      __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
    }
    __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay

    // Write working state into the keystream buffer.  This is accomplished
    // by taking the lane "i" from each of the four vectors and writing
    // it to consecutive 4-byte offsets, then post-incrementing by 16 and
    // repeating with the next 4 vectors until all 16 vectors have been used.
    // Then move to the next lane and repeat the process until all lanes have
    // been written.
    for (i = 0; i < 4; i++) {
      for (j = 0; j < 16; j += 4) {
        __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
            __ post(keystream, 16));
      }
    }

    __ mov(r0, 256);             // Return length of output keystream
    __ leave();
    __ ret(lr);

    // bind label and generate local constant data used by this stub
    // The constant data is broken into two 128-bit segments to be loaded
    // onto FloatRegisters.  The first 128 bits are a counter add overlay
    // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
    // The second 128-bits is a table constant used for 8-bit left rotations.
    __ BIND(L_cc20_const);
    __ emit_int64(0x0000000100000000UL);
    __ emit_int64(0x0000000300000002UL);
    __ emit_int64(0x0605040702010003UL);
    __ emit_int64(0x0E0D0C0F0A09080BUL);

    return start;
  }
 4797 
 4798   // Helpers to schedule parallel operation bundles across vector
 4799   // register sequences of size 2, 4 or 8.
 4800 
 4801   // Implement various primitive computations across vector sequences
 4802 
 4803   template<int N>
 4804   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4805                const VSeq<N>& v1, const VSeq<N>& v2) {
 4806     // output must not be constant
 4807     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4808     // output cannot overwrite pending inputs
 4809     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4810     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4811     for (int i = 0; i < N; i++) {
 4812       __ addv(v[i], T, v1[i], v2[i]);
 4813     }
 4814   }
 4815 
 4816   template<int N>
 4817   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4818                const VSeq<N>& v1, const VSeq<N>& v2) {
 4819     // output must not be constant
 4820     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4821     // output cannot overwrite pending inputs
 4822     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4823     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4824     for (int i = 0; i < N; i++) {
 4825       __ subv(v[i], T, v1[i], v2[i]);
 4826     }
 4827   }
 4828 
 4829   template<int N>
 4830   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4831                const VSeq<N>& v1, const VSeq<N>& v2) {
 4832     // output must not be constant
 4833     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4834     // output cannot overwrite pending inputs
 4835     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4836     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4837     for (int i = 0; i < N; i++) {
 4838       __ mulv(v[i], T, v1[i], v2[i]);
 4839     }
 4840   }
 4841 
 4842   template<int N>
 4843   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4844     // output must not be constant
 4845     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4846     // output cannot overwrite pending inputs
 4847     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4848     for (int i = 0; i < N; i++) {
 4849       __ negr(v[i], T, v1[i]);
 4850     }
 4851   }
 4852 
 4853   template<int N>
 4854   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4855                const VSeq<N>& v1, int shift) {
 4856     // output must not be constant
 4857     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4858     // output cannot overwrite pending inputs
 4859     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4860     for (int i = 0; i < N; i++) {
 4861       __ sshr(v[i], T, v1[i], shift);
 4862     }
 4863   }
 4864 
 4865   template<int N>
 4866   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4867     // output must not be constant
 4868     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4869     // output cannot overwrite pending inputs
 4870     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4871     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4872     for (int i = 0; i < N; i++) {
 4873       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4874     }
 4875   }
 4876 
 4877   template<int N>
 4878   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4879     // output must not be constant
 4880     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4881     // output cannot overwrite pending inputs
 4882     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4883     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4884     for (int i = 0; i < N; i++) {
 4885       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4886     }
 4887   }
 4888 
 4889   template<int N>
 4890   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4891     // output must not be constant
 4892     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4893     // output cannot overwrite pending inputs
 4894     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4895     for (int i = 0; i < N; i++) {
 4896       __ notr(v[i], __ T16B, v1[i]);
 4897     }
 4898   }
 4899 
 4900   template<int N>
 4901   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4902     // output must not be constant
 4903     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4904     // output cannot overwrite pending inputs
 4905     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4906     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4907     for (int i = 0; i < N; i++) {
 4908       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4909     }
 4910   }
 4911 
 4912   template<int N>
 4913   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4914     // output must not be constant
 4915     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4916     // output cannot overwrite pending inputs
 4917     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4918     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4919     for (int i = 0; i < N; i++) {
 4920       __ mlsv(v[i], T, v1[i], v2[i]);
 4921     }
 4922   }
 4923 
 4924   // load N/2 successive pairs of quadword values from memory in order
 4925   // into N successive vector registers of the sequence via the
 4926   // address supplied in base.
 4927   template<int N>
 4928   void vs_ldpq(const VSeq<N>& v, Register base) {
 4929     for (int i = 0; i < N; i += 2) {
 4930       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4931     }
 4932   }
 4933 
 4934   // load N/2 successive pairs of quadword values from memory in order
 4935   // into N vector registers of the sequence via the address supplied
 4936   // in base using post-increment addressing
 4937   template<int N>
 4938   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4939     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4940     for (int i = 0; i < N; i += 2) {
 4941       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4942     }
 4943   }
 4944 
 4945   // store N successive vector registers of the sequence into N/2
 4946   // successive pairs of quadword memory locations via the address
 4947   // supplied in base using post-increment addressing
 4948   template<int N>
 4949   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4950     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4951     for (int i = 0; i < N; i += 2) {
 4952       __ stpq(v[i], v[i+1], __ post(base, 32));
 4953     }
 4954   }
 4955 
 4956   // load N/2 pairs of quadword values from memory de-interleaved into
 4957   // N vector registers 2 at a time via the address supplied in base
 4958   // using post-increment addressing.
 4959   template<int N>
 4960   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4961     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4962     for (int i = 0; i < N; i += 2) {
 4963       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4964     }
 4965   }
 4966 
 4967   // store N vector registers interleaved into N/2 pairs of quadword
 4968   // memory locations via the address supplied in base using
 4969   // post-increment addressing.
 4970   template<int N>
 4971   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4972     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4973     for (int i = 0; i < N; i += 2) {
 4974       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4975     }
 4976   }
 4977 
 4978   // load N quadword values from memory de-interleaved into N vector
 4979   // registers 3 elements at a time via the address supplied in base.
 4980   template<int N>
 4981   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4982     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4983     for (int i = 0; i < N; i += 3) {
 4984       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4985     }
 4986   }
 4987 
 4988   // load N quadword values from memory de-interleaved into N vector
 4989   // registers 3 elements at a time via the address supplied in base
 4990   // using post-increment addressing.
 4991   template<int N>
 4992   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4993     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4994     for (int i = 0; i < N; i += 3) {
 4995       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4996     }
 4997   }
 4998 
  // load N/2 pairs of quadword values from memory into N vector
  // registers via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
  // offsets array
  template<int N>
  void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
    for (int i = 0; i < N/2; i++) {
      __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
    }
  }
 5009 
 5010   // store N vector registers into N/2 pairs of quadword memory
 5011   // locations via the address supplied in base with each pair indexed
 5012   // using the the start offset plus the corresponding entry in the
 5013   // offsets array
 5014   template<int N>
 5015   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 5016     for (int i = 0; i < N/2; i++) {
 5017       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5018     }
 5019   }
 5020 
  // load N single quadword values from memory into N vector registers
  // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
  // array
  template<int N>
  void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
                      int start, int (&offsets)[N]) {
    for (int i = 0; i < N; i++) {
      __ ldr(v[i], T, Address(base, start + offsets[i]));
    }
  }
 5032 
  // store N vector registers into N single quadword memory locations
  // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
  // array
  template<int N>
  void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
                      int start, int (&offsets)[N]) {
    for (int i = 0; i < N; i++) {
      __ str(v[i], T, Address(base, start + offsets[i]));
    }
  }
 5044 
  // load N/2 pairs of quadword values from memory de-interleaved into
  // N vector registers 2 at a time via the address supplied in base
  // with each pair indexed using the start offset plus the
  // corresponding entry in the offsets array.  tmp is clobbered with
  // the effective address of each pair (ld2 takes no offset form).
  template<int N>
  void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
                      Register tmp, int start, int (&offsets)[N/2]) {
    for (int i = 0; i < N/2; i++) {
      __ add(tmp, base, start + offsets[i]);
      __ ld2(v[2*i], v[2*i+1], T, tmp);
    }
  }
 5057 
  // store N vector registers 2 at a time interleaved into N/2 pairs
  // of quadword memory locations via the address supplied in base
  // with each pair indexed using the start offset plus the
  // corresponding entry in the offsets array.  tmp is clobbered with
  // the effective address of each pair (st2 takes no offset form).
  template<int N>
  void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
                      Register tmp, int start, int (&offsets)[N/2]) {
    for (int i = 0; i < N/2; i++) {
      __ add(tmp, base, start + offsets[i]);
      __ st2(v[2*i], v[2*i+1], T, tmp);
    }
  }
 5070 
 5071   // Helper routines for various flavours of Montgomery multiply
 5072 
  // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
  // multiplications in parallel
  //

  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 4x4S results or 4x8H results
  //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
  // Outputs: va - 4x4S or 4x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
  void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
                   Assembler::SIMD_Arrangement T,
                   const VSeq<4>& vtmp, const VSeq<2>& vq) {
    assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
    assert(!va.is_constant(), "output vector must identify 4 different registers");

    // schedule 4 streams of instructions across the vector sequences
    // (each step is issued for all 4 sequence slots before the next
    // step, hiding instruction latency within each step)
    for (int i = 0; i < 4; i++) {
      __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
      __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
    }

    for (int i = 0; i < 4; i++) {
      __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
    }

    for (int i = 0; i < 4; i++) {
      __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
    }

    for (int i = 0; i < 4; i++) {
      __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
    }
  }
 5127 
  // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
  // multiplications in parallel
  //

  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
  void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
                   Assembler::SIMD_Arrangement T,
                   const VSeq<2>& vtmp, const VSeq<2>& vq) {
    assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
    assert(!va.is_constant(), "output vector must identify 2 different registers");

    // schedule 2 streams of instructions across the vector sequences
    // (each step is issued for both sequence slots before the next
    // step, hiding instruction latency within each step)
    for (int i = 0; i < 2; i++) {
      __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
      __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
    }

    for (int i = 0; i < 2; i++) {
      __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
    }

    for (int i = 0; i < 2; i++) {
      __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
    }

    for (int i = 0; i < 2; i++) {
      __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
    }
  }
 5182 
  // Perform 16 16-bit Montgomery multiplications in parallel.
  // va, vb, vc, vtmp and vq follow the register-disjointness contract
  // documented on vs_montmul2.
  void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
                       const VSeq<2>& vtmp, const VSeq<2>& vq) {
    // Use the helper routine to schedule a 2x8H Montgomery multiply.
    // It will assert that the register use is valid
    vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
  }
 5190 
  // Perform 32 16-bit Montgomery multiplications in parallel.
  // va, vb, vc, vtmp and vq follow the register-disjointness contract
  // documented on vs_montmul4.
  void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
                       const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // Use the helper routine to schedule a 4x8H Montgomery multiply.
    // It will assert that the register use is valid
    vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
  }
 5198 
  // Perform 64 16-bit Montgomery multiplications in parallel.
  void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
                       const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // Schedule two successive 4x8H multiplies via the montmul helper
    // on the front and back halves of va, vb and vc. The helper will
    // assert that the register use has no overlap conflicts on each
    // individual call but we also need to ensure that the necessary
    // disjoint/equality constraints are met across both calls.

    // vb, vc, vtmp and vq must be disjoint. va must either be
    // disjoint from all other registers or equal vc

    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");

    // we multiply the front and back halves of each sequence 4 at a
    // time because
    //
    // 1) we are currently only able to get 4-way instruction
    // parallelism at best
    //
    // 2) we need registers for the constants in vq and temporary
    // scratch registers to hold intermediate results so vtmp can only
    // be a VSeq<4> which means we only have 4 scratch slots
    //
    // n.b. vtmp is deliberately reused for the second call; it is
    // documented as trashed by each vs_montmul4 call.

    vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
    vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
  }
 5238 
  // Perform 32 16-bit Montgomery multiplications combined with a
  // butterfly step: compute a = montmul(a1, c), then overwrite the
  // input pair so that a0' = a0 + a and a1' = a0 - a.
  // n.b. vc supplies the multiplier inputs and is clobbered with the
  // intermediate product; vtmp provides scratch registers and vq the
  // montmul constants.
  void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
                               const VSeq<4>& vc,
                               const VSeq<4>& vtmp,
                               const VSeq<2>& vq) {
    // compute a = montmul(a1, c)
    kyber_montmul32(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
    vs_subv(va1, __ T8H, va0, vc);
    //    and a0 = a0 + a
    vs_addv(va0, __ T8H, va0, vc);
  }
 5250 
  // Perform an add/sub butterfly step combined with 32 16-bit
  // Montgomery multiplications: compute c = a0 - a1 and a0' = a0 + a1,
  // then set a1' = montmul(c, b). This is the inverse-NTT counterpart
  // of kyber_montmul32_sub_add.
  // n.b. vtmp1 holds the difference c, vtmp2 provides montmul scratch
  // registers and vq the montmul constants.
  void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
                               const VSeq<4>& vb,
                               const VSeq<4>& vtmp1,
                               const VSeq<4>& vtmp2,
                               const VSeq<2>& vq) {
    // compute c = a0 - a1
    vs_subv(vtmp1, __ T8H, va0, va1);
    // output a0 = a0 + a1
    vs_addv(va0, __ T8H, va0, va1);
    // output a1 = b montmul c
    kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
  }
 5263 
  // Load 64 shorts (8 quadword registers of 8x16-bit lanes) from
  // memory at shorts, post-incrementing shorts past the loaded data.
  void load64shorts(const VSeq<8>& v, Register shorts) {
    vs_ldpq_post(v, shorts);
  }
 5267 
  // Load 32 shorts (4 quadword registers of 8x16-bit lanes) from
  // memory at shorts, post-incrementing shorts past the loaded data.
  void load32shorts(const VSeq<4>& v, Register shorts) {
    vs_ldpq_post(v, shorts);
  }
 5271 
 5272   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5273     vs_stpq_post(v, tmpAddr);
 5274   }
 5275 
  // Kyber NTT function.
  // Implements
  // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
  //
  // coeffs (short[256]) = c_rarg0
  // ntt_zetas (short[256]) = c_rarg1
  // Always returns 0 in r0.
  address generate_kyberNtt() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyberNtt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register coeffs = c_rarg0;
    const Register zetas = c_rarg1;

    const Register kyberConsts = r10;
    const Register tmpAddr = r11;

    VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
    VSeq<2> vq(30);                    // n.b. constants overlap vs3

    __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
    // load the montmul constants
    vs_ldpq(vq, kyberConsts);

    // Each level corresponds to an iteration of the outermost loop of the
    // Java method seilerNTT(int[] coeffs). There are some differences
    // from what is done in the seilerNTT() method, though:
    // 1. The computation is using 16-bit signed values, we do not convert them
    // to ints here.
    // 2. The zetas are delivered in a bigger array, 128 zetas are stored in
    // this array for each level, it is easier that way to fill up the vector
    // registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    // multiplications (this is because that way there should not be any
    // overflow during the inverse NTT computation), here we use R = 2^16 so
    // that we can use the 16-bit arithmetic in the vector unit.
    //
    // On each level, we fill up the vector registers in such a way that the
    // array elements that need to be multiplied by the zetas go into one
    // set of vector registers while the corresponding ones that don't need to
    // be multiplied, go into another set.
    // We can do 32 Montgomery multiplications in parallel, using 12 vector
    // registers interleaving the steps of 4 identical computations,
    // each done on 8 16-bit values per register.

    // At levels 0-3 the coefficients multiplied by or added/subtracted
    // to the zetas occur in discrete blocks whose size is some multiple
    // of 32.

    // level 0
    // first half: montmul the upper 128 coefficients by the zetas,
    // then butterfly them against the lower 128
    __ add(tmpAddr, coeffs, 256);
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 0);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 0);
    vs_stpq_post(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 256);
    vs_stpq_post(vs3, tmpAddr);
    // restore montmul constants
    vs_ldpq(vq, kyberConsts);
    // second half: tmpAddr was post-incremented above and now points
    // at coeffs + 384
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 128);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 128);
    store64shorts(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 384);
    store64shorts(vs3, tmpAddr);

    // level 1
    // restore montmul constants
    vs_ldpq(vq, kyberConsts);
    __ add(tmpAddr, coeffs, 128);
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 0);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 0);
    store64shorts(vs1, tmpAddr);
    // n.b. the first store advanced tmpAddr to coeffs + 128 which is
    // exactly where vs3 needs to go
    store64shorts(vs3, tmpAddr);
    vs_ldpq(vq, kyberConsts);
    __ add(tmpAddr, coeffs, 384);
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 256);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 256);
    store64shorts(vs1, tmpAddr);
    // n.b. tmpAddr now coeffs + 384, the target slot for vs3
    store64shorts(vs3, tmpAddr);

    // level 2
    vs_ldpq(vq, kyberConsts);
    int offsets1[4] = { 0, 32, 128, 160 };
    vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    // interleave stores of the sum and difference halves so results
    // land back in their block positions
    __ add(tmpAddr, coeffs, 0);
    vs_stpq_post(vs_front(vs1), tmpAddr);
    vs_stpq_post(vs_front(vs3), tmpAddr);
    vs_stpq_post(vs_back(vs1), tmpAddr);
    vs_stpq_post(vs_back(vs3), tmpAddr);
    vs_ldpq(vq, kyberConsts);
    // n.b. the post-incrementing stores above have advanced tmpAddr to
    // coeffs + 256
    vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 256);
    vs_stpq_post(vs_front(vs1), tmpAddr);
    vs_stpq_post(vs_front(vs3), tmpAddr);
    vs_stpq_post(vs_back(vs1), tmpAddr);
    vs_stpq_post(vs_back(vs3), tmpAddr);

    // level 3
    vs_ldpq(vq, kyberConsts);
    int offsets2[4] = { 0, 64, 128, 192 };
    vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    vs_stpq_indexed(vs1, coeffs, 0, offsets2);
    vs_stpq_indexed(vs3, coeffs, 32, offsets2);

    vs_ldpq(vq, kyberConsts);
    vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    vs_stpq_indexed(vs1, coeffs, 256, offsets2);
    vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);

    // level 4
    // At level 4 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.

    vs_ldpq(vq, kyberConsts);
    int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
    vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
    vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);

    vs_ldpq(vq, kyberConsts);
    vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
    vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);

    // level 5
    // At level 5 related coefficients occur in discrete blocks of size 8 so
    // need to be loaded interleaved using an ld2 operation with arrangement 2D.

    vs_ldpq(vq, kyberConsts);
    int offsets4[4] = { 0, 32, 64, 96 };
    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);

    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);

    // level 6
    // At level 6 related coefficients occur in discrete blocks of size 4 so
    // need to be loaded interleaved using an ld2 operation with arrangement 4S.

    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);

    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);

    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
 5513 
  // Kyber Inverse NTT function
  // Implements
  // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
  //
  // coeffs (short[256]) = c_rarg0
  // ntt_zetas (short[256]) = c_rarg1
  // Always returns 0 in r0.
  address generate_kyberInverseNtt() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register coeffs = c_rarg0;
    const Register zetas = c_rarg1;

    const Register kyberConsts = r10;
    const Register tmpAddr = r11;
    // NOTE(review): tmpAddr2 appears unused in this stub — confirm
    // before removing
    const Register tmpAddr2 = c_rarg2;

    VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
    VSeq<2> vq(30);                    // n.b. constants overlap vs3

    __ lea(kyberConsts,
             ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));

    // The levels here run in the reverse order to generate_kyberNtt:
    // we start with the smallest blocks (size 4) and work up to a
    // final butterfly across the two 128-coefficient halves, then
    // multiply everything by toMont(2^-n mod q).

    // level 0
    // At level 0 related coefficients occur in discrete blocks of size 4 so
    // need to be loaded interleaved using an ld2 operation with arrangement 4S.

    vs_ldpq(vq, kyberConsts);
    int offsets4[4] = { 0, 32, 64, 96 };
    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);

    // level 1
    // At level 1 related coefficients occur in discrete blocks of size 8 so
    // need to be loaded interleaved using an ld2 operation with arrangement 2D.

    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);

    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);

    // level 2
    // At level 2 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.

    int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
    vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
    vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);

    vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
    vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);

    // Barrett reduction at indexes where overflow may happen

    // load q and the multiplier for the Barrett reduction
    __ add(tmpAddr, kyberConsts, 16);
    vs_ldpq(vq, tmpAddr);

    VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
    VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
    VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
    // reduce the coefficients at the block start offsets:
    // x -= (x * mult >> 11) * q
    vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
    vs_sqdmulh(vs2, __ T8H, vs1, vq2);
    vs_sshr(vs2, __ T8H, vs2, 11);
    vs_mlsv(vs1, __ T8H, vs2, vq1);
    vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
    vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
    vs_sqdmulh(vs2, __ T8H, vs1, vq2);
    vs_sshr(vs2, __ T8H, vs2, 11);
    vs_mlsv(vs1, __ T8H, vs2, vq1);
    vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);

    // level 3
    // From level 3 upwards coefficients occur in discrete blocks whose size is
    // some multiple of 32 so can be loaded using ldpq and suitable indexes.

    int offsets2[4] = { 0, 64, 128, 192 };
    vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
    vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    vs_stpq_indexed(vs3, coeffs, 0, offsets2);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_stpq_indexed(vs2, coeffs, 32, offsets2);

    vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
    vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    vs_stpq_indexed(vs3, coeffs, 256, offsets2);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);

    // level 4

    int offsets1[4] = { 0, 32, 128, 160 };
    vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
    vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    vs_stpq_indexed(vs3, coeffs, 0, offsets1);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_stpq_indexed(vs2, coeffs, 64, offsets1);

    vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
    vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    vs_stpq_indexed(vs3, coeffs, 256, offsets1);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);

    // level 5

    __ add(tmpAddr, coeffs, 0);
    load64shorts(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 128);
    load64shorts(vs2, tmpAddr);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 0);
    store64shorts(vs3, tmpAddr);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 128);
    store64shorts(vs2, tmpAddr);

    // n.b. tmpAddr now points at coeffs + 256 after the post-increment
    load64shorts(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 384);
    load64shorts(vs2, tmpAddr);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 256);
    store64shorts(vs3, tmpAddr);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 384);
    store64shorts(vs2, tmpAddr);

    // Barrett reduction at indexes where overflow may happen

    // load q and the multiplier for the Barrett reduction
    __ add(tmpAddr, kyberConsts, 16);
    vs_ldpq(vq, tmpAddr);

    int offsets0[2] = { 0, 256 };
    // only the front half of vs1 is loaded and stored back; the back
    // half of the arithmetic below operates on don't-care values
    vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
    vs_sqdmulh(vs2, __ T8H, vs1, vq2);
    vs_sshr(vs2, __ T8H, vs2, 11);
    vs_mlsv(vs1, __ T8H, vs2, vq1);
    vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);

    // level 6

    __ add(tmpAddr, coeffs, 0);
    load64shorts(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 256);
    load64shorts(vs2, tmpAddr);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 0);
    store64shorts(vs3, tmpAddr);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 256);
    store64shorts(vs2, tmpAddr);

    __ add(tmpAddr, coeffs, 128);
    load64shorts(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 384);
    load64shorts(vs2, tmpAddr);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 128);
    store64shorts(vs3, tmpAddr);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 384);
    store64shorts(vs2, tmpAddr);

    // multiply by 2^-n

    // load toMont(2^-n mod q)
    __ add(tmpAddr, kyberConsts, 48);
    __ ldr(v29, __ Q, tmpAddr);

    vs_ldpq(vq, kyberConsts);
    __ add(tmpAddr, coeffs, 0);
    load64shorts(vs1, tmpAddr);
    kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
    __ add(tmpAddr, coeffs, 0);
    store64shorts(vs2, tmpAddr);

    // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so
    load64shorts(vs1, tmpAddr);
    kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
    __ add(tmpAddr, coeffs, 128);
    store64shorts(vs2, tmpAddr);

    // now tmpAddr contains coeffs + 256
    load64shorts(vs1, tmpAddr);
    kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
    __ add(tmpAddr, coeffs, 256);
    store64shorts(vs2, tmpAddr);

    // now tmpAddr contains coeffs + 384
    load64shorts(vs1, tmpAddr);
    kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
    __ add(tmpAddr, coeffs, 384);
    store64shorts(vs2, tmpAddr);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
 5797 
  // Kyber multiply polynomials in the NTT domain.
  // Implements
  // static int implKyberNttMult(
  //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
  //
  // result (short[256]) = c_rarg0
  // ntta (short[256]) = c_rarg1
  // nttb (short[256]) = c_rarg2
  // zetas (short[128]) = c_rarg3
  // Always returns 0 in r0.
  address generate_kyberNttMult() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyberNttMult_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register result = c_rarg0;
    const Register ntta = c_rarg1;
    const Register nttb = c_rarg2;
    const Register zetas = c_rarg3;

    const Register kyberConsts = r10;
    const Register limit = r11;        // end address of the result array

    VSeq<4> vs1(0), vs2(4);  // four 4x8H sequences for inputs/outputs/tmps
    VSeq<4> vs3(16), vs4(20);
    VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
    VSeq<2> vz(28);          // pair of zetas
    VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ

    __ lea(kyberConsts,
             ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));

    Label kyberNttMult_loop;

    // 256 shorts of output = 512 bytes; the loop below writes 64 bytes
    // per iteration via vs_st2_post
    __ add(limit, result, 512);

    // load q and qinv
    vs_ldpq(vq, kyberConsts);

    // load R^2 mod q (to convert back from Montgomery representation)
    __ add(kyberConsts, kyberConsts, 64);
    __ ldr(v27, __ Q, kyberConsts);

    __ BIND(kyberNttMult_loop);

    // load 16 zetas
    vs_ldpq_post(vz, zetas);

    // load 2 sets of 32 coefficients from the two input arrays
    // interleaved as shorts. i.e. pairs of shorts adjacent in memory
    // are striped across pairs of vector registers
    vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
    vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
    vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
    vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H

    // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
    // i.e. montmul the first and second halves of vs1 in order and
    // then with one sequence reversed storing the two results in vs3
    //
    // vs3[0] <- montmul(a0, b0)
    // vs3[1] <- montmul(a1, b1)
    // vs3[2] <- montmul(a0, b1)
    // vs3[3] <- montmul(a1, b0)
    kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
    kyber_montmul16(vs_back(vs3),
                    vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);

    // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
    // i.e. montmul the first and second halves of vs4 in order and
    // then with one sequence reversed storing the two results in vs1
    //
    // vs1[0] <- montmul(a2, b2)
    // vs1[1] <- montmul(a3, b3)
    // vs1[2] <- montmul(a2, b3)
    // vs1[3] <- montmul(a3, b2)
    kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
    kyber_montmul16(vs_back(vs1),
                    vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);

    // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
    // We can schedule two montmuls at a time if we use a suitable vector
    // sequence <vs3[1], vs1[1]>.
    int delta = vs1[1]->encoding() - vs3[1]->encoding();
    VSeq<2> vs5(vs3[1], delta);

    // vs3[1] <- montmul(montmul(a1, b1), z0)
    // vs1[1] <- montmul(montmul(a3, b3), z1)
    kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);

    // add results in pairs storing in vs3
    // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
    // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
    vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));

    // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
    // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
    vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));

    // vs1 <- montmul(vs3, montRSquareModQ)
    kyber_montmul32(vs1, vs3, vc, vs2, vq);

    // store back the two pairs of result vectors de-interleaved as 8H elements
    // i.e. storing each pairs of shorts striped across a register pair adjacent
    // in memory
    vs_st2_post(vs1, __ T8H, result);

    // loop until result reaches limit (result + 512 bytes)
    __ cmp(result, limit);
    __ br(Assembler::NE, kyberNttMult_loop);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
 5916 
  // Kyber add 2 polynomials.
  // Implements
  // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
  //
  // result (short[256]) = c_rarg0
  // a (short[256]) = c_rarg1
  // b (short[256]) = c_rarg2
  // Always returns 0 in r0.
  address generate_kyberAddPoly_2() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register result = c_rarg0;
    const Register a = c_rarg1;
    const Register b = c_rarg2;

    const Register kyberConsts = r11;

    // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
    // So, we can load, add and store the data in 3 groups of 11,
    // 11 and 10 at a time i.e. we need to map sets of 10 or 11
    // registers. A further constraint is that the mapping needs
    // to skip callee saves. So, we allocate the register
    // sequences using two 8 sequences, two 2 sequences and two
    // single registers.
    VSeq<8> vs1_1(0);
    VSeq<2> vs1_2(16);
    FloatRegister vs1_3 = v28;
    VSeq<8> vs2_1(18);
    VSeq<2> vs2_2(26);
    FloatRegister vs2_3 = v29;

    // two constant vector sequences (both stride 0, i.e. v31 repeated)
    VSeq<8> vc_1(31, 0);
    VSeq<2> vc_2(31, 0);

    FloatRegister vc_3 = v31;
    __ lea(kyberConsts,
             ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));

    __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
    // 3 iterations: the first two process 11 quadwords (88 shorts)
    // each, the last processes 10 (80 shorts), totalling 256 shorts
    for (int i = 0; i < 3; i++) {
      // load 80 or 88 values from a into vs1_1/2/3
      vs_ldpq_post(vs1_1, a);
      vs_ldpq_post(vs1_2, a);
      if (i < 2) {
        __ ldr(vs1_3, __ Q, __ post(a, 16));
      }
      // load 80 or 88 values from b into vs2_1/2/3
      vs_ldpq_post(vs2_1, b);
      vs_ldpq_post(vs2_2, b);
      if (i < 2) {
        __ ldr(vs2_3, __ Q, __ post(b, 16));
      }
      // sum 80 or 88 values across vs1 and vs2 into vs1
      vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
      vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
      if (i < 2) {
        __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
      }
      // add constant to all 80 or 88 results
      vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
      vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
      if (i < 2) {
        __ addv(vs1_3, __ T8H, vs1_3, vc_3);
      }
      // store 80 or 88 values
      vs_stpq_post(vs1_1, result);
      vs_stpq_post(vs1_2, result);
      if (i < 2) {
        __ str(vs1_3, __ Q, __ post(result, 16));
      }
    }

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
 6000 
  // Kyber add 3 polynomials.
  // Implements
  // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
  //
  // result (short[256]) = c_rarg0
  // a (short[256]) = c_rarg1
  // b (short[256]) = c_rarg2
  // c (short[256]) = c_rarg3
  address generate_kyberAddPoly_3() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register result = c_rarg0;
    const Register a = c_rarg1;
    const Register b = c_rarg2;
    const Register c = c_rarg3;

    const Register kyberConsts = r11;

    // As above we sum 256 sets of values in total i.e. 32 x 8H
    // quadwords.  So, we can load, add and store the data in 3
    // groups of 11, 11 and 10 at a time i.e. we need to map sets
    // of 10 or 11 registers. A further constraint is that the
    // mapping needs to skip callee saves. So, we allocate the
    // register sequences using two 8 sequences, two 2 sequences
    // and two single registers.
    VSeq<8> vs1_1(0);
    VSeq<2> vs1_2(16);
    FloatRegister vs1_3 = v28;
    VSeq<8> vs2_1(18);
    VSeq<2> vs2_2(26);
    FloatRegister vs2_3 = v29;

    // two constant vector sequences
    // n.b. vc_1, vc_2 and vc_3 all alias the single register v31
    // (stride 0), which holds the constant loaded below replicated
    // across all 8 short lanes
    VSeq<8> vc_1(31, 0);
    VSeq<2> vc_2(31, 0);

    FloatRegister vc_3 = v31;

    __ lea(kyberConsts,
             ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));

    __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
    for (int i = 0; i < 3; i++) {
      // load 80 or 88 values from a into vs1_1/2/3
      vs_ldpq_post(vs1_1, a);
      vs_ldpq_post(vs1_2, a);
      if (i < 2) {
        __ ldr(vs1_3, __ Q, __ post(a, 16));
      }
      // load 80 or 88 values from b into vs2_1/2/3
      vs_ldpq_post(vs2_1, b);
      vs_ldpq_post(vs2_2, b);
      if (i < 2) {
        __ ldr(vs2_3, __ Q, __ post(b, 16));
      }
      // sum 80 or 88 values across vs1 and vs2 into vs1
      vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
      vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
      if (i < 2) {
        __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
      }
      // load 80 or 88 values from c into vs2_1/2/3
      vs_ldpq_post(vs2_1, c);
      vs_ldpq_post(vs2_2, c);
      if (i < 2) {
        __ ldr(vs2_3, __ Q, __ post(c, 16));
      }
      // sum 80 or 88 values across vs1 and vs2 into vs1
      vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
      vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
      if (i < 2) {
        __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
      }
      // add constant to all 80 or 88 results
      vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
      vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
      if (i < 2) {
        __ addv(vs1_3, __ T8H, vs1_3, vc_3);
      }
      // store 80 or 88 values
      vs_stpq_post(vs1_1, result);
      vs_stpq_post(vs1_2, result);
      if (i < 2) {
        __ str(vs1_3, __ Q, __ post(result, 16));
      }
    }

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
 6099 
  // Kyber parse XOF output to polynomial coefficient candidates
  // or decodePoly(12, ...).
  // Implements
  // static int implKyber12To16(
  //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
  //
  // we assume that parsed and condensed are allocated such that for
  // n = (parsedLength + 63) / 64
  // n blocks of 96 bytes of input can be processed, i.e.
  // index + n * 96 <= condensed.length and
  // n * 64 <= parsed.length
  //
  // condensed (byte[]) = c_rarg0
  // condensedIndex = c_rarg1
  // parsed (short[]) = c_rarg2
  // parsedLength = c_rarg3
  address generate_kyber12To16() {
    Label L_F00, L_loop;

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyber12To16_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register condensed = c_rarg0;
    const Register condensedOffs = c_rarg1;
    const Register parsed = c_rarg2;
    const Register parsedLength = c_rarg3;

    const Register tmpAddr = r11;

    // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
    // quadwords so we need a 6 vector sequence for the inputs.
    // Parsing produces 64 shorts, employing two 8 vector
    // sequences to store and combine the intermediate data.
    VSeq<6> vin(24);
    VSeq<8> va(0), vb(16);

    // the mask constant is emitted after the stub code at L_F00
    __ adr(tmpAddr, L_F00);
    __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
    __ add(condensed, condensed, condensedOffs);

    __ BIND(L_loop);
    // load 96 (6 x 16B) byte values
    vs_ld3_post(vin, __ T16B, condensed);

    // The front half of sequence vin (vin[0], vin[1] and vin[2])
    // holds 48 (16x3) contiguous bytes from memory striped
    // horizontally across each of the 16 byte lanes. Equivalently,
    // that is 16 pairs of 12-bit integers. Likewise the back half
    // holds the next 48 bytes in the same arrangement.

    // Each vector in the front half can also be viewed as a vertical
    // strip across the 16 pairs of 12 bit integers. Each byte in
    // vin[0] stores the low 8 bits of the first int in a pair. Each
    // byte in vin[1] stores the high 4 bits of the first int and the
    // low 4 bits of the second int. Each byte in vin[2] stores the
    // high 8 bits of the second int. Likewise the vectors in second
    // half.

    // Converting the data to 16-bit shorts requires first of all
    // expanding each of the 6 x 16B vectors into 6 corresponding
    // pairs of 8H vectors. Mask, shift and add operations on the
    // resulting vector pairs can be used to combine 4 and 8 bit
    // parts of related 8H vector elements.
    //
    // The middle vectors (vin[2] and vin[5]) are actually expanded
    // twice, one copy manipulated to provide the lower 4 bits
    // belonging to the first short in a pair and another copy
    // manipulated to provide the higher 4 bits belonging to the
    // second short in a pair. This is why the vector sequences va
    // and vb used to hold the expanded 8H elements are of length 8.

    // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
    // n.b. target elements 2 and 3 duplicate elements 4 and 5
    __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
    __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
    __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
    __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
    __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
    __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);

    // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
    // and vb[4:5]
    __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
    __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
    __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
    __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
    __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
    __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);

    // shift lo byte of copy 1 of the middle stripe into the high byte
    __ shl(va[2], __ T8H, va[2], 8);
    __ shl(va[3], __ T8H, va[3], 8);
    __ shl(vb[2], __ T8H, vb[2], 8);
    __ shl(vb[3], __ T8H, vb[3], 8);

    // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
    // time pre-shifted by 4 to ensure top bits of input 12-bit int
    // are in bit positions [4..11].
    __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
    __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
    __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
    __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);

    // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
    // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
    // copy2
    __ andr(va[2], __ T16B, va[2], v31);
    __ andr(va[3], __ T16B, va[3], v31);
    __ ushr(va[4], __ T8H, va[4], 4);
    __ ushr(va[5], __ T8H, va[5], 4);
    __ andr(vb[2], __ T16B, vb[2], v31);
    __ andr(vb[3], __ T16B, vb[3], v31);
    __ ushr(vb[4], __ T8H, vb[4], 4);
    __ ushr(vb[5], __ T8H, vb[5], 4);

    // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
    // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
    // n.b. the ordering ensures: i) inputs are consumed before they
    // are overwritten ii) the order of 16-bit results across successive
    // pairs of vectors in va and then vb reflects the order of the
    // corresponding 12-bit inputs
    __ addv(va[0], __ T8H, va[0], va[2]);
    __ addv(va[2], __ T8H, va[1], va[3]);
    __ addv(va[1], __ T8H, va[4], va[6]);
    __ addv(va[3], __ T8H, va[5], va[7]);
    __ addv(vb[0], __ T8H, vb[0], vb[2]);
    __ addv(vb[2], __ T8H, vb[1], vb[3]);
    __ addv(vb[1], __ T8H, vb[4], vb[6]);
    __ addv(vb[3], __ T8H, vb[5], vb[7]);

    // store 64 results interleaved as shorts
    vs_st2_post(vs_front(va), __ T8H, parsed);
    vs_st2_post(vs_front(vb), __ T8H, parsed);

    // loop back while more than zero outputs remain to be produced
    __ sub(parsedLength, parsedLength, 64);
    __ cmp(parsedLength, (u1)0);
    __ br(Assembler::GT, L_loop);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    // bind label and generate constant data used by this stub
    __ BIND(L_F00);
    __ emit_int64(0x0f000f000f000f00);
    __ emit_int64(0x0f000f000f000f00);

    return start;
  }
 6252 
  // Kyber Barrett reduce function.
  // Implements
  // static int implKyberBarrettReduce(short[] coeffs) {}
  //
  // coeffs (short[256]) = c_rarg0
  address generate_kyberBarrettReduce() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register coeffs = c_rarg0;

    const Register kyberConsts = r10;
    const Register result = r11;

    // As above we process 256 sets of values in total i.e. 32 x
    // 8H quadwords. So, we can load, add and store the data in 3
    // groups of 11, 11 and 10 at a time i.e. we need to map sets
    // of 10 or 11 registers. A further constraint is that the
    // mapping needs to skip callee saves. So, we allocate the
    // register sequences using two 8 sequences, two 2 sequences
    // and two single registers.
    VSeq<8> vs1_1(0);
    VSeq<2> vs1_2(16);
    FloatRegister vs1_3 = v28;
    VSeq<8> vs2_1(18);
    VSeq<2> vs2_2(26);
    FloatRegister vs2_3 = v29;

    // we also need a pair of corresponding constant sequences
    // n.b. each trio of sequences aliases a single register
    // (stride 0) that is replicated across all uses

    VSeq<8> vc1_1(30, 0);
    VSeq<2> vc1_2(30, 0);
    FloatRegister vc1_3 = v30; // for kyber_q

    VSeq<8> vc2_1(31, 0);
    VSeq<2> vc2_2(31, 0);
    FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier

    // result tracks the output cursor; coeffs tracks the input cursor
    __ add(result, coeffs, 0);
    __ lea(kyberConsts,
             ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));

    // load q and the multiplier for the Barrett reduction
    __ add(kyberConsts, kyberConsts, 16);
    __ ldpq(vc1_3, vc2_3, kyberConsts);

    for (int i = 0; i < 3; i++) {
      // load 80 or 88 coefficients
      vs_ldpq_post(vs1_1, coeffs);
      vs_ldpq_post(vs1_2, coeffs);
      if (i < 2) {
        __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
      }

      // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
      vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
      vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
      if (i < 2) {
        __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
      }

      // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
      vs_sshr(vs2_1, __ T8H, vs2_1, 11);
      vs_sshr(vs2_2, __ T8H, vs2_2, 11);
      if (i < 2) {
        __ sshr(vs2_3, __ T8H, vs2_3, 11);
      }

      // vs1 <- vs1 - vs2 * kyber_q
      vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
      vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
      if (i < 2) {
        __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
      }

      // store 80 or 88 reduced coefficients
      vs_stpq_post(vs1_1, result);
      vs_stpq_post(vs1_2, result);
      if (i < 2) {
        __ str(vs1_3, __ Q, __ post(result, 16));
      }
    }

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
 6345 
 6346 
 6347   // Dilithium-specific montmul helper routines that generate parallel
 6348   // code for, respectively, a single 4x4s vector sequence montmul or
 6349   // two such multiplies in a row.
 6350 
  // Perform 16 32-bit Montgomery multiplications in parallel
  // i.e. va = va * vb mod MONT_Q, scheduled as a 4x4S multiply.
  // vtmp supplies 4 scratch registers and vq holds the constants
  // qInv and q.
  void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
                           const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // Use the helper routine to schedule a 4x4S Montgomery multiply.
    // It will assert that the register use is valid
    vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
  }
 6358 
  // Perform 2x16 32-bit Montgomery multiplications in parallel
  void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
                           const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // Schedule two successive 4x4S multiplies via the montmul helper
    // on the front and back halves of va, vb and vc. The helper will
    // assert that the register use has no overlap conflicts on each
    // individual call but we also need to ensure that the necessary
    // disjoint/equality constraints are met across both calls.

    // vb, vc, vtmp and vq must be disjoint. va must either be
    // disjoint from all other registers or equal vc

    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");

    // We multiply the front and back halves of each sequence 4 at a
    // time because
    //
    // 1) we are currently only able to get 4-way instruction
    // parallelism at best
    //
    // 2) we need registers for the constants in vq and temporary
    // scratch registers to hold intermediate results so vtmp can only
    // be a VSeq<4> which means we only have 4 scratch slots.

    vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
    vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
  }
 6398 
  // Perform combined montmul then add/sub on 4x4S vectors.
  // n.b. vc serves both as the montmul input and as the register
  // set receiving the product, so its original contents are
  // clobbered.
  void dilithium_montmul16_sub_add(
          const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
          const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // compute a = montmul(a1, c)
    dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
    vs_subv(va1, __ T4S, va0, vc);
    //    and a0 = a0 + a
    vs_addv(va0, __ T4S, va0, vc);
  }
 6410 
  // Perform combined add/sub then montmul on 4x4S vectors.
  // vtmp1 receives the difference and feeds the multiply; vtmp2
  // supplies the scratch registers for the multiply itself.
  void dilithium_sub_add_montmul16(
          const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
          const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
    // compute c = a0 - a1
    vs_subv(vtmp1, __ T4S, va0, va1);
    // output a0 = a0 + a1
    vs_addv(va0, __ T4S, va0, va1);
    // output a1 = b montmul c
    dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
  }
 6422 
  // At these levels, the indices that correspond to the 'j's (and 'j+l's)
  // in the Java implementation come in sequences of at least 8, so we
  // can use ldpq to collect the corresponding data into pairs of vector
  // registers.
  // We collect the coefficients corresponding to the 'j+l' indexes into
  // the vector registers v0-v7, the zetas into the vector registers v16-v23
  // then we do the (Montgomery) multiplications by the zetas in parallel
  // into v16-v23, load the coeffs corresponding to the 'j' indexes into
  // v0-v7, then do the additions into v24-v31 and the subtractions into
  // v0-v7 and finally save the results back to the coeffs array.
  void dilithiumNttLevel0_4(const Register dilithiumConsts,
    const Register coeffs, const Register zetas) {
    int c1 = 0;
    int c2 = 512;
    int startIncr;
    // don't use callee save registers v8 - v15
    VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
    VSeq<2> vq(30);                    // n.b. constants overlap vs3
    int offsets[4] = { 0, 32, 64, 96 };

    for (int level = 0; level < 5; level++) {
      int c1Start = c1;
      int c2Start = c2;
      if (level == 3) {
        offsets[1] = 32; // unchanged from the initial value
        offsets[2] = 128;
        offsets[3] = 160;
      } else if (level == 4) {
        offsets[1] = 64;
        offsets[2] = 128;
        offsets[3] = 192;
      }

      // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
      // time at 4 different offsets and multiply them in order by the
      // next set of input values. So we employ indexed load and store
      // pair instructions with arrangement 4S.
      for (int i = 0; i < 4; i++) {
        // reload q and qinv
        vs_ldpq(vq, dilithiumConsts); // qInv, q
        // load 8x4S coefficients via second start pos == c2
        vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
        // load next 8x4S inputs == b
        vs_ldpq_post(vs2, zetas);
        // compute a == c2 * b mod MONT_Q
        dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
        // load 8x4s coefficients via first start pos == c1
        vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
        // compute a1 =  c1 + a
        vs_addv(vs3, __ T4S, vs1, vs2);
        // compute a2 =  c1 - a
        vs_subv(vs1, __ T4S, vs1, vs2);
        // output a1 and a2
        vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
        vs_stpq_indexed(vs1, coeffs, c2Start, offsets);

        // advance the start positions by an amount that depends on
        // how far through the zeta sequence we have progressed
        int k = 4 * level + i;

        if (k > 7) {
          startIncr = 256;
        } else if (k == 5) {
          startIncr = 384;
        } else {
          startIncr = 128;
        }

        c1Start += startIncr;
        c2Start += startIncr;
      }

      // the gap between the 'j' and 'j+l' indexes halves each level
      c2 /= 2;
    }
  }
 6497 
  // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
  // Implements the method
  // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
  // of the Java class sun.security.provider
  //
  // coeffs (int[256]) = c_rarg0
  // zetas (int[256]) = c_rarg1
  address generate_dilithiumAlmostNtt() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register coeffs = c_rarg0;
    const Register zetas = c_rarg1;

    const Register tmpAddr = r9;
    const Register dilithiumConsts = r10;
    const Register result = r11;
    // don't use callee save registers v8 - v15
    VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
    VSeq<2> vq(30);                    // n.b. constants overlap vs3
    int offsets[4] = { 0, 32, 64, 96};
    int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
    int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
    __ add(result, coeffs, 0);
    __ lea(dilithiumConsts,
             ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));

    // Each level represents one iteration of the outer for loop of the Java version.

    // level 0-4
    dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);

    // level 5

    // At level 5 the coefficients we need to combine with the zetas
    // are grouped in memory in blocks of size 4. So, for both sets of
    // coefficients we load 4 adjacent values at 8 different offsets
    // using an indexed ldr with register variant Q and multiply them
    // in sequence order by the next set of inputs. Likewise we store
    // the results using an indexed str with register variant Q.
    for (int i = 0; i < 1024; i += 256) {
      // reload constants q, qinv each iteration as they get clobbered later
      vs_ldpq(vq, dilithiumConsts); // qInv, q
      // load 32 (8x4S) coefficients via first offsets = c1
      vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
      // load next 32 (8x4S) inputs = b
      vs_ldpq_post(vs2, zetas);
      // a = b montmul c1
      dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
      // load 32 (8x4S) coefficients via second offsets = c2
      vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
      // add/sub with result of multiply
      vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
      vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
      // write back new coefficients using same offsets
      vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
      vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
    }

    // level 6
    // At level 6 the coefficients we need to combine with the zetas
    // are grouped in memory in pairs, the first two being montmul
    // inputs and the second add/sub inputs. We can still implement
    // the montmul+sub+add using 4-way parallelism but only if we
    // combine the coefficients with the zetas 16 at a time. We load 8
    // adjacent values at 4 different offsets using an ld2 load with
    // arrangement 2D. That interleaves the lower and upper halves of
    // each pair of quadwords into successive vector registers. We
    // then need to montmul the 4 even elements of the coefficients
    // register sequence by the zetas in order and then add/sub the 4
    // odd elements of the coefficients register sequence. We use an
    // equivalent st2 operation to store the results back into memory
    // de-interleaved.
    for (int i = 0; i < 1024; i += 128) {
      // reload constants q, qinv each iteration as they get clobbered later
      vs_ldpq(vq, dilithiumConsts); // qInv, q
      // load interleaved 16 (4x2D) coefficients via offsets
      vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
      // load next 16 (4x4S) inputs
      vs_ldpq_post(vs_front(vs2), zetas);
      // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
      dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
                                  vs_front(vs2), vtmp, vq);
      // store interleaved 16 (4x2D) coefficients via offsets
      vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
    }

    // level 7
    // At level 7 the coefficients we need to combine with the zetas
    // occur singly with montmul inputs alternating with add/sub
    // inputs. Once again we can use 4-way parallelism to combine 16
    // zetas at a time. However, we have to load 8 adjacent values at
    // 4 different offsets using an ld2 load with arrangement 4S. That
    // interleaves the odd words of each pair into one
    // coefficients vector register and the even words of the pair
    // into the next register. We then need to montmul the 4 even
    // elements of the coefficients register sequence by the zetas in
    // order and then add/sub the 4 odd elements of the coefficients
    // register sequence. We use an equivalent st2 operation to store
    // the results back into memory de-interleaved.

    for (int i = 0; i < 1024; i += 128) {
      // reload constants q, qinv each iteration as they get clobbered later
      vs_ldpq(vq, dilithiumConsts); // qInv, q
      // load interleaved 16 (4x4S) coefficients via offsets
      vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
      // load next 16 (4x4S) inputs
      vs_ldpq_post(vs_front(vs2), zetas);
      // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
      dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
                                  vs_front(vs2), vtmp, vq);
      // store interleaved 16 (4x4S) coefficients via offsets
      vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
    }
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
 6623 
  // At these levels, the indices that correspond to the 'j's (and 'j+l's)
  // in the Java implementation come in sequences of at least 8, so we
  // can use ldpq to collect the corresponding data into pairs of vector
  // registers
  // We collect the coefficients that correspond to the 'j's into vs1
  // the coefficients that correspond to the 'j+l's into vs2 then
  // do the additions into vs3 and the subtractions into vs1 then
  // save the result of the additions, load the zetas into vs2
  // do the (Montgomery) multiplications by zeta in parallel into vs2
  // finally save the results back to the coeffs array
  void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
    const Register coeffs, const Register zetas) {
    int c1 = 0;
    int c2 = 32;
    int startIncr;
    int offsets[4];
    VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
    VSeq<2> vq(30);                    // n.b. constants overlap vs3

    offsets[0] = 0;

    for (int level = 3; level < 8; level++) {
      int c1Start = c1;
      int c2Start = c2;
      if (level == 3) {
        offsets[1] = 64;
        offsets[2] = 128;
        offsets[3] = 192;
      } else if (level == 4) {
        offsets[1] = 32;
        offsets[2] = 128;
        offsets[3] = 160;
      } else {
        offsets[1] = 32;
        offsets[2] = 64;
        offsets[3] = 96;
      }

      // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
      // time at 4 different offsets and multiply them in order by the
      // next set of input values. So we employ indexed load and store
      // pair instructions with arrangement 4S.
      for (int i = 0; i < 4; i++) {
        // load v1 32 (8x4S) coefficients relative to first start index
        vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
        // load v2 32 (8x4S) coefficients relative to second start index
        vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
        // a0 = v1 + v2 -- n.b. clobbers vq
        vs_addv(vs3, __ T4S, vs1, vs2);
        // a1 = v1 - v2
        vs_subv(vs1, __ T4S, vs1, vs2);
        // save a0 relative to first start index
        vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
        // load constants q, qinv each iteration as they get clobbered above
        vs_ldpq(vq, dilithiumConsts); // qInv, q
        // load b next 32 (8x4S) inputs
        vs_ldpq_post(vs2, zetas);
        // a = a1 montmul b
        dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
        // save a relative to second start index
        vs_stpq_indexed(vs2, coeffs, c2Start, offsets);

        // advance the start positions by an amount that depends on
        // how far through the zeta sequence we have progressed
        int k = 4 * level + i;

        if (k < 24) {
          startIncr = 256;
        } else if (k == 25) {
          startIncr = 384;
        } else {
          startIncr = 128;
        }

        c1Start += startIncr;
        c2Start += startIncr;
      }

      // the gap between the 'j' and 'j+l' indexes doubles each level
      c2 *= 2;
    }
  }
 6704 
 6705   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6706   // Implements the method
 6707   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6708   // the sun.security.provider.ML_DSA class.
 6709   //
 6710   // coeffs (int[256]) = c_rarg0
 6711   // zetas (int[256]) = c_rarg1
  address generate_dilithiumAlmostInverseNtt() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    // incoming Java args
    const Register coeffs = c_rarg0;
    const Register zetas = c_rarg1;

    // scratch registers
    const Register tmpAddr = r9;
    const Register dilithiumConsts = r10;
    const Register result = r11;
    VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
    VSeq<2> vq(30);                    // n.b. constants overlap vs3
    // offsets used by the interleaved ld2/st2 accesses at levels 0 and 1
    int offsets[4] = { 0, 32, 64, 96 };
    // first and second offset sequences used by the ldr/str Q accesses
    // at level 2
    int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
    int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };

    // results are written back in place over the input coefficients
    __ add(result, coeffs, 0);
    __ lea(dilithiumConsts,
             ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));

    // Each level represents one iteration of the outer for loop of the Java version

    // level 0
    // At level 0 we need to interleave adjacent quartets of
    // coefficients before we multiply and add/sub by the next 16
    // zetas just as we did for level 7 in the multiply code. So we
    // load and store the values using an ld2/st2 with arrangement 4S.
    for (int i = 0; i < 1024; i += 128) {
      // load constants q, qinv
      // n.b. this can be moved out of the loop as they do not get
      // clobbered by first two loops
      vs_ldpq(vq, dilithiumConsts); // qInv, q
      // a0/a1 load interleaved 32 (8x4S) coefficients
      vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
      // b load next 16 (4x4S) inputs
      vs_ldpq_post(vs_front(vs2), zetas);
      // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
      // n.b. second half of vs2 provides temporary register storage
      dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
                                  vs_front(vs2), vs_back(vs2), vtmp, vq);
      // a0/a1 store interleaved 32 (8x4S) coefficients
      vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
    }

    // level 1
    // At level 1 we need to interleave pairs of adjacent pairs of
    // coefficients before we multiply by the next 16 zetas just as we
    // did for level 6 in the multiply code. So we load and store the
    // values an ld2/st2 with arrangement 2D.
    for (int i = 0; i < 1024; i += 128) {
      // a0/a1 load interleaved 32 (8x2D) coefficients
      vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
      // b load next 16 (4x4S) inputs
      vs_ldpq_post(vs_front(vs2), zetas);
      // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
      // n.b. second half of vs2 provides temporary register storage
      dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
                                  vs_front(vs2), vs_back(vs2), vtmp, vq);
      // a0/a1 store interleaved 32 (8x2D) coefficients
      vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
    }

    // level 2
    // At level 2 coefficients come in blocks of 4. So, we load 4
    // adjacent coefficients at 8 distinct offsets for both the first
    // and second coefficient sequences, using an ldr with register
    // variant Q then combine them with next set of 32 zetas. Likewise
    // we store the results using an str with register variant Q.
    for (int i = 0; i < 1024; i += 256) {
      // c0 load 32 (8x4S) coefficients via first offsets
      vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
      // c1 load 32 (8x4S) coefficients via second offsets
      vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2);
      // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
      vs_addv(vs3, __ T4S, vs1, vs2);
      // c = c0 - c1
      vs_subv(vs1, __ T4S, vs1, vs2);
      // store a0 32 (8x4S) coefficients via first offsets
      vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
      // b load 32 (8x4S) next inputs
      vs_ldpq_post(vs2, zetas);
      // reload constants q, qinv -- they were clobbered earlier
      vs_ldpq(vq, dilithiumConsts); // qInv, q
      // compute a1 = b montmul c
      dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
      // store a1 32 (8x4S) coefficients via second offsets
      vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
    }

    // levels 3-7 are handled by the shared helper
    dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
 6815 
 6816   // Dilithium multiply polynomials in the NTT domain.
 6817   // Straightforward implementation of the method
 6818   // static int implDilithiumNttMult(
 6819   //              int[] result, int[] ntta, int[] nttb {} of
 6820   // the sun.security.provider.ML_DSA class.
 6821   //
 6822   // result (int[256]) = c_rarg0
 6823   // poly1 (int[256]) = c_rarg1
 6824   // poly2 (int[256]) = c_rarg2
 6825   address generate_dilithiumNttMult() {
 6826 
 6827         __ align(CodeEntryAlignment);
 6828     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 6829     StubCodeMark mark(this, stub_id);
 6830     address start = __ pc();
 6831     __ enter();
 6832 
 6833     Label L_loop;
 6834 
 6835     const Register result = c_rarg0;
 6836     const Register poly1 = c_rarg1;
 6837     const Register poly2 = c_rarg2;
 6838 
 6839     const Register dilithiumConsts = r10;
 6840     const Register len = r11;
 6841 
 6842     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6843     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6844     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6845     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6846 
 6847     __ lea(dilithiumConsts,
 6848              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6849 
 6850     // load constants q, qinv
 6851     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6852     // load constant rSquare into v29
 6853     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 6854 
 6855     __ mov(len, zr);
 6856     __ add(len, len, 1024);
 6857 
 6858     __ BIND(L_loop);
 6859 
 6860     // b load 32 (8x4S) next inputs from poly1
 6861     vs_ldpq_post(vs1, poly1);
 6862     // c load 32 (8x4S) next inputs from poly2
 6863     vs_ldpq_post(vs2, poly2);
 6864     // compute a = b montmul c
 6865     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6866     // compute a = rsquare montmul a
 6867     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6868     // save a 32 (8x4S) results
 6869     vs_stpq_post(vs2, result);
 6870 
 6871     __ sub(len, len, 128);
 6872     __ cmp(len, (u1)128);
 6873     __ br(Assembler::GE, L_loop);
 6874 
 6875     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6876     __ mov(r0, zr); // return 0
 6877     __ ret(lr);
 6878 
 6879     return start;
 6880   }
 6881 
  // Dilithium Montgomery multiply an array by a constant.
  // A straightforward implementation of the method
  // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  // of the sun.security.provider.ML_DSA class
 6886   //
 6887   // coeffs (int[256]) = c_rarg0
 6888   // constant (int) = c_rarg1
  address generate_dilithiumMontMulByConstant() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    Label L_loop;

    // incoming Java args
    const Register coeffs = c_rarg0;
    const Register constant = c_rarg1;

    // scratch registers
    const Register dilithiumConsts = r10;
    const Register result = r11;
    const Register len = r12;

    VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
    VSeq<2> vq(30);                    // n.b. constants overlap vs3
    VSeq<8> vconst(29, 0);             // for montmul by constant

    // results track inputs
    __ add(result, coeffs, 0);
    __ lea(dilithiumConsts,
             ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));

    // load constants q, qinv -- they do not get clobbered by first two loops
    vs_ldpq(vq, dilithiumConsts); // qInv, q
    // copy caller supplied constant across vconst
    __ dup(vconst[0], __ T4S, constant);
    // len counts down the 1024 bytes (256 ints) of input
    __ mov(len, zr);
    __ add(len, len, 1024);

    __ BIND(L_loop);

    // load next 32 inputs
    vs_ldpq_post(vs2, coeffs);
    // mont mul by constant
    dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
    // write next 32 results
    vs_stpq_post(vs2, result);

    // each iteration consumes 128 bytes (32 ints) of input
    __ sub(len, len, 128);
    __ cmp(len, (u1)128);
    __ br(Assembler::GE, L_loop);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
 6942 
 6943   // Dilithium decompose poly.
 6944   // Implements the method
 6945   // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {}
 6946   // of the sun.security.provider.ML_DSA class
 6947   //
 6948   // input (int[256]) = c_rarg0
 6949   // lowPart (int[256]) = c_rarg1
 6950   // highPart (int[256]) = c_rarg2
 6951   // twoGamma2  (int) = c_rarg3
 6952   // multiplier (int) = c_rarg4
  address generate_dilithiumDecomposePoly() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    Label L_loop;

    // incoming Java args
    const Register input = c_rarg0;
    const Register lowPart = c_rarg1;
    const Register highPart = c_rarg2;
    const Register twoGamma2 = c_rarg3;
    const Register multiplier = c_rarg4;

    // scratch registers
    const Register len = r9;
    const Register dilithiumConsts = r10;
    const Register tmp = r11;

    // 6 independent sets of 4x4s values
    VSeq<4> vs1(0), vs2(4), vs3(8);
    VSeq<4> vs4(12), vs5(16), vtmp(20);

    // 7 constants for cross-multiplying
    VSeq<4> one(25, 0);
    VSeq<4> qminus1(26, 0);
    VSeq<4> g2(27, 0);
    VSeq<4> twog2(28, 0);
    VSeq<4> mult(29, 0);
    VSeq<4> q(30, 0);
    VSeq<4> qadd(31, 0);

    __ enter();

    __ lea(dilithiumConsts,
             ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));

    // save callee-saved registers
    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    // populate constant registers
    __ mov(tmp, zr);
    __ add(tmp, tmp, 1);
    __ dup(one[0], __ T4S, tmp); // 1
    __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
    __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
    __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
    __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
    __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
    __ sshr(g2[0], __ T4S, v28, 1); // gamma2 = (2 * gamma2) / 2

    // len counts down the 1024 bytes (256 ints) of input
    __ mov(len, zr);
    __ add(len, len, 1024);

    __ BIND(L_loop);

    // load next 4x4S inputs interleaved: rplus --> vs1
    __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));

    //  rplus = rplus - ((rplus + qadd) >> 23) * q
    vs_addv(vtmp, __ T4S, vs1, qadd);
    vs_sshr(vtmp, __ T4S, vtmp, 23);
    vs_mulv(vtmp, __ T4S, vtmp, q);
    vs_subv(vs1, __ T4S, vs1, vtmp);

    // rplus = rplus + ((rplus >> 31) & dilithium_q);
    vs_sshr(vtmp, __ T4S, vs1, 31);
    vs_andr(vtmp, vtmp, q);
    vs_addv(vs1, __ T4S, vs1, vtmp);

    // quotient --> vs2
    // int quotient = (rplus * multiplier) >> 22;
    vs_mulv(vtmp, __ T4S, vs1, mult);
    vs_sshr(vs2, __ T4S, vtmp, 22);

    // r0 --> vs3
    // int r0 = rplus - quotient * twoGamma2;
    vs_mulv(vtmp, __ T4S, vs2, twog2);
    vs_subv(vs3, __ T4S, vs1, vtmp);

    // mask --> vs4
    // int mask = (twoGamma2 - r0) >> 22;
    vs_subv(vtmp, __ T4S, twog2, vs3);
    vs_sshr(vs4, __ T4S, vtmp, 22);

    // r0 -= (mask & twoGamma2);
    vs_andr(vtmp, vs4, twog2);
    vs_subv(vs3, __ T4S, vs3, vtmp);

    //  quotient += (mask & 1);
    vs_andr(vtmp, vs4, one);
    vs_addv(vs2, __ T4S, vs2, vtmp);

    // mask = (twoGamma2 / 2 - r0) >> 31;
    vs_subv(vtmp, __ T4S, g2, vs3);
    vs_sshr(vs4, __ T4S, vtmp, 31);

    // r0 -= (mask & twoGamma2);
    vs_andr(vtmp, vs4, twog2);
    vs_subv(vs3, __ T4S, vs3, vtmp);

    // quotient += (mask & 1);
    vs_andr(vtmp, vs4, one);
    vs_addv(vs2, __ T4S, vs2, vtmp);

    // r1 --> vs5
    // int r1 = rplus - r0 - (dilithium_q - 1);
    vs_subv(vtmp, __ T4S, vs1, vs3);
    vs_subv(vs5, __ T4S, vtmp, qminus1);

    // r1 --> vs1 (overwriting rplus)
    // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
    vs_negr(vtmp, __ T4S, vs5);
    vs_orr(vtmp, vs5, vtmp);
    vs_sshr(vs1, __ T4S, vtmp, 31);

    // r0 += ~r1;
    vs_notr(vtmp, vs1);
    vs_addv(vs3, __ T4S, vs3, vtmp);

    // r1 = r1 & quotient;
    vs_andr(vs1, vs2, vs1);

    // store results interleaved
    // lowPart[m] = r0;
    // highPart[m] = r1;
    __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
    __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));

    // each iteration consumes 64 bytes (16 ints) of input
    __ sub(len, len, 64);
    __ cmp(len, (u1)64);
    __ br(Assembler::GE, L_loop);

    // restore callee-saved vector registers
    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
 7100 
  // Five-way bit-clear-and-xor (the Keccak chi step applied to one row
  // of five lanes): a[i] ^= ~a[i+1] & a[i+2], indices taken mod 5.
  // All five lanes are updated in place; tmp0..tmp2 are clobbered.
  // n.b. the eor for a2..a4 is interleaved with the bic computations
  // so that only three temporaries are needed.
  void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
             Register tmp0, Register tmp1, Register tmp2) {
    __ bic(tmp0, a2, a1); // for a0
    __ bic(tmp1, a3, a2); // for a1
    __ bic(tmp2, a4, a3); // for a2
    __ eor(a2, a2, tmp2);
    __ bic(tmp2, a0, a4); // for a3
    __ eor(a3, a3, tmp2);
    __ bic(tmp2, a1, a0); // for a4
    __ eor(a0, a0, tmp0);
    __ eor(a1, a1, tmp1);
    __ eor(a4, a4, tmp2);
  }
 7114 
  // Emit one round of Keccak-f[1600] over the 25 lanes a0..a24 held in
  // general-purpose registers: theta (column parities c0..c4 and deltas
  // d0..d4), rho/pi (the rotate/permute chain), chi (bcax5 per row) and
  // iota (xor the next round constant, loaded post-incremented from rc,
  // into lane 0). tmp0..tmp2 are clobbered. When rfp or r18 cannot be
  // used as extra scratch (PreserveFramePointer / R18_RESERVED), two
  // lanes are spilled to the stack for the duration of theta.
  void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
                        Register a0, Register a1, Register a2, Register a3, Register a4,
                        Register a5, Register a6, Register a7, Register a8, Register a9,
                        Register a10, Register a11, Register a12, Register a13, Register a14,
                        Register a15, Register a16, Register a17, Register a18, Register a19,
                        Register a20, Register a21, Register a22, Register a23, Register a24,
                        Register tmp0, Register tmp1, Register tmp2) {
    __ eor3(tmp1, a4, a9, a14);
    __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
    __ eor3(tmp2, a1, a6, a11);
    __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
    __ rax1(tmp2, tmp0, tmp1); // d0
    {
      // theta needs two more temporaries; use rfp/r18 when allowed,
      // otherwise spill a4/a9 (already folded into c4 above)
      Register tmp3, tmp4;
      if (can_use_fp && can_use_r18) {
        tmp3 = rfp;
        tmp4 = r18_tls;
      } else {
        tmp3 = a4;
        tmp4 = a9;
        __ stp(tmp3, tmp4, __ pre(sp, -16));
      }

      __ eor3(tmp3, a0, a5, a10);
      __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
      __ eor(a0, a0, tmp2);
      __ eor(a5, a5, tmp2);
      __ eor(a10, a10, tmp2);
      __ eor(a15, a15, tmp2);
      __ eor(a20, a20, tmp2); // d0(tmp2)
      __ eor3(tmp3, a2, a7, a12);
      __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
      __ rax1(tmp3, tmp4, tmp2); // d1
      __ eor(a1, a1, tmp3);
      __ eor(a6, a6, tmp3);
      __ eor(a11, a11, tmp3);
      __ eor(a16, a16, tmp3);
      __ eor(a21, a21, tmp3); // d1(tmp3)
      __ rax1(tmp3, tmp2, tmp0); // d3
      __ eor3(tmp2, a3, a8, a13);
      __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
      __ eor(a3, a3, tmp3);
      __ eor(a8, a8, tmp3);
      __ eor(a13, a13, tmp3);
      __ eor(a18, a18, tmp3);
      __ eor(a23, a23, tmp3);
      __ rax1(tmp2, tmp1, tmp0); // d2
      __ eor(a2, a2, tmp2);
      __ eor(a7, a7, tmp2);
      __ eor(a12, a12, tmp2);
      __ rax1(tmp0, tmp0, tmp4); // d4
      // restore the spilled lanes before their d4 update below
      if (!can_use_fp || !can_use_r18) {
        __ ldp(tmp3, tmp4, __ post(sp, 16));
      }
      __ eor(a17, a17, tmp2);
      __ eor(a22, a22, tmp2);
      __ eor(a4, a4, tmp0);
      __ eor(a9, a9, tmp0);
      __ eor(a14, a14, tmp0);
      __ eor(a19, a19, tmp0);
      __ eor(a24, a24, tmp0);
    }

    // rho/pi: rotate each lane and move it to its permuted position;
    // a10's old value is parked in tmp0 until the chain closes at a7
    __ rol(tmp0, a10, 3);
    __ rol(a10, a1, 1);
    __ rol(a1, a6, 44);
    __ rol(a6, a9, 20);
    __ rol(a9, a22, 61);
    __ rol(a22, a14, 39);
    __ rol(a14, a20, 18);
    __ rol(a20, a2, 62);
    __ rol(a2, a12, 43);
    __ rol(a12, a13, 25);
    __ rol(a13, a19, 8) ;
    __ rol(a19, a23, 56);
    __ rol(a23, a15, 41);
    __ rol(a15, a4, 27);
    __ rol(a4, a24, 14);
    __ rol(a24, a21, 2);
    __ rol(a21, a8, 55);
    __ rol(a8, a16, 45);
    __ rol(a16, a5, 36);
    __ rol(a5, a3, 28);
    __ rol(a3, a18, 21);
    __ rol(a18, a17, 15);
    __ rol(a17, a11, 10);
    __ rol(a11, a7, 6);
    __ mov(a7, tmp0);

    // chi: one bcax5 per row of five lanes
    bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
    bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
    bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
    bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
    bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);

    // iota: fold in the next round constant
    __ ldr(tmp1, __ post(rc, 8));
    __ eor(a0, a0, tmp1);

  }
 7215 
 7216   // Arguments:
 7217   //
 7218   // Inputs:
 7219   //   c_rarg0   - byte[]  source+offset
 7220   //   c_rarg1   - byte[]  SHA.state
 7221   //   c_rarg2   - int     block_size
 7222   //   c_rarg3   - int     offset
 7223   //   c_rarg4   - int     limit
 7224   //
  address generate_sha3_implCompress_gpr(StubId stub_id) {
    // multi_block selects between the single-block intrinsic and the
    // multi-block variant that loops until offset would exceed limit
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha3_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha3_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }

    // the 24 Keccak-f[1600] round constants
    static const uint64_t round_consts[24] = {
      0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
      0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
      0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
      0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
      0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
      0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
      0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
      0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register buf           = c_rarg0;
    Register state         = c_rarg1;
    Register block_size    = c_rarg2;
    Register ofs           = c_rarg3;
    Register limit         = c_rarg4;

    // use r3..r17,r19..r28 to keep a0..a24.
    // a0..a24 are respective locals from SHA3.java
    Register a0 = r25,
             a1 = r26,
             a2 = r27,
             a3 = r3,
             a4 = r4,
             a5 = r5,
             a6 = r6,
             a7 = r7,
             a8 = rscratch1, // r8
             a9 = rscratch2, // r9
             a10 = r10,
             a11 = r11,
             a12 = r12,
             a13 = r13,
             a14 = r14,
             a15 = r15,
             a16 = r16,
             a17 = r17,
             a18 = r28,
             a19 = r19,
             a20 = r20,
             a21 = r21,
             a22 = r22,
             a23 = r23,
             a24 = r24;

    // the argument registers double as temporaries once their values
    // have been saved to the stack
    Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;

    Label sha3_loop, rounds24_preloop, loop_body;
    Label sha3_512_or_sha3_384, shake128;

    bool can_use_r18 = false;
#ifndef R18_RESERVED
    can_use_r18 = true;
#endif
    bool can_use_fp = !PreserveFramePointer;

    __ enter();

    // save almost all yet unsaved gpr registers on stack
    __ str(block_size, __ pre(sp, -128));
    if (multi_block) {
      __ stpw(ofs, limit, Address(sp, 8));
    }
    // 8 bytes at sp+16 will be used to keep buf
    __ stp(r19, r20, Address(sp, 32));
    __ stp(r21, r22, Address(sp, 48));
    __ stp(r23, r24, Address(sp, 64));
    __ stp(r25, r26, Address(sp, 80));
    __ stp(r27, r28, Address(sp, 96));
    if (can_use_r18 && can_use_fp) {
      __ stp(r18_tls, state, Address(sp, 112));
    } else {
      __ str(state, Address(sp, 112));
    }

    // begin sha3 calculations: loading a0..a24 from state array
    __ ldp(a0, a1, state);
    __ ldp(a2, a3, Address(state, 16));
    __ ldp(a4, a5, Address(state, 32));
    __ ldp(a6, a7, Address(state, 48));
    __ ldp(a8, a9, Address(state, 64));
    __ ldp(a10, a11, Address(state, 80));
    __ ldp(a12, a13, Address(state, 96));
    __ ldp(a14, a15, Address(state, 112));
    __ ldp(a16, a17, Address(state, 128));
    __ ldp(a18, a19, Address(state, 144));
    __ ldp(a20, a21, Address(state, 160));
    __ ldp(a22, a23, Address(state, 176));
    __ ldr(a24, Address(state, 192));

    __ BIND(sha3_loop);

    // load input: xor the next block_size bytes of input into the
    // state; the first 56 bytes are common to all block sizes
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a0, a0, tmp3);
    __ eor(a1, a1, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a2, a2, tmp3);
    __ eor(a3, a3, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a4, a4, tmp3);
    __ eor(a5, a5, tmp2);
    __ ldr(tmp3, __ post(buf, 8));
    __ eor(a6, a6, tmp3);

    // block_size == 72, SHA3-512; block_size == 104, SHA3-384
    __ tbz(block_size, 7, sha3_512_or_sha3_384);

    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a7, a7, tmp3);
    __ eor(a8, a8, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a9, a9, tmp3);
    __ eor(a10, a10, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a11, a11, tmp3);
    __ eor(a12, a12, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a13, a13, tmp3);
    __ eor(a14, a14, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a15, a15, tmp3);
    __ eor(a16, a16, tmp2);

    // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
    __ andw(tmp2, block_size, 48);
    __ cbzw(tmp2, rounds24_preloop);
    __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
    __ ldr(tmp3, __ post(buf, 8));
    __ eor(a17, a17, tmp3);
    __ b(rounds24_preloop);

    __ BIND(shake128);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a17, a17, tmp3);
    __ eor(a18, a18, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a19, a19, tmp3);
    __ eor(a20, a20, tmp2);
    __ b(rounds24_preloop); // block_size == 168, SHAKE128

    __ BIND(sha3_512_or_sha3_384);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a7, a7, tmp3);
    __ eor(a8, a8, tmp2);
    __ tbz(block_size, 5, rounds24_preloop); // SHA3-512

    // SHA3-384
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a9, a9, tmp3);
    __ eor(a10, a10, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a11, a11, tmp3);
    __ eor(a12, a12, tmp2);

    __ BIND(rounds24_preloop);
    // all gprs are busy keeping the state, so count the 24 rounds in
    // float registers instead
    __ fmovs(v0, 24.0); // float loop counter,
    __ fmovs(v1, 1.0);  // exact representation

    __ str(buf, Address(sp, 16));
    __ lea(tmp3, ExternalAddress((address) round_consts));

    __ BIND(loop_body);
    keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
                     a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
                     a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
                     tmp0, tmp1, tmp2);
    __ fsubs(v0, v0, v1);
    __ fcmps(v0, 0.0);
    __ br(__ NE, loop_body);

    if (multi_block) {
      __ ldrw(block_size, sp); // block_size
      __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
      __ addw(tmp2, tmp2, block_size);
      __ cmpw(tmp2, tmp1);
      __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
      __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
      __ br(Assembler::LE, sha3_loop);
      __ movw(c_rarg0, tmp2); // return offset
    }
    if (can_use_fp && can_use_r18) {
      __ ldp(r18_tls, state, Address(sp, 112));
    } else {
      __ ldr(state, Address(sp, 112));
    }
    // save calculated sha3 state
    __ stp(a0, a1, Address(state));
    __ stp(a2, a3, Address(state, 16));
    __ stp(a4, a5, Address(state, 32));
    __ stp(a6, a7, Address(state, 48));
    __ stp(a8, a9, Address(state, 64));
    __ stp(a10, a11, Address(state, 80));
    __ stp(a12, a13, Address(state, 96));
    __ stp(a14, a15, Address(state, 112));
    __ stp(a16, a17, Address(state, 128));
    __ stp(a18, a19, Address(state, 144));
    __ stp(a20, a21, Address(state, 160));
    __ stp(a22, a23, Address(state, 176));
    __ str(a24, Address(state, 192));

    // restore required registers from stack
    __ ldp(r19, r20, Address(sp, 32));
    __ ldp(r21, r22, Address(sp, 48));
    __ ldp(r23, r24, Address(sp, 64));
    __ ldp(r25, r26, Address(sp, 80));
    __ ldp(r27, r28, Address(sp, 96));
    if (can_use_fp && can_use_r18) {
      __ add(rfp, sp, 128); // leave() will copy rfp to sp below
    } // else no need to recalculate rfp, since it wasn't changed

    __ leave();

    __ ret(lr);

    return start;
  }
 7460 
 7461   /**
 7462    *  Arguments:
 7463    *
 7464    * Inputs:
 7465    *   c_rarg0   - int crc
 7466    *   c_rarg1   - byte* buf
 7467    *   c_rarg2   - int length
 7468    *
 7469    * Output:
   *       r0    - int crc result
 7471    */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
    StubCodeMark mark(this, stub_id);

    address start = __ pc();

    // incoming Java args
    const Register crc   = c_rarg0;  // crc
    const Register buf   = c_rarg1;  // source java byte array address
    const Register len   = c_rarg2;  // length
    // remaining argument registers are free to use as scratch
    const Register table0 = c_rarg3; // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3 = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // delegate the whole computation to the macro assembler kernel;
    // crc is c_rarg0 == r0 so the result is already in the return register
    __ kernel_crc32(crc, buf, len,
              table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }
 7501 
 7502   /**
 7503    *  Arguments:
 7504    *
 7505    * Inputs:
 7506    *   c_rarg0   - int crc
 7507    *   c_rarg1   - byte* buf
 7508    *   c_rarg2   - int length
 7509    *   c_rarg3   - int* table
 7510    *
 7511    * Output:
 7512    *       r0   - int crc result
 7513    */
  address generate_updateBytesCRC32C() {
    assert(UseCRC32CIntrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
    StubCodeMark mark(this, stub_id);

    address start = __ pc();

    // incoming Java args
    const Register crc   = c_rarg0;  // crc
    const Register buf   = c_rarg1;  // source java byte array address
    const Register len   = c_rarg2;  // length
    // remaining argument registers are free to use as scratch
    const Register table0 = c_rarg3; // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3 = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // delegate the whole computation to the macro assembler kernel;
    // crc is c_rarg0 == r0 so the result is already in the return register
    __ kernel_crc32c(crc, buf, len,
              table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }
 7543 
 7544   /***
 7545    *  Arguments:
 7546    *
 7547    *  Inputs:
 7548    *   c_rarg0   - int   adler
 7549    *   c_rarg1   - byte* buff
 7550    *   c_rarg2   - int   len
 7551    *
 7552    * Output:
 7553    *   c_rarg0   - int adler result
 7554    */
 7555   address generate_updateBytesAdler32() {
 7556     __ align(CodeEntryAlignment);
 7557     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 7558     StubCodeMark mark(this, stub_id);
 7559     address start = __ pc();
 7560 
 7561     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7562 
 7563     // Aliases
 7564     Register adler  = c_rarg0;
 7565     Register s1     = c_rarg0;
 7566     Register s2     = c_rarg3;
 7567     Register buff   = c_rarg1;
 7568     Register len    = c_rarg2;
 7569     Register nmax  = r4;
 7570     Register base  = r5;
 7571     Register count = r6;
 7572     Register temp0 = rscratch1;
 7573     Register temp1 = rscratch2;
 7574     FloatRegister vbytes = v0;
 7575     FloatRegister vs1acc = v1;
 7576     FloatRegister vs2acc = v2;
 7577     FloatRegister vtable = v3;
 7578 
 7579     // Max number of bytes we can process before having to take the mod
 7580     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7581     uint64_t BASE = 0xfff1;
 7582     uint64_t NMAX = 0x15B0;
 7583 
 7584     __ mov(base, BASE);
 7585     __ mov(nmax, NMAX);
 7586 
 7587     // Load accumulation coefficients for the upper 16 bits
 7588     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7589     __ ld1(vtable, __ T16B, Address(temp0));
 7590 
 7591     // s1 is initialized to the lower 16 bits of adler
 7592     // s2 is initialized to the upper 16 bits of adler
 7593     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7594     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7595 
 7596     // The pipelined loop needs at least 16 elements for 1 iteration
 7597     // It does check this, but it is more effective to skip to the cleanup loop
 7598     __ cmp(len, (u1)16);
 7599     __ br(Assembler::HS, L_nmax);
 7600     __ cbz(len, L_combine);
 7601 
 7602     __ bind(L_simple_by1_loop);
 7603     __ ldrb(temp0, Address(__ post(buff, 1)));
 7604     __ add(s1, s1, temp0);
 7605     __ add(s2, s2, s1);
 7606     __ subs(len, len, 1);
 7607     __ br(Assembler::HI, L_simple_by1_loop);
 7608 
 7609     // s1 = s1 % BASE
 7610     __ subs(temp0, s1, base);
 7611     __ csel(s1, temp0, s1, Assembler::HS);
 7612 
 7613     // s2 = s2 % BASE
 7614     __ lsr(temp0, s2, 16);
 7615     __ lsl(temp1, temp0, 4);
 7616     __ sub(temp1, temp1, temp0);
 7617     __ add(s2, temp1, s2, ext::uxth);
 7618 
 7619     __ subs(temp0, s2, base);
 7620     __ csel(s2, temp0, s2, Assembler::HS);
 7621 
 7622     __ b(L_combine);
 7623 
 7624     __ bind(L_nmax);
 7625     __ subs(len, len, nmax);
 7626     __ sub(count, nmax, 16);
 7627     __ br(Assembler::LO, L_by16);
 7628 
 7629     __ bind(L_nmax_loop);
 7630 
 7631     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7632                                       vbytes, vs1acc, vs2acc, vtable);
 7633 
 7634     __ subs(count, count, 16);
 7635     __ br(Assembler::HS, L_nmax_loop);
 7636 
 7637     // s1 = s1 % BASE
 7638     __ lsr(temp0, s1, 16);
 7639     __ lsl(temp1, temp0, 4);
 7640     __ sub(temp1, temp1, temp0);
 7641     __ add(temp1, temp1, s1, ext::uxth);
 7642 
 7643     __ lsr(temp0, temp1, 16);
 7644     __ lsl(s1, temp0, 4);
 7645     __ sub(s1, s1, temp0);
 7646     __ add(s1, s1, temp1, ext:: uxth);
 7647 
 7648     __ subs(temp0, s1, base);
 7649     __ csel(s1, temp0, s1, Assembler::HS);
 7650 
 7651     // s2 = s2 % BASE
 7652     __ lsr(temp0, s2, 16);
 7653     __ lsl(temp1, temp0, 4);
 7654     __ sub(temp1, temp1, temp0);
 7655     __ add(temp1, temp1, s2, ext::uxth);
 7656 
 7657     __ lsr(temp0, temp1, 16);
 7658     __ lsl(s2, temp0, 4);
 7659     __ sub(s2, s2, temp0);
 7660     __ add(s2, s2, temp1, ext:: uxth);
 7661 
 7662     __ subs(temp0, s2, base);
 7663     __ csel(s2, temp0, s2, Assembler::HS);
 7664 
 7665     __ subs(len, len, nmax);
 7666     __ sub(count, nmax, 16);
 7667     __ br(Assembler::HS, L_nmax_loop);
 7668 
 7669     __ bind(L_by16);
 7670     __ adds(len, len, count);
 7671     __ br(Assembler::LO, L_by1);
 7672 
 7673     __ bind(L_by16_loop);
 7674 
 7675     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7676                                       vbytes, vs1acc, vs2acc, vtable);
 7677 
 7678     __ subs(len, len, 16);
 7679     __ br(Assembler::HS, L_by16_loop);
 7680 
 7681     __ bind(L_by1);
 7682     __ adds(len, len, 15);
 7683     __ br(Assembler::LO, L_do_mod);
 7684 
 7685     __ bind(L_by1_loop);
 7686     __ ldrb(temp0, Address(__ post(buff, 1)));
 7687     __ add(s1, temp0, s1);
 7688     __ add(s2, s2, s1);
 7689     __ subs(len, len, 1);
 7690     __ br(Assembler::HS, L_by1_loop);
 7691 
 7692     __ bind(L_do_mod);
 7693     // s1 = s1 % BASE
 7694     __ lsr(temp0, s1, 16);
 7695     __ lsl(temp1, temp0, 4);
 7696     __ sub(temp1, temp1, temp0);
 7697     __ add(temp1, temp1, s1, ext::uxth);
 7698 
 7699     __ lsr(temp0, temp1, 16);
 7700     __ lsl(s1, temp0, 4);
 7701     __ sub(s1, s1, temp0);
 7702     __ add(s1, s1, temp1, ext:: uxth);
 7703 
 7704     __ subs(temp0, s1, base);
 7705     __ csel(s1, temp0, s1, Assembler::HS);
 7706 
 7707     // s2 = s2 % BASE
 7708     __ lsr(temp0, s2, 16);
 7709     __ lsl(temp1, temp0, 4);
 7710     __ sub(temp1, temp1, temp0);
 7711     __ add(temp1, temp1, s2, ext::uxth);
 7712 
 7713     __ lsr(temp0, temp1, 16);
 7714     __ lsl(s2, temp0, 4);
 7715     __ sub(s2, s2, temp0);
 7716     __ add(s2, s2, temp1, ext:: uxth);
 7717 
 7718     __ subs(temp0, s2, base);
 7719     __ csel(s2, temp0, s2, Assembler::HS);
 7720 
 7721     // Combine lower bits and higher bits
 7722     __ bind(L_combine);
 7723     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7724 
 7725     __ ret(lr);
 7726 
 7727     return start;
 7728   }
 7729 
  // Vectorized accumulation step for generate_updateBytesAdler32: folds the
  // next 16 bytes at *buff into the running sums s1 and s2 and advances buff
  // by 16. vtable must hold the 16 weight bytes (16, 15, ..., 1) that the
  // caller loads from StubRoutines::aarch64::_adler_table.
  // Clobbers temp0, temp1, vbytes, vs1acc and vs2acc.
  void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
          Register temp0, Register temp1, FloatRegister vbytes,
          FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
    // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
    // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + s1
    //   ...
    //   s1 <- s1 + b16
    //   s2 <- s2 + s1
    // Putting above assignments together, we have:
    //   s1_new = s1 + b1 + b2 + ... + b16
    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
    //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
    //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
    __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));

    // s2 = s2 + s1 * 16
    __ add(s2, s2, s1, Assembler::LSL, 4);

    // vs1acc = b1 + b2 + b3 + ... + b16
    // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
    // umullv multiplies the low 8 byte lanes and umlalv accumulates the
    // remaining lanes, yielding eight 16-bit partial dot products in vs2acc.
    __ umullv(vs2acc, __ T8B, vtable, vbytes);
    __ umlalv(vs2acc, __ T16B, vtable, vbytes);
    // Horizontal reductions of the byte / halfword lanes to scalars.
    __ uaddlv(vs1acc, __ T16B, vbytes);
    __ uaddlv(vs2acc, __ T8H, vs2acc);

    // s1 = s1 + vs1acc, s2 = s2 + vs2acc
    __ fmovd(temp0, vs1acc);
    __ fmovd(temp1, vs2acc);
    __ add(s1, s1, temp0);
    __ add(s2, s2, temp1);
  }
 7766 
 7767   /**
 7768    *  Arguments:
 7769    *
 7770    *  Input:
 7771    *    c_rarg0   - x address
 7772    *    c_rarg1   - x length
 7773    *    c_rarg2   - y address
 7774    *    c_rarg3   - y length
 7775    *    c_rarg4   - z address
 7776    */
 7777   address generate_multiplyToLen() {
 7778     __ align(CodeEntryAlignment);
 7779     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 7780     StubCodeMark mark(this, stub_id);
 7781 
 7782     address start = __ pc();
 7783     const Register x     = r0;
 7784     const Register xlen  = r1;
 7785     const Register y     = r2;
 7786     const Register ylen  = r3;
 7787     const Register z     = r4;
 7788 
 7789     const Register tmp0  = r5;
 7790     const Register tmp1  = r10;
 7791     const Register tmp2  = r11;
 7792     const Register tmp3  = r12;
 7793     const Register tmp4  = r13;
 7794     const Register tmp5  = r14;
 7795     const Register tmp6  = r15;
 7796     const Register tmp7  = r16;
 7797 
 7798     BLOCK_COMMENT("Entry:");
 7799     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7800     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7801     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7802     __ ret(lr);
 7803 
 7804     return start;
 7805   }
 7806 
 7807   address generate_squareToLen() {
 7808     // squareToLen algorithm for sizes 1..127 described in java code works
 7809     // faster than multiply_to_len on some CPUs and slower on others, but
 7810     // multiply_to_len shows a bit better overall results
 7811     __ align(CodeEntryAlignment);
 7812     StubId stub_id = StubId::stubgen_squareToLen_id;
 7813     StubCodeMark mark(this, stub_id);
 7814     address start = __ pc();
 7815 
 7816     const Register x     = r0;
 7817     const Register xlen  = r1;
 7818     const Register z     = r2;
 7819     const Register y     = r4; // == x
 7820     const Register ylen  = r5; // == xlen
 7821 
 7822     const Register tmp0  = r3;
 7823     const Register tmp1  = r10;
 7824     const Register tmp2  = r11;
 7825     const Register tmp3  = r12;
 7826     const Register tmp4  = r13;
 7827     const Register tmp5  = r14;
 7828     const Register tmp6  = r15;
 7829     const Register tmp7  = r16;
 7830 
 7831     RegSet spilled_regs = RegSet::of(y, ylen);
 7832     BLOCK_COMMENT("Entry:");
 7833     __ enter();
 7834     __ push(spilled_regs, sp);
 7835     __ mov(y, x);
 7836     __ mov(ylen, xlen);
 7837     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7838     __ pop(spilled_regs, sp);
 7839     __ leave();
 7840     __ ret(lr);
 7841     return start;
 7842   }
 7843 
 7844   address generate_mulAdd() {
 7845     __ align(CodeEntryAlignment);
 7846     StubId stub_id = StubId::stubgen_mulAdd_id;
 7847     StubCodeMark mark(this, stub_id);
 7848 
 7849     address start = __ pc();
 7850 
 7851     const Register out     = r0;
 7852     const Register in      = r1;
 7853     const Register offset  = r2;
 7854     const Register len     = r3;
 7855     const Register k       = r4;
 7856 
 7857     BLOCK_COMMENT("Entry:");
 7858     __ enter();
 7859     __ mul_add(out, in, offset, len, k);
 7860     __ leave();
 7861     __ ret(lr);
 7862 
 7863     return start;
 7864   }
 7865 
  // Arguments:
  //
  // Input:
  //   c_rarg0   - newArr address
  //   c_rarg1   - oldArr address
  //   c_rarg2   - newIdx
  //   c_rarg3   - shiftCount
  //   c_rarg4   - numIter
  //
  // Shifts an array of 32-bit words right by shiftCount bits, producing
  // numIter result words starting at newArr[newIdx]:
  //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
  //                      | (oldArr[i]     <<  (32 - shiftCount))
  // The array is walked back-to-front: 4 words per SIMD iteration, then
  // 2 at a time, with a scalar tail for the first 1-3 words.
  address generate_bigIntegerRightShift() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

    Register newArr        = c_rarg0;
    Register oldArr        = c_rarg1;
    Register newIdx        = c_rarg2;
    Register shiftCount    = c_rarg3;
    Register numIter       = c_rarg4;
    Register idx           = numIter;  // loop index counts down in place

    Register newArrCur     = rscratch1;
    Register shiftRevCount = rscratch2;
    Register oldArrCur     = r13;
    Register oldArrNext    = r14;

    FloatRegister oldElem0        = v0;
    FloatRegister oldElem1        = v1;
    FloatRegister newElem         = v2;
    FloatRegister shiftVCount     = v3;
    FloatRegister shiftVRevCount  = v4;

    // Nothing to do for a zero iteration count.
    __ cbz(idx, Exit);

    // newArr += newIdx * 4 (32-bit array elements)
    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);

    // left shift count
    __ movw(shiftRevCount, 32);
    __ subw(shiftRevCount, shiftRevCount, shiftCount);

    // numIter too small to allow a 4-words SIMD loop, rolling back
    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftThree);

    // ushl shifts left for positive counts and right for negative ones,
    // so the right-shift amount is negated.
    __ dup(shiftVCount,    __ T4S, shiftCount);
    __ dup(shiftVRevCount, __ T4S, shiftRevCount);
    __ negr(shiftVCount,   __ T4S, shiftVCount);

    __ BIND(ShiftSIMDLoop);

    // Calculate the load addresses
    __ sub(idx, idx, 4);
    __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
    __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
    __ add(oldArrCur,  oldArrNext, 4);

    // Load 4 words and process
    __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
    __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
    __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);     // >>> shiftCount
    __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);  // << (32 - shiftCount)
    __ orr(newElem,   __ T16B, oldElem0, oldElem1);
    __ st1(newElem,   __ T4S,  Address(newArrCur));

    __ cmp(idx, (u1)4);
    __ br(Assembler::LT, ShiftTwoLoop);
    __ b(ShiftSIMDLoop);

    __ BIND(ShiftTwoLoop);
    __ cbz(idx, Exit);
    __ cmp(idx, (u1)1);
    __ br(Assembler::EQ, ShiftOne);

    // Calculate the load addresses
    __ sub(idx, idx, 2);
    __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
    __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
    __ add(oldArrCur,  oldArrNext, 4);

    // Load 2 words and process
    __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
    __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
    __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
    __ orr(newElem,   __ T8B, oldElem0, oldElem1);
    __ st1(newElem,   __ T2S, Address(newArrCur));
    __ b(ShiftTwoLoop);

    // Scalar tail: bits 1 and 0 of idx select whether 3, 2 or 1 words remain.
    __ BIND(ShiftThree);
    __ tbz(idx, 1, ShiftOne);
    __ tbz(idx, 0, ShiftTwo);
    __ ldrw(r10,  Address(oldArr, 12));
    __ ldrw(r11,  Address(oldArr, 8));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  Address(newArr, 8));

    __ BIND(ShiftTwo);
    __ ldrw(r10,  Address(oldArr, 8));
    __ ldrw(r11,  Address(oldArr, 4));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  Address(newArr, 4));

    __ BIND(ShiftOne);
    __ ldrw(r10,  Address(oldArr, 4));
    __ ldrw(r11,  Address(oldArr));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  Address(newArr));

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }
 7988 
  // Arguments:
  //
  // Input:
  //   c_rarg0   - newArr address
  //   c_rarg1   - oldArr address
  //   c_rarg2   - newIdx
  //   c_rarg3   - shiftCount
  //   c_rarg4   - numIter
  //
  // Shifts an array of 32-bit words left by shiftCount bits, producing
  // numIter result words starting at newArr[newIdx]:
  //   newArr[newIdx + i] = (oldArr[i]     <<  shiftCount)
  //                      | (oldArr[i + 1] >>> (32 - shiftCount))
  // The array is walked front-to-back with post-incremented pointers:
  // 4 words per SIMD iteration, then 2 at a time, then a scalar tail.
  address generate_bigIntegerLeftShift() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

    Register newArr        = c_rarg0;
    Register oldArr        = c_rarg1;
    Register newIdx        = c_rarg2;
    Register shiftCount    = c_rarg3;
    Register numIter       = c_rarg4;

    Register shiftRevCount = rscratch1;
    Register oldArrNext    = rscratch2;

    FloatRegister oldElem0        = v0;
    FloatRegister oldElem1        = v1;
    FloatRegister newElem         = v2;
    FloatRegister shiftVCount     = v3;
    FloatRegister shiftVRevCount  = v4;

    // Nothing to do for a zero iteration count.
    __ cbz(numIter, Exit);

    // oldArrNext trails oldArr by one 32-bit element; newArr += newIdx * 4.
    __ add(oldArrNext, oldArr, 4);
    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);

    // right shift count
    __ movw(shiftRevCount, 32);
    __ subw(shiftRevCount, shiftRevCount, shiftCount);

    // numIter too small to allow a 4-words SIMD loop, rolling back
    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftThree);

    // ushl shifts right for negative counts, so the right-shift amount
    // is negated.
    __ dup(shiftVCount,     __ T4S, shiftCount);
    __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
    __ negr(shiftVRevCount, __ T4S, shiftVRevCount);

    __ BIND(ShiftSIMDLoop);

    // load 4 words and process
    __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
    __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
    __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);     // << shiftCount
    __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);  // >>> (32 - shiftCount)
    __ orr(newElem,   __ T16B, oldElem0, oldElem1);
    __ st1(newElem,   __ T4S,  __ post(newArr, 16));
    __ sub(numIter,   numIter, 4);

    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftTwoLoop);
    __ b(ShiftSIMDLoop);

    __ BIND(ShiftTwoLoop);
    __ cbz(numIter, Exit);
    __ cmp(numIter, (u1)1);
    __ br(Assembler::EQ, ShiftOne);

    // load 2 words and process
    __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
    __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
    __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
    __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
    __ st1(newElem,   __ T2S,  __ post(newArr, 8));
    __ sub(numIter,   numIter, 2);
    __ b(ShiftTwoLoop);

    // Scalar tail (1-3 words): process one word, then dispatch on the low
    // bits of numIter for the remaining 0-2 words.
    __ BIND(ShiftThree);
    __ ldrw(r10,  __ post(oldArr, 4));
    __ ldrw(r11,  __ post(oldArrNext, 4));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  __ post(newArr, 4));
    __ tbz(numIter, 1, Exit);
    __ tbz(numIter, 0, ShiftOne);

    __ BIND(ShiftTwo);
    __ ldrw(r10,  __ post(oldArr, 4));
    __ ldrw(r11,  __ post(oldArrNext, 4));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  __ post(newArr, 4));

    __ BIND(ShiftOne);
    __ ldrw(r10,  Address(oldArr));
    __ ldrw(r11,  Address(oldArrNext));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  Address(newArr));

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }
 8099 
  // Counts "positive" (high-bit-clear) bytes at the start of the byte array
  // at ary1 (r1) with length len (r2).
  //
  // Precondition: result (r0) already holds a copy of len (see comment below).
  // Returns in r0 a count of leading bytes guaranteed to have the upper bit
  // clear: len itself when no negative byte was seen, otherwise a
  // conservative lower bound for the position of the first negative byte
  // (the RET_ADJUST* exits compute result = original len - remaining len).
  //
  // A second entry point, returned through count_positives_long, skips the
  // short-input (len <= 15) handling.
  address generate_count_positives(address &count_positives_long) {
    const u1 large_loop_size = 64;
    const uint64_t UPPER_BIT_MASK=0x8080808080808080;
    int dcache_line = VM_Version::dcache_line_size();

    Register ary1 = r1, len = r2, result = r0;

    __ align(CodeEntryAlignment);

    StubId stub_id = StubId::stubgen_count_positives_id;
    StubCodeMark mark(this, stub_id);

    address entry = __ pc();

    __ enter();
    // precondition: a copy of len is already in result
    // __ mov(result, len);

  Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
        LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;

  // Short input (len <= 15): handled without spilling any registers.
  __ cmp(len, (u1)15);
  __ br(Assembler::GT, LEN_OVER_15);
  // The only case when execution falls into this code is when pointer is near
  // the end of memory page and we have to avoid reading next page
  __ add(ary1, ary1, len);
  __ subs(len, len, 8);
  __ br(Assembler::GT, LEN_OVER_8);
  // len <= 8: one backwards 8-byte load, then shift out the bytes that are
  // not part of the input before testing the sign bits.
  __ ldr(rscratch2, Address(ary1, -8));
  __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
  __ lsrv(rscratch2, rscratch2, rscratch1);
  __ tst(rscratch2, UPPER_BIT_MASK);
  // Any negative byte found => return 0 (result still holds len otherwise).
  __ csel(result, zr, result, Assembler::NE);
  __ leave();
  __ ret(lr);
  __ bind(LEN_OVER_8);
  // 9..15 bytes: one backwards 16-byte load; the second word is fully valid,
  // the first word is masked by shifting, as above.
  __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  __ sub(len, len, 8); // no data dep., then sub can be executed while loading
  __ tst(rscratch2, UPPER_BIT_MASK);
  __ br(Assembler::NE, RET_NO_POP);
  __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
  __ lsrv(rscratch1, rscratch1, rscratch2);
  __ tst(rscratch1, UPPER_BIT_MASK);
  __ bind(RET_NO_POP);
  __ csel(result, zr, result, Assembler::NE);
  __ leave();
  __ ret(lr);

  Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
  const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

  count_positives_long = __ pc(); // 2nd entry point

  __ enter();

  __ bind(LEN_OVER_15);
    __ push(spilled_regs, sp);
    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
    __ cbz(rscratch2, ALIGNED);
    // Not 16-byte aligned: check the first 16 bytes, then advance ary1 to
    // the next 16-byte boundary and reduce len by the bytes consumed.
    __ ldp(tmp6, tmp1, Address(ary1));
    __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
    __ add(ary1, ary1, rscratch1);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, rscratch1);

  __ bind(ALIGNED);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);
    // Perform 16-byte load as early return in pre-loop to handle situation
    // when initially aligned large array has negative values at starting bytes,
    // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
    // slower. Cases with negative bytes further ahead won't be affected that
    // much. In fact, it'll be faster due to early loads, less instructions and
    // less branches in LARGE_LOOP.
    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);

    if (SoftwarePrefetchHintDistance >= 0
        && SoftwarePrefetchHintDistance >= dcache_line) {
      // initial prefetch
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
    }
  __ bind(LARGE_LOOP);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
    }
    // Issue load instructions first, since it can save few CPU/MEM cycles, also
    // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
    // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
    // instructions per cycle and have less branches, but this approach disables
    // early return, thus, all 64 bytes are loaded and checked every time.
    __ ldp(tmp2, tmp3, Address(ary1));
    __ ldp(tmp4, tmp5, Address(ary1, 16));
    __ ldp(rscratch1, rscratch2, Address(ary1, 32));
    __ ldp(tmp6, tmp1, Address(ary1, 48));
    __ add(ary1, ary1, large_loop_size);
    __ sub(len, len, large_loop_size);
    __ orr(tmp2, tmp2, tmp3);
    __ orr(tmp4, tmp4, tmp5);
    __ orr(rscratch1, rscratch1, rscratch2);
    __ orr(tmp6, tmp6, tmp1);
    __ orr(tmp2, tmp2, tmp4);
    __ orr(rscratch1, rscratch1, tmp6);
    __ orr(tmp2, tmp2, rscratch1);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_LONG);
    __ cmp(len, large_loop_size);
    __ br(Assembler::GE, LARGE_LOOP);

  __ bind(CHECK_16); // small 16-byte load pre-loop
    __ cmp(len, (u1)16);
    __ br(Assembler::LT, POST_LOOP16);

  __ bind(LOOP16); // small 16-byte load loop
    __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp2, tmp2, tmp3);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, (u1)16);
    __ br(Assembler::GE, LOOP16); // 16-byte load loop end

  __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
    __ cmp(len, (u1)8);
    __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
    __ ldr(tmp3, Address(__ post(ary1, 8)));
    __ tst(tmp3, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, 8);

  __ bind(POST_LOOP16_LOAD_TAIL);
    __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
    __ ldr(tmp1, Address(ary1));
    __ mov(tmp2, 64);
    // Shift out the bytes past the end of the input before the sign test.
    __ sub(tmp4, tmp2, len, __ LSL, 3);
    __ lslv(tmp1, tmp1, tmp4);
    __ tst(tmp1, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    // Fallthrough

  __ bind(RET_LEN);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);

    // difference result - len is the count of guaranteed to be
    // positive bytes

  __ bind(RET_ADJUST_LONG);
    __ add(len, len, (u1)(large_loop_size - 16));
  __ bind(RET_ADJUST_16);
    __ add(len, len, 16);
  __ bind(RET_ADJUST);
    __ pop(spilled_regs, sp);
    __ leave();
    __ sub(result, result, len);
    __ ret(lr);

    return entry;
  }
 8268 
  // Emits the comparison loop used by generate_large_array_equals when SIMD
  // is not used. Each iteration compares 8 words (64 bytes) from a1 and a2
  // via ldp pairs, software-pipelined so that loads for the next group are
  // issued before the compares of the previous one; branches to NOT_EQUAL on
  // the first difference. Loops while cnt1 >= loopThreshold; on fall-through
  // the last pre-loaded pair is checked and cnt1 is left reduced by the
  // bytes consumed. Register assignments match the caller
  // (a1 = r1, a2 = r2, cnt1 = r10).
  void generate_large_array_equals_loop_nonsimd(int loopThreshold,
        bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label LOOP;

    // Pre-load the first word pair so compares can overlap with loads.
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp1, tmp1, tmp3);
    __ cbnz(tmp1, NOT_EQUAL);
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp5, tmp5, tmp6);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp1, tmp1, tmp3);
    __ cbnz(tmp1, NOT_EQUAL);
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp5, tmp5, tmp6);
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operand.
    __ subs(tmp6, cnt1, loopThreshold);
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ br(__ GE, LOOP);
    // post-loop
    // Check the final pair pre-loaded inside the loop body.
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ orr(tmp1, tmp1, tmp3);
    __ sub(cnt1, cnt1, 2 * wordSize);
    __ cbnz(tmp1, NOT_EQUAL);
  }
 8319 
  // Emits the SIMD comparison loop used by generate_large_array_equals.
  // Each iteration loads 64 bytes from each array (ld1 of four 2D vectors),
  // XORs the corresponding vectors, OR-reduces all differences and branches
  // to NOT_EQUAL if any bit is set. Loops while cnt1 >= loopThreshold.
  // Register assignments match the caller (a1 = r1, a2 = r2, cnt1 = r10);
  // clobbers v0-v7, rscratch1, rscratch2.
  void generate_large_array_equals_loop_simd(int loopThreshold,
        bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2;
    Label LOOP;

    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
    // Set flags for the loop-back branch early; the vector ops below do not
    // touch the condition flags.
    __ subs(tmp1, cnt1, loopThreshold);
    __ eor(v0, __ T16B, v0, v4);
    __ eor(v1, __ T16B, v1, v5);
    __ eor(v2, __ T16B, v2, v6);
    __ eor(v3, __ T16B, v3, v7);
    // OR-reduce the four difference vectors into v0.
    __ orr(v0, __ T16B, v0, v1);
    __ orr(v1, __ T16B, v2, v3);
    __ orr(v0, __ T16B, v0, v1);
    // Move both 64-bit halves to GPRs and test for any set bit.
    __ umov(tmp1, v0, __ D, 0);
    __ umov(tmp2, v0, __ D, 1);
    __ orr(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ br(__ GE, LOOP);
  }
 8348 
  // Generates the large-array equality stub. Callers have already loaded and
  // compared the first 8 bytes of each array before entering the stub.
  //
  // a1 = r1 - array1 address
  // a2 = r2 - array2 address
  // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - amount of elements left to check, reduced by wordSize
  // r3-r5 are reserved temporary registers
  // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
  address generate_large_array_equals() {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
        SMALL_LOOP, POST_LOOP;
    // The non-SIMD loop uses a software pre-loop; account for its size when
    // deciding whether a prefetching large loop is worthwhile.
    const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // calculate if at least 32 prefetched bytes are used
    int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
    int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
    // tmp6..tmp8 are callee-saved and only needed by the non-SIMD path; they
    // are spilled/restored around it.
    RegSet spilled_regs = RegSet::range(tmp6, tmp8);
    assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
        tmp5, tmp6, tmp7, tmp8);

    __ align(CodeEntryAlignment);

    StubId stub_id = StubId::stubgen_large_array_equals_id;
    StubCodeMark mark(this, stub_id);

    address entry = __ pc();
    __ enter();
    __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
    // also advance pointers to use post-increment instead of pre-increment
    __ add(a1, a1, wordSize);
    __ add(a2, a2, wordSize);
    if (AvoidUnalignedAccesses) {
      // both implementations (SIMD/nonSIMD) are using relatively large load
      // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
      // on some CPUs in case of address is not at least 16-byte aligned.
      // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
      // load if needed at least for 1st address and make if 16-byte aligned.
      Label ALIGNED16;
      __ tbz(a1, 3, ALIGNED16);  // bit 3 clear => a1 already 16-byte aligned
      __ ldr(tmp1, Address(__ post(a1, wordSize)));
      __ ldr(tmp2, Address(__ post(a2, wordSize)));
      __ sub(cnt1, cnt1, wordSize);
      __ eor(tmp1, tmp1, tmp2);
      // Nothing was pushed yet, so exit via the no-pop path on mismatch.
      __ cbnz(tmp1, NOT_EQUAL_NO_POP);
      __ bind(ALIGNED16);
    }
    if (UseSIMDForArrayEquals) {
      if (SoftwarePrefetchHintDistance >= 0) {
        // Use the prefetching loop only while enough bytes remain for the
        // prefetch hints to pay off; otherwise fall into the plain loop.
        __ subs(tmp1, cnt1, prefetchLoopThreshold);
        __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
        generate_large_array_equals_loop_simd(prefetchLoopThreshold,
            /* prfm = */ true, NOT_EQUAL);
        __ subs(zr, cnt1, nonPrefetchLoopThreshold);
        __ br(__ LT, TAIL);
      }
      __ bind(NO_PREFETCH_LARGE_LOOP);
      generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
          /* prfm = */ false, NOT_EQUAL);
    } else {
      // Non-SIMD path needs extra scalar temporaries: spill tmp6..tmp8.
      __ push(spilled_regs, sp);
      if (SoftwarePrefetchHintDistance >= 0) {
        __ subs(tmp1, cnt1, prefetchLoopThreshold);
        __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
        generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
            /* prfm = */ true, NOT_EQUAL);
        __ subs(zr, cnt1, nonPrefetchLoopThreshold);
        __ br(__ LT, TAIL);
      }
      __ bind(NO_PREFETCH_LARGE_LOOP);
      generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
          /* prfm = */ false, NOT_EQUAL);
    }
    // Tail: fewer than a large-loop iteration's worth of bytes remain.
    __ bind(TAIL);
      __ cbz(cnt1, EQUAL);
      __ subs(cnt1, cnt1, wordSize);
      __ br(__ LE, POST_LOOP);
    // Compare one 8-byte word per iteration until <= wordSize bytes remain.
    __ bind(SMALL_LOOP);
      __ ldr(tmp1, Address(__ post(a1, wordSize)));
      __ ldr(tmp2, Address(__ post(a2, wordSize)));
      __ subs(cnt1, cnt1, wordSize);
      __ eor(tmp1, tmp1, tmp2);
      __ cbnz(tmp1, NOT_EQUAL);
      __ br(__ GT, SMALL_LOOP);
    // Final (possibly overlapping) 8-byte load at offset cnt1 (<= 0) covers
    // the last bytes without reading past the array end.
    __ bind(POST_LOOP);
      __ ldr(tmp1, Address(a1, cnt1));
      __ ldr(tmp2, Address(a2, cnt1));
      __ eor(tmp1, tmp1, tmp2);
      __ cbnz(tmp1, NOT_EQUAL);
    __ bind(EQUAL);
      __ mov(result, true);
    __ bind(NOT_EQUAL);
      // Only the non-SIMD path pushed temporaries above.
      if (!UseSIMDForArrayEquals) {
        __ pop(spilled_regs, sp);
      }
    __ bind(NOT_EQUAL_NO_POP);
    __ leave();
    __ ret(lr);
    return entry;
  }
 8448 
  // Generates a vectorized stub computing the Java-style polynomial hash
  // h = h0 * 31^n + sum(c_i * 31^(n-1-i)) over an array of elements of the
  // given basic type, using 4 parallel SIMD accumulators that are combined
  // with the appropriate powers of 31 at the end.
  //
  // result = r0 - return value. Contains initial hashcode value on entry.
  // ary = r1 - array address
  // cnt = r2 - elements count
  // Clobbers: v0-v13, rscratch1, rscratch2
  address generate_large_arrays_hashcode(BasicType eltype) {
    const Register result = r0, ary = r1, cnt = r2;
    const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
    const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
    const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
    const FloatRegister vpowm = v13; // per-iteration multiplier(s) 31^k

    ARRAYS_HASHCODE_REGISTERS;

    Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;

    unsigned int vf; // vectorization factor
    bool multiply_by_halves;
    Assembler::SIMD_Arrangement load_arrangement;
    switch (eltype) {
    case T_BOOLEAN:
    case T_BYTE:
      load_arrangement = Assembler::T8B;
      multiply_by_halves = true;   // 8 subword lanes processed as two 4-lane halves
      vf = 8;
      break;
    case T_CHAR:
    case T_SHORT:
      load_arrangement = Assembler::T8H;
      multiply_by_halves = true;
      vf = 8;
      break;
    case T_INT:
      load_arrangement = Assembler::T4S;
      multiply_by_halves = false;  // 4 int lanes fit a 4S accumulator directly
      vf = 4;
      break;
    default:
      ShouldNotReachHere();
    }

    // Unroll factor
    const unsigned uf = 4;

    // Effective vectorization factor
    const unsigned evf = vf * uf;

    __ align(CodeEntryAlignment);

    StubId stub_id;
    switch (eltype) {
    case T_BOOLEAN:
      stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
      break;
    case T_BYTE:
      stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
      break;
    case T_CHAR:
      stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
      break;
    case T_SHORT:
      stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
      break;
    case T_INT:
      stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
      break;
    default:
      stub_id = StubId::NO_STUBID;
      ShouldNotReachHere();
    };

    StubCodeMark mark(this, stub_id);

    address entry = __ pc();
    __ enter();

    // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in
    // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
    // value shouldn't change throughout both loops.
    __ movw(rscratch1, intpow(31U, 3));
    __ mov(vpow, Assembler::S, 0, rscratch1);
    __ movw(rscratch1, intpow(31U, 2));
    __ mov(vpow, Assembler::S, 1, rscratch1);
    __ movw(rscratch1, intpow(31U, 1));
    __ mov(vpow, Assembler::S, 2, rscratch1);
    __ movw(rscratch1, intpow(31U, 0));
    __ mov(vpow, Assembler::S, 3, rscratch1);

    // Seed lane 3 of the primary accumulator with the incoming hashcode.
    __ mov(vmul0, Assembler::T16B, 0);
    __ mov(vmul0, Assembler::S, 3, result);

    // rscratch2 = number of elements handled by the SMALL loop, i.e.
    // cnt mod evf rounded down to a multiple of vf.
    __ andr(rscratch2, cnt, (uf - 1) * vf);
    __ cbz(rscratch2, LARGE_LOOP_PREHEADER);

    // Per-iteration multiplier for the SMALL loop: 31^(lanes consumed per
    // multiply step) — half a vector for subword types, a full one for int.
    __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
    __ mov(vpowm, Assembler::S, 0, rscratch1);

    // SMALL LOOP
    __ bind(SMALL_LOOP);

    __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
    __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
    __ subsw(rscratch2, rscratch2, vf);

    if (load_arrangement == Assembler::T8B) {
      // Extend 8B to 8H to be able to use vector multiply
      // instructions
      assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
      if (is_signed_subword_type(eltype)) {
        __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      } else {
        __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      }
    }

    switch (load_arrangement) {
    case Assembler::T4S:
      __ addv(vmul0, load_arrangement, vmul0, vdata0);
      break;
    case Assembler::T8B:
    case Assembler::T8H:
      assert(is_subword_type(eltype), "subword type expected");
      // Widen-and-add the lower 4 halfword lanes into the 4S accumulator.
      if (is_signed_subword_type(eltype)) {
        __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      } else {
        __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      }
      break;
    default:
      __ should_not_reach_here();
    }

    // Process the upper half of a vector
    if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
      __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
      if (is_signed_subword_type(eltype)) {
        __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      } else {
        __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      }
    }

    __ br(Assembler::HI, SMALL_LOOP);

    // SMALL LOOP'S EPILOGUE
    // If cnt >= evf the LARGE loop still has work to do; otherwise fold the
    // 4 accumulator lanes with <31^3..31^0> and reduce to a scalar result.
    __ lsr(rscratch2, cnt, exact_log2(evf));
    __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);

    __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
    __ addv(vmul0, Assembler::T4S, vmul0);
    __ umov(result, vmul0, Assembler::S, 0);

    // TAIL
    __ bind(TAIL);

    // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
    // of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
    assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
    __ andr(rscratch2, cnt, vf - 1);
    __ bind(TAIL_SHORTCUT);
    __ adr(rscratch1, BR_BASE);
    // For Cortex-A53 offset is 4 because 2 nops are generated.
    __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
    __ movw(rscratch2, 0x1f);  // 0x1f == 31, the hash multiplier
    __ br(rscratch1);

    // Unrolled scalar tail: each iteration is result = result * 31 + element.
    // Entered mid-sequence via the computed branch above.
    for (size_t i = 0; i < vf - 1; ++i) {
      __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
                                   eltype);
      __ maddw(result, result, rscratch2, rscratch1);
      // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
      // Generate 2nd nop to have 4 instructions per iteration.
      if (VM_Version::supports_a53mac()) {
        __ nop();
      }
    }
    __ bind(BR_BASE);

    __ leave();
    __ ret(lr);

    // LARGE LOOP
    __ bind(LARGE_LOOP_PREHEADER);

    // rscratch2 = number of full evf-element iterations.
    __ lsr(rscratch2, cnt, exact_log2(evf));

    if (multiply_by_halves) {
      // 31^4 - multiplier between lower and upper parts of a register
      __ movw(rscratch1, intpow(31U, vf / 2));
      __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
      __ movw(rscratch1, intpow(31U, evf - vf / 2));
      __ mov(vpowm, Assembler::S, 0, rscratch1);
    } else {
      // 31^16
      __ movw(rscratch1, intpow(31U, evf));
      __ mov(vpowm, Assembler::S, 0, rscratch1);
    }

    // Secondary accumulators start at zero; vmul0 carries the SMALL-loop state.
    __ mov(vmul3, Assembler::T16B, 0);
    __ mov(vmul2, Assembler::T16B, 0);
    __ mov(vmul1, Assembler::T16B, 0);

    __ bind(LARGE_LOOP);

    // Scale each accumulator by 31^(elements consumed this iteration).
    __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
    __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
    __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
    __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);

    __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
           Address(__ post(ary, evf * type2aelembytes(eltype))));

    if (load_arrangement == Assembler::T8B) {
      // Extend 8B to 8H to be able to use vector multiply
      // instructions
      assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
      if (is_signed_subword_type(eltype)) {
        __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
        __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
        __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
        __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      } else {
        __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
        __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
        __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
        __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      }
    }

    switch (load_arrangement) {
    case Assembler::T4S:
      __ addv(vmul3, load_arrangement, vmul3, vdata3);
      __ addv(vmul2, load_arrangement, vmul2, vdata2);
      __ addv(vmul1, load_arrangement, vmul1, vdata1);
      __ addv(vmul0, load_arrangement, vmul0, vdata0);
      break;
    case Assembler::T8B:
    case Assembler::T8H:
      assert(is_subword_type(eltype), "subword type expected");
      // Widen-and-add the lower 4 halfword lanes of each data register.
      if (is_signed_subword_type(eltype)) {
        __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
        __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
        __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
        __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      } else {
        __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
        __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
        __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
        __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      }
      break;
    default:
      __ should_not_reach_here();
    }

    // Process the upper half of a vector
    if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
      // Scale by 31^(vf/2) (vpowm lane 1) before adding the upper lanes.
      __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
      __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
      __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
      __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
      if (is_signed_subword_type(eltype)) {
        __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
        __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
        __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
        __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      } else {
        __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
        __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
        __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
        __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      }
    }

    __ subsw(rscratch2, rscratch2, 1);
    __ br(Assembler::HI, LARGE_LOOP);

    // LARGE LOOP'S EPILOGUE: reduce each accumulator with <31^3..31^0>, then
    // chain the four scalar partial hashes with 31^vf between them.
    __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
    __ addv(vmul3, Assembler::T4S, vmul3);
    __ umov(result, vmul3, Assembler::S, 0);

    __ mov(rscratch2, intpow(31U, vf));

    __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
    __ addv(vmul2, Assembler::T4S, vmul2);
    __ umov(rscratch1, vmul2, Assembler::S, 0);
    __ maddw(result, result, rscratch2, rscratch1);

    __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
    __ addv(vmul1, Assembler::T4S, vmul1);
    __ umov(rscratch1, vmul1, Assembler::S, 0);
    __ maddw(result, result, rscratch2, rscratch1);

    __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
    __ addv(vmul0, Assembler::T4S, vmul0);
    __ umov(rscratch1, vmul0, Assembler::S, 0);
    __ maddw(result, result, rscratch2, rscratch1);

    // Handle the remaining cnt % vf elements via the scalar tail.
    __ andr(rscratch2, cnt, vf - 1);
    __ cbnz(rscratch2, TAIL_SHORTCUT);

    __ leave();
    __ ret(lr);

    return entry;
  }
 8755 
 8756   address generate_dsin_dcos(bool isCos) {
 8757     __ align(CodeEntryAlignment);
 8758     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 8759     StubCodeMark mark(this, stub_id);
 8760     address start = __ pc();
 8761     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8762         (address)StubRoutines::aarch64::_two_over_pi,
 8763         (address)StubRoutines::aarch64::_pio2,
 8764         (address)StubRoutines::aarch64::_dsin_coef,
 8765         (address)StubRoutines::aarch64::_dcos_coef);
 8766     return start;
 8767   }
 8768 
  // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
  // Loads 16 Latin1 bytes from tmp2 and 4x8 bytes of UTF-16 from cnt1 (both
  // post-incremented), widens the Latin1 bytes to UTF-16 by zipping with a
  // zero register, and compares 8 bytes (4 chars) at a time. Branches to
  // DIFF1/DIFF2 on mismatch, leaving the differing words in tmpU/tmpL (DIFF1)
  // or tmp3/tmpL (DIFF2). On fall-through, tmp3 holds the next pre-loaded
  // 8 bytes of the UTF-16 string.
  void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
      Label &DIFF2) {
    Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
    FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;

    __ ldrq(vtmp, Address(__ post(tmp2, 16)));   // 16 Latin1 chars
    __ ldr(tmpU, Address(__ post(cnt1, 8)));     // next 4 UTF-16 chars
    __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);        // widen lower 8 L chars to U
    // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3

    // chars 0..3: compare against the UTF-16 word pre-loaded in tmp3
    __ fmovd(tmpL, vtmp3);
    __ eor(rscratch2, tmp3, tmpL);
    __ cbnz(rscratch2, DIFF2);

    // chars 4..7: pre-load the following UTF-16 word into tmp3
    __ ldr(tmp3, Address(__ post(cnt1, 8)));
    __ umov(tmpL, vtmp3, __ D, 1);
    __ eor(rscratch2, tmpU, tmpL);
    __ cbnz(rscratch2, DIFF1);

    // chars 8..11: widen upper 8 Latin1 chars and compare against tmp3
    __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
    __ ldr(tmpU, Address(__ post(cnt1, 8)));
    __ fmovd(tmpL, vtmp);
    __ eor(rscratch2, tmp3, tmpL);
    __ cbnz(rscratch2, DIFF2);

    // chars 12..15: leave the next UTF-16 word pre-loaded in tmp3
    __ ldr(tmp3, Address(__ post(cnt1, 8)));
    __ umov(tmpL, vtmp, __ D, 1);
    __ eor(rscratch2, tmpU, tmpL);
    __ cbnz(rscratch2, DIFF1);
  }
 8800 
  // Generates the long-string comparison stub for mixed encodings:
  // isLU means str1 is Latin1 and str2 is UTF-16, !isLU the reverse.
  // Callers have already loaded and compared the first 4 characters.
  //
  // r0  = result
  // r1  = str1
  // r2  = cnt1
  // r3  = str2
  // r4  = cnt2
  // r10 = tmp1
  // r11 = tmp2
  address generate_compare_long_string_different_encoding(bool isLU) {
    __ align(CodeEntryAlignment);
    StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
        DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
        LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
    FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
    RegSet spilled_regs = RegSet::of(tmp3, tmp4);

    // Leave the prefetching loop while fewer characters remain than this.
    int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);

    __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);  // zero register for zip-widening
    // cnt2 == amount of characters left to compare
    // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
    __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
    __ add(str1, str1, isLU ? wordSize/2 : wordSize);
    __ add(str2, str2, isLU ? wordSize : wordSize/2);
    __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
    __ eor(rscratch2, tmp1, tmp2);
    __ mov(rscratch1, tmp2);
    __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
    Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
             tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
    __ push(spilled_regs, sp);
    __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
    __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load

    // Pre-load the first UTF-16 word; compare_string_16_x_LU expects it in tmp3.
    __ ldr(tmp3, Address(__ post(cnt1, 8)));

    if (SoftwarePrefetchHintDistance >= 0) {
      __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
      __ br(__ LT, NO_PREFETCH);
      // Large loop: 64 characters per iteration (4 x 16), prefetching once
      // per 32 characters on each string.
      __ bind(LARGE_LOOP_PREFETCH);
        __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
        __ mov(tmp4, 2);
        __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
        __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
          compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
          __ subs(tmp4, tmp4, 1);
          __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
          __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
          __ mov(tmp4, 2);
        __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
          compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
          __ subs(tmp4, tmp4, 1);
          __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
          __ sub(cnt2, cnt2, 64);
          __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
          __ br(__ GE, LARGE_LOOP_PREFETCH);
    }
    __ cbz(cnt2, LOAD_LAST); // no characters left except last load
    __ bind(NO_PREFETCH);
    __ subs(cnt2, cnt2, 16);
    __ br(__ LT, TAIL);
    __ align(OptoLoopAlignment);
    __ bind(SMALL_LOOP); // smaller loop
      __ subs(cnt2, cnt2, 16);
      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
      __ br(__ GE, SMALL_LOOP);
      __ cmn(cnt2, (u1)16);
      __ br(__ EQ, LOAD_LAST);
    __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
      __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
      __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
      __ ldr(tmp3, Address(cnt1, -8));
      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
      __ b(LOAD_LAST);
    __ bind(DIFF2);
      __ mov(tmpU, tmp3);
    __ bind(DIFF1);
      __ pop(spilled_regs, sp);
      __ b(CALCULATE_DIFFERENCE);
    __ bind(LOAD_LAST);
      // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
      // No need to load it again
      __ mov(tmpU, tmp3);
      __ pop(spilled_regs, sp);

      // tmp2 points to the address of the last 4 Latin1 characters right now
      __ ldrs(vtmp, Address(tmp2));
      __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
      __ fmovd(tmpL, vtmp);

      __ eor(rscratch2, tmpU, tmpL);
      __ cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    __ bind(CALCULATE_DIFFERENCE);
      __ rev(rscratch2, rscratch2);
      __ clz(rscratch2, rscratch2);
      __ andr(rscratch2, rscratch2, -16);  // round down to a 16-bit char boundary
      __ lsrv(tmp1, tmp1, rscratch2);
      __ uxthw(tmp1, tmp1);
      __ lsrv(rscratch1, rscratch1, rscratch2);
      __ uxthw(rscratch1, rscratch1);
      __ subw(result, tmp1, rscratch1);
    __ bind(DONE);
      __ ret(lr);
    return entry;
  }
 8914 
 8915   // r0 = input (float16)
 8916   // v0 = result (float)
 8917   // v1 = temporary float register
 8918   address generate_float16ToFloat() {
 8919     __ align(CodeEntryAlignment);
 8920     StubId stub_id = StubId::stubgen_hf2f_id;
 8921     StubCodeMark mark(this, stub_id);
 8922     address entry = __ pc();
 8923     BLOCK_COMMENT("Entry:");
 8924     __ flt16_to_flt(v0, r0, v1);
 8925     __ ret(lr);
 8926     return entry;
 8927   }
 8928 
 8929   // v0 = input (float)
 8930   // r0 = result (float16)
 8931   // v1 = temporary float register
 8932   address generate_floatToFloat16() {
 8933     __ align(CodeEntryAlignment);
 8934     StubId stub_id = StubId::stubgen_f2hf_id;
 8935     StubCodeMark mark(this, stub_id);
 8936     address entry = __ pc();
 8937     BLOCK_COMMENT("Entry:");
 8938     __ flt_to_flt16(r0, v0, v1);
 8939     __ ret(lr);
 8940     return entry;
 8941   }
 8942 
  // Generates the nmethod entry barrier stub. It calls into the runtime
  // (BarrierSetNMethod::nmethod_stub_entry_barrier); a zero return means
  // continue into the method, a non-zero return redirects execution to the
  // {sp, fp, lr, pc} frame the runtime stored below the current frame
  // (deoptimization path).
  address generate_method_entry_barrier() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_method_entry_barrier_id;
    StubCodeMark mark(this, stub_id);

    Label deoptimize_label;

    address start = __ pc();

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();

    if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
      BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
      // We can get here despite the nmethod being good, if we have not
      // yet applied our cross modification fence (or data fence).
      Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
      // Record the global patching epoch in the thread, then fence so that
      // subsequently executed instructions/data are observed post-patch.
      __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
      __ ldrw(rscratch2, rscratch2);
      __ strw(rscratch2, thread_epoch_addr);
      __ isb();
      __ membar(__ LoadLoad);
    }

    __ set_last_Java_frame(sp, rfp, lr, rscratch1);

    __ enter();
    __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr

    __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}

    __ push_call_clobbered_registers();

    // Pass the address of the saved lr so the runtime can identify the
    // nmethod being entered.
    __ mov(c_rarg0, rscratch2);
    __ call_VM_leaf
         (CAST_FROM_FN_PTR
          (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);

    __ reset_last_Java_frame(true);

    // Preserve the runtime's verdict across the register restore.
    __ mov(rscratch1, r0);

    __ pop_call_clobbered_registers();

    __ cbnz(rscratch1, deoptimize_label);

    __ leave();
    __ ret(lr);

    __ BIND(deoptimize_label);

    // Load the continuation frame {sp, fp, lr, pc} prepared by the runtime
    // in the four words reserved above, then jump to the new pc.
    __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
    __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));

    __ mov(sp, rscratch1);
    __ br(rscratch2);

    return start;
  }
 9001 
  // Generates the long-string comparison stub for same-encoding strings:
  // both Latin1 (isLL) or both UTF-16 (!isLL). Callers have already loaded
  // and compared the first 8 bytes of each string (in tmp1/tmp2).
  //
  // r0  = result
  // r1  = str1
  // r2  = cnt1
  // r3  = str2
  // r4  = cnt2
  // r10 = tmp1
  // r11 = tmp2
  address generate_compare_long_string_same_encoding(bool isLL) {
    __ align(CodeEntryAlignment);
    StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;

    Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;

    // exit from large loop when less than 64 bytes left to read or we're about
    // to prefetch memory behind array border
    int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);

    // before jumping to stub, pre-load 8 bytes already, so do comparison directly
    __ eor(rscratch2, tmp1, tmp2);
    __ cbnz(rscratch2, CAL_DIFFERENCE);

    // cnt2 counts characters: 8 bytes == 8 Latin1 or 4 UTF-16 characters.
    __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
    // update pointers, because of previous read
    __ add(str1, str1, wordSize);
    __ add(str2, str2, wordSize);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ align(OptoLoopAlignment);
      // Large loop: 64 bytes per iteration with software prefetch.
      __ bind(LARGE_LOOP_PREFETCH);
        __ prfm(Address(str1, SoftwarePrefetchHintDistance));
        __ prfm(Address(str2, SoftwarePrefetchHintDistance));

        for (int i = 0; i < 4; i++) {
          __ ldp(tmp1, tmp1h, Address(str1, i * 16));
          __ ldp(tmp2, tmp2h, Address(str2, i * 16));
          __ cmp(tmp1, tmp2);
          __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
          __ br(Assembler::NE, DIFF);
        }
        __ sub(cnt2, cnt2, isLL ? 64 : 32);
        __ add(str1, str1, 64);
        __ add(str2, str2, 64);
        __ subs(rscratch2, cnt2, largeLoopExitCondition);
        __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
        __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
    }

    __ subs(rscratch1, cnt2, isLL ? 16 : 8);
    __ br(Assembler::LE, LESS16);
    __ align(OptoLoopAlignment);
    // Medium loop: 16 bytes per iteration, unrolled twice.
    __ bind(LOOP_COMPARE16);
      __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
      __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
      __ cmp(tmp1, tmp2);
      __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
      __ br(Assembler::NE, DIFF);
      __ sub(cnt2, cnt2, isLL ? 16 : 8);
      __ subs(rscratch2, cnt2, isLL ? 16 : 8);
      __ br(Assembler::LT, LESS16);

      __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
      __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
      __ cmp(tmp1, tmp2);
      __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
      __ br(Assembler::NE, DIFF);
      __ sub(cnt2, cnt2, isLL ? 16 : 8);
      __ subs(rscratch2, cnt2, isLL ? 16 : 8);
      __ br(Assembler::GE, LOOP_COMPARE16);
      __ cbz(cnt2, LENGTH_DIFF);

    __ bind(LESS16);
      // each 8 compare
      __ subs(cnt2, cnt2, isLL ? 8 : 4);
      __ br(Assembler::LE, LESS8);
      __ ldr(tmp1, Address(__ post(str1, 8)));
      __ ldr(tmp2, Address(__ post(str2, 8)));
      __ eor(rscratch2, tmp1, tmp2);
      __ cbnz(rscratch2, CAL_DIFFERENCE);
      __ sub(cnt2, cnt2, isLL ? 8 : 4);

    __ bind(LESS8); // directly load last 8 bytes
      if (!isLL) {
        __ add(cnt2, cnt2, cnt2);  // convert UTF-16 char count to byte offset
      }
      // Possibly-overlapping final load at offset cnt2 (<= 0): covers the
      // remaining bytes without reading past the string end.
      __ ldr(tmp1, Address(str1, cnt2));
      __ ldr(tmp2, Address(str2, cnt2));
      __ eor(rscratch2, tmp1, tmp2);
      __ cbz(rscratch2, LENGTH_DIFF);
      __ b(CAL_DIFFERENCE);

    __ bind(DIFF);
      // Select whichever of the two 8-byte halves differed.
      __ cmp(tmp1, tmp2);
      __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
      __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
      // reuse rscratch2 register for the result of eor instruction
      __ eor(rscratch2, tmp1, tmp2);

    // Locate the first differing character (little-endian: rev+clz finds the
    // lowest differing byte) and return the character difference.
    __ bind(CAL_DIFFERENCE);
      __ rev(rscratch2, rscratch2);
      __ clz(rscratch2, rscratch2);
      __ andr(rscratch2, rscratch2, isLL ? -8 : -16);  // round to char boundary
      __ lsrv(tmp1, tmp1, rscratch2);
      __ lsrv(tmp2, tmp2, rscratch2);
      if (isLL) {
        __ uxtbw(tmp1, tmp1);
        __ uxtbw(tmp2, tmp2);
      } else {
        __ uxthw(tmp1, tmp1);
        __ uxthw(tmp2, tmp2);
      }
      __ subw(result, tmp1, tmp2);

    __ bind(LENGTH_DIFF);
      __ ret(lr);
    return entry;
  }
 9121 
  // Encoding combinations for the string comparison stubs:
  // L = Latin1 (1 byte per char), U = UTF-16 (2 bytes per char);
  // the first letter is str1's encoding, the second is str2's.
  enum string_compare_mode {
    LL,
    LU,
    UL,
    UU,
  };
 9128 
 9129   // The following registers are declared in aarch64.ad
 9130   // r0  = result
 9131   // r1  = str1
 9132   // r2  = cnt1
 9133   // r3  = str2
 9134   // r4  = cnt2
 9135   // r10 = tmp1
 9136   // r11 = tmp2
 9137   // z0  = ztmp1
 9138   // z1  = ztmp2
 9139   // p0  = pgtmp1
 9140   // p1  = pgtmp2
 9141   address generate_compare_long_string_sve(string_compare_mode mode) {
 9142     StubId stub_id;
 9143     switch (mode) {
 9144       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9145       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9146       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9147       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9148       default: ShouldNotReachHere();
 9149     }
 9150 
 9151     __ align(CodeEntryAlignment);
 9152     address entry = __ pc();
 9153     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9154              tmp1 = r10, tmp2 = r11;
 9155 
 9156     Label LOOP, DONE, MISMATCH;
 9157     Register vec_len = tmp1;
 9158     Register idx = tmp2;
 9159     // The minimum of the string lengths has been stored in cnt2.
 9160     Register cnt = cnt2;
 9161     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9162     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9163 
 9164 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9165     switch (mode) {                                                            \
 9166       case LL:                                                                 \
 9167         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9168         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9169         break;                                                                 \
 9170       case LU:                                                                 \
 9171         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9172         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9173         break;                                                                 \
 9174       case UL:                                                                 \
 9175         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9176         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9177         break;                                                                 \
 9178       case UU:                                                                 \
 9179         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9180         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9181         break;                                                                 \
 9182       default:                                                                 \
 9183         ShouldNotReachHere();                                                  \
 9184     }
 9185 
 9186     StubCodeMark mark(this, stub_id);
 9187 
 9188     __ mov(idx, 0);
 9189     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9190 
 9191     if (mode == LL) {
 9192       __ sve_cntb(vec_len);
 9193     } else {
 9194       __ sve_cnth(vec_len);
 9195     }
 9196 
 9197     __ sub(rscratch1, cnt, vec_len);
 9198 
 9199     __ bind(LOOP);
 9200 
 9201       // main loop
 9202       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9203       __ add(idx, idx, vec_len);
 9204       // Compare strings.
 9205       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9206       __ br(__ NE, MISMATCH);
 9207       __ cmp(idx, rscratch1);
 9208       __ br(__ LT, LOOP);
 9209 
 9210     // post loop, last iteration
 9211     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9212 
 9213     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9214     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9215     __ br(__ EQ, DONE);
 9216 
 9217     __ bind(MISMATCH);
 9218 
 9219     // Crop the vector to find its location.
 9220     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9221     // Extract the first different characters of each string.
 9222     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9223     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9224 
 9225     // Compute the difference of the first different characters.
 9226     __ sub(result, rscratch1, rscratch2);
 9227 
 9228     __ bind(DONE);
 9229     __ ret(lr);
 9230 #undef LOAD_PAIR
 9231     return entry;
 9232   }
 9233 
 9234   void generate_compare_long_strings() {
 9235     if (UseSVE == 0) {
 9236       StubRoutines::aarch64::_compare_long_string_LL
 9237           = generate_compare_long_string_same_encoding(true);
 9238       StubRoutines::aarch64::_compare_long_string_UU
 9239           = generate_compare_long_string_same_encoding(false);
 9240       StubRoutines::aarch64::_compare_long_string_LU
 9241           = generate_compare_long_string_different_encoding(true);
 9242       StubRoutines::aarch64::_compare_long_string_UL
 9243           = generate_compare_long_string_different_encoding(false);
 9244     } else {
 9245       StubRoutines::aarch64::_compare_long_string_LL
 9246           = generate_compare_long_string_sve(LL);
 9247       StubRoutines::aarch64::_compare_long_string_UU
 9248           = generate_compare_long_string_sve(UU);
 9249       StubRoutines::aarch64::_compare_long_string_LU
 9250           = generate_compare_long_string_sve(LU);
 9251       StubRoutines::aarch64::_compare_long_string_UL
 9252           = generate_compare_long_string_sve(UL);
 9253     }
 9254   }
 9255 
 9256   // R0 = result
 9257   // R1 = str2
 9258   // R2 = cnt1
 9259   // R3 = str1
 9260   // R4 = cnt2
 9261   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 9262   //
  // This generic linear code uses a few additional ideas, which make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8) in order to skip initial loading (helps on systems with a
  // single load pipeline)
  // 2) we can use a "fast" algorithm for finding the first character, with
  // fewer branches (1 branch per loaded register instead of a branch per
  // symbol); this is where constants like 0x0101...01, 0x00010001...0001,
  // 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be used to search for every occurrence of the 1st character, saving a
  // few loads compared to a "simpler-but-slower" implementation
 9273   // 4) in order to avoid lots of push/pop operations, code below is heavily
 9274   // re-using/re-initializing/compressing register values, which makes code
 9275   // larger and a bit less readable, however, most of extra operations are
 9276   // issued during loads or branches, so, penalty is minimal
 9277   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 9278     StubId stub_id;
 9279     if (str1_isL) {
 9280       if (str2_isL) {
 9281         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
 9282       } else {
 9283         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
 9284       }
 9285     } else {
 9286       if (str2_isL) {
 9287         ShouldNotReachHere();
 9288       } else {
 9289         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
 9290       }
 9291     }
 9292     __ align(CodeEntryAlignment);
 9293     StubCodeMark mark(this, stub_id);
 9294     address entry = __ pc();
 9295 
 9296     int str1_chr_size = str1_isL ? 1 : 2;
 9297     int str2_chr_size = str2_isL ? 1 : 2;
 9298     int str1_chr_shift = str1_isL ? 0 : 1;
 9299     int str2_chr_shift = str2_isL ? 0 : 1;
 9300     bool isL = str1_isL && str2_isL;
 9301    // parameters
 9302     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 9303     // temporary registers
 9304     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 9305     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 9306     // redefinitions
 9307     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 9308 
 9309     __ push(spilled_regs, sp);
 9310     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 9311         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 9312         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 9313         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 9314         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 9315         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 9316     // Read whole register from str1. It is safe, because length >=8 here
 9317     __ ldr(ch1, Address(str1));
 9318     // Read whole register from str2. It is safe, because length >=8 here
 9319     __ ldr(ch2, Address(str2));
 9320     __ sub(cnt2, cnt2, cnt1);
 9321     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 9322     if (str1_isL != str2_isL) {
 9323       __ eor(v0, __ T16B, v0, v0);
 9324     }
 9325     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 9326     __ mul(first, first, tmp1);
 9327     // check if we have less than 1 register to check
 9328     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 9329     if (str1_isL != str2_isL) {
 9330       __ fmovd(v1, ch1);
 9331     }
 9332     __ br(__ LE, L_SMALL);
 9333     __ eor(ch2, first, ch2);
 9334     if (str1_isL != str2_isL) {
 9335       __ zip1(v1, __ T16B, v1, v0);
 9336     }
 9337     __ sub(tmp2, ch2, tmp1);
 9338     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9339     __ bics(tmp2, tmp2, ch2);
 9340     if (str1_isL != str2_isL) {
 9341       __ fmovd(ch1, v1);
 9342     }
 9343     __ br(__ NE, L_HAS_ZERO);
 9344     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9345     __ add(result, result, wordSize/str2_chr_size);
 9346     __ add(str2, str2, wordSize);
 9347     __ br(__ LT, L_POST_LOOP);
 9348     __ BIND(L_LOOP);
 9349       __ ldr(ch2, Address(str2));
 9350       __ eor(ch2, first, ch2);
 9351       __ sub(tmp2, ch2, tmp1);
 9352       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9353       __ bics(tmp2, tmp2, ch2);
 9354       __ br(__ NE, L_HAS_ZERO);
 9355     __ BIND(L_LOOP_PROCEED);
 9356       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9357       __ add(str2, str2, wordSize);
 9358       __ add(result, result, wordSize/str2_chr_size);
 9359       __ br(__ GE, L_LOOP);
 9360     __ BIND(L_POST_LOOP);
 9361       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 9362       __ br(__ LE, NOMATCH);
 9363       __ ldr(ch2, Address(str2));
 9364       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9365       __ eor(ch2, first, ch2);
 9366       __ sub(tmp2, ch2, tmp1);
 9367       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9368       __ mov(tmp4, -1); // all bits set
 9369       __ b(L_SMALL_PROCEED);
 9370     __ align(OptoLoopAlignment);
 9371     __ BIND(L_SMALL);
 9372       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9373       __ eor(ch2, first, ch2);
 9374       if (str1_isL != str2_isL) {
 9375         __ zip1(v1, __ T16B, v1, v0);
 9376       }
 9377       __ sub(tmp2, ch2, tmp1);
 9378       __ mov(tmp4, -1); // all bits set
 9379       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9380       if (str1_isL != str2_isL) {
 9381         __ fmovd(ch1, v1); // move converted 4 symbols
 9382       }
 9383     __ BIND(L_SMALL_PROCEED);
 9384       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 9385       __ bic(tmp2, tmp2, ch2);
 9386       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9387       __ rbit(tmp2, tmp2);
 9388       __ br(__ EQ, NOMATCH);
 9389     __ BIND(L_SMALL_HAS_ZERO_LOOP);
 9390       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
 9391       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9392       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9393       if (str2_isL) { // LL
 9394         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9395         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9396         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9397         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9398         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9399       } else {
 9400         __ mov(ch2, 0xE); // all bits in byte set except last one
 9401         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9402         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9403         __ lslv(tmp2, tmp2, tmp4);
 9404         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9405         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9406         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9407         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9408       }
 9409       __ cmp(ch1, ch2);
 9410       __ mov(tmp4, wordSize/str2_chr_size);
 9411       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9412     __ BIND(L_SMALL_CMP_LOOP);
 9413       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9414                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9415       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9416                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9417       __ add(tmp4, tmp4, 1);
 9418       __ cmp(tmp4, cnt1);
 9419       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9420       __ cmp(first, ch2);
 9421       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9422     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9423       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9424       __ clz(tmp4, tmp2);
 9425       __ add(result, result, 1); // advance index
 9426       __ add(str2, str2, str2_chr_size); // advance pointer
 9427       __ b(L_SMALL_HAS_ZERO_LOOP);
 9428     __ align(OptoLoopAlignment);
 9429     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9430       __ cmp(first, ch2);
 9431       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9432       __ b(DONE);
 9433     __ align(OptoLoopAlignment);
 9434     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9435       if (str2_isL) { // LL
 9436         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9437         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9438         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9439         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9440         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9441       } else {
 9442         __ mov(ch2, 0xE); // all bits in byte set except last one
 9443         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9444         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9445         __ lslv(tmp2, tmp2, tmp4);
 9446         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9447         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9448         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9449         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9450       }
 9451       __ cmp(ch1, ch2);
 9452       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9453       __ b(DONE);
 9454     __ align(OptoLoopAlignment);
 9455     __ BIND(L_HAS_ZERO);
 9456       __ rbit(tmp2, tmp2);
 9457       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
 9458       // Now, perform compression of counters(cnt2 and cnt1) into one register.
 9459       // It's fine because both counters are 32bit and are not changed in this
 9460       // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
 9461       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 9462       __ sub(result, result, 1);
 9463     __ BIND(L_HAS_ZERO_LOOP);
 9464       __ mov(cnt1, wordSize/str2_chr_size);
 9465       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9466       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9467       if (str2_isL) {
 9468         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9469         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9470         __ lslv(tmp2, tmp2, tmp4);
 9471         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9472         __ add(tmp4, tmp4, 1);
 9473         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9474         __ lsl(tmp2, tmp2, 1);
 9475         __ mov(tmp4, wordSize/str2_chr_size);
 9476       } else {
 9477         __ mov(ch2, 0xE);
 9478         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9479         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9480         __ lslv(tmp2, tmp2, tmp4);
 9481         __ add(tmp4, tmp4, 1);
 9482         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9483         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9484         __ lsl(tmp2, tmp2, 1);
 9485         __ mov(tmp4, wordSize/str2_chr_size);
 9486         __ sub(str2, str2, str2_chr_size);
 9487       }
 9488       __ cmp(ch1, ch2);
 9489       __ mov(tmp4, wordSize/str2_chr_size);
 9490       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9491     __ BIND(L_CMP_LOOP);
 9492       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9493                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9494       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9495                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9496       __ add(tmp4, tmp4, 1);
 9497       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9498       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9499       __ cmp(cnt1, ch2);
 9500       __ br(__ EQ, L_CMP_LOOP);
 9501     __ BIND(L_CMP_LOOP_NOMATCH);
 9502       // here we're not matched
 9503       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9504       __ clz(tmp4, tmp2);
 9505       __ add(str2, str2, str2_chr_size); // advance pointer
 9506       __ b(L_HAS_ZERO_LOOP);
 9507     __ align(OptoLoopAlignment);
 9508     __ BIND(L_CMP_LOOP_LAST_CMP);
 9509       __ cmp(cnt1, ch2);
 9510       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9511       __ b(DONE);
 9512     __ align(OptoLoopAlignment);
 9513     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9514       if (str2_isL) {
 9515         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9516         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9517         __ lslv(tmp2, tmp2, tmp4);
 9518         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9519         __ add(tmp4, tmp4, 1);
 9520         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9521         __ lsl(tmp2, tmp2, 1);
 9522       } else {
 9523         __ mov(ch2, 0xE);
 9524         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9525         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9526         __ lslv(tmp2, tmp2, tmp4);
 9527         __ add(tmp4, tmp4, 1);
 9528         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9529         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9530         __ lsl(tmp2, tmp2, 1);
 9531         __ sub(str2, str2, str2_chr_size);
 9532       }
 9533       __ cmp(ch1, ch2);
 9534       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9535       __ b(DONE);
 9536     __ align(OptoLoopAlignment);
 9537     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
 9538       // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
 9539       // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
 9540       // so, result was increased at max by wordSize/str2_chr_size - 1, so,
 9541       // respective high bit wasn't changed. L_LOOP_PROCEED will increase
 9542       // result by analyzed characters value, so, we can just reset lower bits
 9543       // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
 9544       // 2) restore cnt1 and cnt2 values from "compressed" cnt2
 9545       // 3) advance str2 value to represent next str2 octet. result & 7/3 is
 9546       // index of last analyzed substring inside current octet. So, str2 in at
 9547       // respective start address. We need to advance it to next octet
 9548       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9549       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9550       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9551       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9552       __ movw(cnt2, cnt2);
 9553       __ b(L_LOOP_PROCEED);
 9554     __ align(OptoLoopAlignment);
 9555     __ BIND(NOMATCH);
 9556       __ mov(result, -1);
 9557     __ BIND(DONE);
 9558       __ pop(spilled_regs, sp);
 9559       __ ret(lr);
 9560     return entry;
 9561   }
 9562 
  void generate_string_indexof_stubs() {
    // Only three encoding combinations are generated; a UTF-16 pattern
    // with a Latin-1 source (str1_isL == false, str2_isL == true) is
    // rejected by generate_string_indexof_linear via ShouldNotReachHere.
    // Note the "_ul" stub corresponds to (str1_isL=true, str2_isL=false).
    StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
    StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
    StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
  }
 9568 
  // Inflate the 32 Latin-1 bytes in src1/src2 to 64 UTF-16 bytes and
  // store them to the destination in r1 (post-incremented by 64).
  // v0 is expected to hold zero (see the register contract of
  // generate_large_byte_array_inflate below); clobbers v1..v4.
  // The optional prefetch is deliberately issued between the zips so it
  // overlaps with them.
  void inflate_and_store_2_fp_registers(bool generatePrfm,
      FloatRegister src1, FloatRegister src2) {
    Register dst = r1;
    __ zip1(v1, __ T16B, src1, v0); // low 8 bytes of src1 -> 16 bytes
    __ zip2(v2, __ T16B, src1, v0); // high 8 bytes of src1
    if (generatePrfm) {
      __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
    }
    __ zip1(v3, __ T16B, src2, v0);
    __ zip2(v4, __ T16B, src2, v0);
    __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
  }
 9581 
 9582   // R0 = src
 9583   // R1 = dst
 9584   // R2 = len
 9585   // R3 = len >> 3
 9586   // V0 = 0
 9587   // v1 = loaded 8 bytes
 9588   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
  address generate_large_byte_array_inflate() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
    Register src = r0, dst = r1, len = r2, octetCounter = r3;
    // Octet threshold above which the software-prefetching loop is used.
    const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;

    // do one more 8-byte read to have address 16-byte aligned in most cases
    // also use single store instruction
    // (v1 already holds the first 8 source bytes per the register
    // contract documented above this stub)
    __ ldrd(v2, __ post(src, 8));
    __ sub(octetCounter, octetCounter, 2);
    __ zip1(v1, __ T16B, v1, v0);
    __ zip1(v2, __ T16B, v2, v0);
    __ st1(v1, v2, __ T16B, __ post(dst, 32));
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ subs(rscratch1, octetCounter, large_loop_threshold);
    __ br(__ LE, LOOP_START);
    __ b(LOOP_PRFM_START);
    // Large-input loop: identical to LOOP below, plus software prefetch.
    __ bind(LOOP_PRFM);
      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_PRFM_START);
      __ prfm(Address(src, SoftwarePrefetchHintDistance));
      __ sub(octetCounter, octetCounter, 8);
      __ subs(rscratch1, octetCounter, large_loop_threshold);
      inflate_and_store_2_fp_registers(true, v3, v4);
      inflate_and_store_2_fp_registers(true, v5, v6);
      __ br(__ GT, LOOP_PRFM);
      __ cmp(octetCounter, (u1)8);
      __ br(__ LT, DONE);
    // Main loop: inflate 64 source bytes (8 octets) per iteration.
    __ bind(LOOP);
      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
      __ bind(LOOP_START);
      __ sub(octetCounter, octetCounter, 8);
      __ cmp(octetCounter, (u1)8);
      inflate_and_store_2_fp_registers(false, v3, v4);
      inflate_and_store_2_fp_registers(false, v5, v6);
      __ br(__ GE, LOOP);
    __ bind(DONE);
      __ ret(lr);
    return entry;
  }
 9632 
 9633   /**
 9634    *  Arguments:
 9635    *
 9636    *  Input:
 9637    *  c_rarg0   - current state address
 9638    *  c_rarg1   - H key address
 9639    *  c_rarg2   - data address
 9640    *  c_rarg3   - number of blocks
 9641    *
 9642    *  Output:
 9643    *  Updated state at c_rarg0
 9644    */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order through the
    // calculation, bit-reversing the inputs and outputs.

    StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
    StubCodeMark mark(this, stub_id);
    Label polynomial; // local data generated at end of stub
    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ adr(rscratch1, polynomial);
    __ ldrq(v24, rscratch1);    // The field polynomial

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    // Precompute the Karatsuba halves-sum of the subkey once, outside
    // the loop; it is loop-invariant.
    __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
    __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      // One 16-byte block per iteration: state = (state ^ block) * H,
      // with the running (bit-reversed) state kept in v0.
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                        /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
                        /*temps*/v6, v3, /*reuse/clobber b*/v2);
      // Reduce v7:v5 by the field polynomial
      __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0
    __ rev64(v0, __ T16B, v0);
    __ rbit(v0, __ T16B, v0);

    __ st1(v0, __ T16B, state);
    __ ret(lr);

    // bind label and generate local polynomial data
    __ align(wordSize * 2);
    __ bind(polynomial);
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    return start;
  }
 9723 
  // Unrolled (4-blocks-at-a-time) GHASH. Same register contract as the
  // single-block stub above. Short inputs and leftover tail blocks are
  // handled by branching to the single-block stub.
  address generate_ghash_processBlocks_wide() {
    // Generate the single-block stub first so we can branch to it.
    address small = generate_ghash_processBlocks();

    StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
    StubCodeMark mark(this, stub_id);
    Label polynomial;           // local data generated after stub
    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    const int unroll = 4;

    // Not enough blocks to amortize the unrolled setup: tail-call the
    // single-block stub.
    __ cmp(blocks, (unsigned char)(unroll * 2));
    __ br(__ LT, small);

    if (unroll > 1) {
    // Save state before entering routine
      // (v8..v15 are callee-saved SIMD registers under AAPCS64 and may
      // be used by the wide helper)
      __ sub(sp, sp, 4 * 16);
      __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
      __ sub(sp, sp, 4 * 16);
      __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
    }

    __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);

    if (unroll > 1) {
      // And restore state
      __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
    }

    // If blocks remain (presumably 'blocks' holds the unprocessed count
    // after the wide helper — confirm in MacroAssembler), let the
    // single-block stub finish them; it returns to our caller.
    __ cmp(blocks, (unsigned char)0);
    __ br(__ GT, small);

    __ ret(lr);

    // bind label and generate polynomial data
    __ align(wordSize * 2);
    __ bind(polynomial);
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    return start;

  }
 9776 
  // One SIMD round of Base64 encoding: reads 3 * size input bytes from
  // src (de-interleaved by ld3 into in0/in1/in2 = byte 0/1/2 of each
  // 3-byte group), splits every group into four 6-bit indices, and maps
  // the indices through the 64-entry alphabet held in the 4 SIMD table
  // registers starting at 'codec', writing 4 * size output bytes to dst.
  // Both src and dst are post-incremented. size must be 8 or 16.
  void generate_base64_encode_simdround(Register src, Register dst,
        FloatRegister codec, u8 size) {

    FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
    FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
    FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));

    // ind0 = top 6 bits of byte 0
    __ ushr(ind0, arrangement, in0,  2);

    // ind1 = low 2 bits of byte 0 (moved to the top) : top 4 bits of
    // byte 1; the final ushr by 2 drops both into the low 6 bits.
    __ ushr(ind1, arrangement, in1,  2);
    __ shl(in0,   arrangement, in0,  6);
    __ orr(ind1,  arrangement, ind1, in0);
    __ ushr(ind1, arrangement, ind1, 2);

    // ind2 = low 4 bits of byte 1 : top 2 bits of byte 2
    __ ushr(ind2, arrangement, in2,  4);
    __ shl(in1,   arrangement, in1,  4);
    __ orr(ind2,  arrangement, in1,  ind2);
    __ ushr(ind2, arrangement, ind2, 2);

    // ind3 = low 6 bits of byte 2 (shl/ushr pair clears the top 2 bits)
    __ shl(ind3,  arrangement, in2,  2);
    __ ushr(ind3, arrangement, ind3, 2);

    // Translate each 6-bit index to its Base64 character.
    __ tbl(out0,  arrangement, codec,  4, ind0);
    __ tbl(out1,  arrangement, codec,  4, ind1);
    __ tbl(out2,  arrangement, codec,  4, ind2);
    __ tbl(out3,  arrangement, codec,  4, ind3);

    // Re-interleave so output bytes appear in stream order.
    __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
  }
 9810 
 9811    /**
 9812    *  Arguments:
 9813    *
 9814    *  Input:
 9815    *  c_rarg0   - src_start
 9816    *  c_rarg1   - src_offset
 9817    *  c_rarg2   - src_length
 9818    *  c_rarg3   - dest_start
 9819    *  c_rarg4   - dest_offset
 9820    *  c_rarg5   - isURL
 9821    *
 9822    */
  // Intrinsic stub for java.util.Base64.Encoder.encodeBlock.
  // Encodes src[soff, send) into dst starting at doff using either the
  // standard or the URL-safe alphabet (selected by isURL). Multiples of
  // 48 (then 24) input bytes are handled with SIMD ld3/tbl/st4 rounds;
  // the remainder is encoded 3 bytes at a time with scalar code.
  address generate_base64_encodeBlock() {

    // Standard Base64 alphabet (RFC 4648 table 1).
    static const char toBase64[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
    };

    // URL-and-filename-safe alphabet: '-' and '_' replace '+' and '/'.
    static const char toBase64URL[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
    };

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register src   = c_rarg0;  // source array
    Register soff  = c_rarg1;  // source start offset
    Register send  = c_rarg2;  // source end offset
    Register dst   = c_rarg3;  // dest array
    Register doff  = c_rarg4;  // position for writing to dest array
    Register isURL = c_rarg5;  // Base64 or URL character set

    // c_rarg6 and c_rarg7 are free to use as temps
    Register codec  = c_rarg6;
    Register length = c_rarg7;

    Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;

    // Point src/dst at the first byte to read/write and compute the
    // number of input bytes to encode.
    __ add(src, src, soff);
    __ add(dst, dst, doff);
    __ sub(length, send, soff);

    // load the codec base address
    __ lea(codec, ExternalAddress((address) toBase64));
    __ cbz(isURL, ProcessData);
    __ lea(codec, ExternalAddress((address) toBase64URL));

    __ BIND(ProcessData);

    // too short to form up a SIMD loop, fall back to the scalar path
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, Process3B);

    // Load the whole 64-byte codec table into v0..v3 for tbl lookups.
    __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));

    // Full-width SIMD loop: 48 input bytes -> 64 output chars per round.
    __ BIND(Process48B);
    __ cmp(length, (u1)48);
    __ br(Assembler::LT, Process24B);
    generate_base64_encode_simdround(src, dst, v0, 16);
    __ sub(length, length, 48);
    __ b(Process48B);

    // At most one half-width round: 24 input bytes -> 32 output chars.
    __ BIND(Process24B);
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, SIMDExit);
    generate_base64_encode_simdround(src, dst, v0, 8);
    __ sub(length, length, 24);

    __ BIND(SIMDExit);
    __ cbz(length, Exit);

    // Scalar loop: one 3-byte group -> 4 output chars per iteration.
    __ BIND(Process3B);
    //  3 src bytes, 24 bits
    __ ldrb(r10, __ post(src, 1));
    __ ldrb(r11, __ post(src, 1));
    __ ldrb(r12, __ post(src, 1));
    // pack the three bytes into the low 24 bits of r12
    // (first byte in bits 23:16, last in bits 7:0)
    __ orrw(r11, r11, r10, Assembler::LSL, 8);
    __ orrw(r12, r12, r11, Assembler::LSL, 8);
    // codec index: extract the four 6-bit fields
    __ ubfmw(r15, r12, 18, 23);
    __ ubfmw(r14, r12, 12, 17);
    __ ubfmw(r13, r12, 6,  11);
    __ andw(r12,  r12, 63);
    // get the code based on the codec
    __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
    __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
    __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
    __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
    __ strb(r15, __ post(dst, 1));
    __ strb(r14, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ sub(length, length, 3);
    __ cbnz(length, Process3B);

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }
 9921 
  // Emit one SIMD round of the Base64 decode loop: load 4*size input
  // characters, decode each with a two-stage tbl/tbx table lookup (see
  // the fromBase64*ForSIMD table comment in generate_base64_decodeBlock),
  // pack the resulting 6-bit values into 3*size output bytes and store
  // them. codecL/codecH are the first of 4 vector registers holding the
  // lower/higher halves of the 128-entry decode table; v27 must hold 63
  // in every byte lane. When an illegal character is found, the legal
  // prefix is stored and control branches to Exit.
  void generate_base64_decode_simdround(Register src, Register dst,
        FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {

    FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
    FloatRegister out0 = v20, out1 = v21, out2 = v22;

    FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
    FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;

    Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    // De-interleaving load: in0..in3 receive char 0..3 of each 4-char group.
    __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));

    // we need unsigned saturating subtract, to make sure all input values
    // in range [0, 63] will have 0U value in the higher half lookup
    __ uqsubv(decH0, __ T16B, in0, v27);
    __ uqsubv(decH1, __ T16B, in1, v27);
    __ uqsubv(decH2, __ T16B, in2, v27);
    __ uqsubv(decH3, __ T16B, in3, v27);

    // lower half lookup
    __ tbl(decL0, arrangement, codecL, 4, in0);
    __ tbl(decL1, arrangement, codecL, 4, in1);
    __ tbl(decL2, arrangement, codecL, 4, in2);
    __ tbl(decL3, arrangement, codecL, 4, in3);

    // higher half lookup (tbx leaves lanes whose index is out of range
    // unchanged in the destination, unlike tbl which zeroes them)
    __ tbx(decH0, arrangement, codecH, 4, decH0);
    __ tbx(decH1, arrangement, codecH, 4, decH1);
    __ tbx(decH2, arrangement, codecH, 4, decH2);
    __ tbx(decH3, arrangement, codecH, 4, decH3);

    // combine lower and higher
    __ orr(decL0, arrangement, decL0, decH0);
    __ orr(decL1, arrangement, decL1, decH1);
    __ orr(decL2, arrangement, decL2, decH2);
    __ orr(decL3, arrangement, decL3, decH3);

    // check illegal inputs, value larger than 63 (maximum of 6 bits)
    __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
    __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
    __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
    __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
    // fold the four per-char masks into in2 (one mask byte per group)
    // and reduce to a scalar: rscratch2 != 0 iff any char was illegal
    __ orr(in0, arrangement, decH0, decH1);
    __ orr(in1, arrangement, decH2, decH3);
    __ orr(in2, arrangement, in0,   in1);
    __ umaxv(in3, arrangement, in2);
    __ umov(rscratch2, in3, __ B, 0);

    // get the data to output: pack the four 6-bit values (a, b, c, d)
    // of each group into three bytes a<<2|b>>4, b<<4|c>>2, c<<6|d
    __ shl(out0,  arrangement, decL0, 2);
    __ ushr(out1, arrangement, decL1, 4);
    __ orr(out0,  arrangement, out0,  out1);
    __ shl(out1,  arrangement, decL1, 4);
    __ ushr(out2, arrangement, decL2, 2);
    __ orr(out1,  arrangement, out1,  out2);
    __ shl(out2,  arrangement, decL2, 6);
    __ orr(out2,  arrangement, out2,  decL3);

    __ cbz(rscratch2, NoIllegalData);

    // handle illegal input
    __ umov(r10, in2, __ D, 0);
    if (size == 16) {
      __ cbnz(r10, ErrorInLowerHalf);

      // illegal input is in higher half, store the lower half now.
      __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));

      // continue with the error-mask and output lanes of the upper half
      __ umov(r10, in2,  __ D, 1);
      __ umov(r11, out0, __ D, 1);
      __ umov(r12, out1, __ D, 1);
      __ umov(r13, out2, __ D, 1);
      __ b(StoreLegalData);

      __ BIND(ErrorInLowerHalf);
    }
    __ umov(r11, out0, __ D, 0);
    __ umov(r12, out1, __ D, 0);
    __ umov(r13, out2, __ D, 0);

    // store one decoded 3-byte group per iteration, stopping (branch to
    // Exit) at the first group that contained an illegal character
    __ BIND(StoreLegalData);
    __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
    __ strb(r11, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ lsr(r10, r10, 8);
    __ lsr(r11, r11, 8);
    __ lsr(r12, r12, 8);
    __ lsr(r13, r13, 8);
    __ b(StoreLegalData);

    __ BIND(NoIllegalData);
    __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
  }
10019 
10020 
10021    /**
10022    *  Arguments:
10023    *
10024    *  Input:
10025    *  c_rarg0   - src_start
10026    *  c_rarg1   - src_offset
10027    *  c_rarg2   - src_length
10028    *  c_rarg3   - dest_start
10029    *  c_rarg4   - dest_offset
10030    *  c_rarg5   - isURL
10031    *  c_rarg6   - isMIME
10032    *
10033    */
  // Intrinsic stub for java.util.Base64.Decoder.decodeBlock.
  // Decodes src[soff, send) into dst starting at doff, using either the
  // standard or the URL-safe table (selected by isURL). Decoding stops
  // at the first illegal character; the number of bytes written to dst
  // is returned in r0.
  address generate_base64_decodeBlock() {

    // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
    // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
    // titled "Base64 decoding".

    // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64,
    // except the trailing character '=' is also treated illegal value in this intrinsic. That
    // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
    static const uint8_t fromBase64ForNoSIMD[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
       52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
       15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
      255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
       41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    static const uint8_t fromBase64URLForNoSIMD[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
       52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
       15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
      255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
       41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    // A legal value of base64 code is in range [0, 127].  We need two lookups
    // with tbl/tbx and combine them to get the decode data. The 1st table vector
    // lookup use tbl, out of range indices are set to 0 in destination. The 2nd
    // table vector lookup use tbx, out of range indices are unchanged in
    // destination. Input [64..126] is mapped to index [65, 127] in second lookup.
    // The value of index 64 is set to 0, so that we know that we already get the
    // decoded data with the 1st lookup.
    static const uint8_t fromBase64ForSIMD[128] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
       52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
        0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
       14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
      255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
       40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
    };

    static const uint8_t fromBase64URLForSIMD[128] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
       52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
        0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
       14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
       63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
       40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
    };

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register src    = c_rarg0;  // source array
    Register soff   = c_rarg1;  // source start offset
    Register send   = c_rarg2;  // source end offset
    Register dst    = c_rarg3;  // dest array
    Register doff   = c_rarg4;  // position for writing to dest array
    Register isURL  = c_rarg5;  // Base64 or URL character set
    Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation

    Register length = send;    // reuse send as length of source data to process

    Register simd_codec   = c_rarg6;
    Register nosimd_codec = c_rarg7;

    Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;

    __ enter();

    __ add(src, src, soff);
    __ add(dst, dst, doff);

    // remember the initial dest pointer so the byte count can be
    // computed at Exit (doff itself is dead after the add above)
    __ mov(doff, dst);

    __ sub(length, send, soff);
    // round the length down to a multiple of 4: only whole 4-char
    // groups are decoded (bfm inserts the two zero bits from zr)
    __ bfm(length, zr, 0, 1);

    __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
    __ cbz(isURL, ProcessData);
    __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));

    __ BIND(ProcessData);
    // rscratch1 = byte budget for the scalar (non-SIMD) loop below
    __ mov(rscratch1, length);
    __ cmp(length, (u1)144); // 144 = 80 + 64
    __ br(Assembler::LT, Process4B);

    // In the MIME case, the line length cannot be more than 76
    // bytes (see RFC 2045). This is too short a block for SIMD
    // to be worthwhile, so we use non-SIMD here.
    __ movw(rscratch1, 79);

    // Scalar loop: decode one 4-char group (3 output bytes) per pass.
    __ BIND(Process4B);
    __ ldrw(r14, __ post(src, 4));
    __ ubfxw(r10, r14, 0,  8);
    __ ubfxw(r11, r14, 8,  8);
    __ ubfxw(r12, r14, 16, 8);
    __ ubfxw(r13, r14, 24, 8);
    // get the de-code
    __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
    __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
    __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
    __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
    // error detection, 255u indicates an illegal input
    __ orrw(r14, r10, r11);
    __ orrw(r15, r12, r13);
    __ orrw(r14, r14, r15);
    __ tbnz(r14, 7, Exit);
    // recover the data
    __ lslw(r14, r10, 10);
    __ bfiw(r14, r11, 4, 6);
    __ bfmw(r14, r12, 2, 5);
    __ rev16w(r14, r14);
    __ bfiw(r13, r12, 6, 2);
    __ strh(r14, __ post(dst, 2));
    __ strb(r13, __ post(dst, 1));
    // non-simd loop
    __ subsw(rscratch1, rscratch1, 4);
    __ br(Assembler::GT, Process4B);

    // if exiting from the 80-byte pre-pass above (budget 79),
    // rscratch1 == -1; otherwise, rscratch1 == 0.
    __ cbzw(rscratch1, Exit);
    __ sub(length, length, 80);

    __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
    __ cbz(isURL, SIMDEnter);
    __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));

    __ BIND(SIMDEnter);
    // load the 128-entry decode table: lower half into v0..v3,
    // higher half into v4..v7
    __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
    __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
    // v27 = 63 in every byte lane, used by the simdround for the
    // saturating subtract and the illegal-value compare
    __ mov(rscratch1, 63);
    __ dup(v27, __ T16B, rscratch1);

    // Full-width SIMD loop: 64 input chars -> 48 output bytes per round.
    __ BIND(Process64B);
    __ cmp(length, (u1)64);
    __ br(Assembler::LT, Process32B);
    generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
    __ sub(length, length, 64);
    __ b(Process64B);

    // Half-width SIMD loop: 32 input chars -> 24 output bytes per round.
    __ BIND(Process32B);
    __ cmp(length, (u1)32);
    __ br(Assembler::LT, SIMDExit);
    generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
    __ sub(length, length, 32);
    __ b(Process32B);

    __ BIND(SIMDExit);
    __ cbz(length, Exit);
    // decode the remaining (< 32) characters with the scalar loop
    __ movw(rscratch1, length);
    __ b(Process4B);

    __ BIND(Exit);
    // return the number of bytes written to the destination
    __ sub(c_rarg0, dst, doff);

    __ leave();
    __ ret(lr);

    return start;
  }
10224 
10225   // Support for spin waits.
10226   address generate_spin_wait() {
10227     __ align(CodeEntryAlignment);
10228     StubId stub_id = StubId::stubgen_spin_wait_id;
10229     StubCodeMark mark(this, stub_id);
10230     address start = __ pc();
10231 
10232     __ spin_wait();
10233     __ ret(lr);
10234 
10235     return start;
10236   }
10237 
10238   void generate_lookup_secondary_supers_table_stub() {
10239     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10240     StubCodeMark mark(this, stub_id);
10241 
10242     const Register
10243       r_super_klass  = r0,
10244       r_array_base   = r1,
10245       r_array_length = r2,
10246       r_array_index  = r3,
10247       r_sub_klass    = r4,
10248       r_bitmap       = rscratch2,
10249       result         = r5;
10250     const FloatRegister
10251       vtemp          = v0;
10252 
10253     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10254       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10255       Label L_success;
10256       __ enter();
10257       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10258                                              r_array_base, r_array_length, r_array_index,
10259                                              vtemp, result, slot,
10260                                              /*stub_is_near*/true);
10261       __ leave();
10262       __ ret(lr);
10263     }
10264   }
10265 
10266   // Slow path implementation for UseSecondarySupersTable.
  // Slow-path stub for the secondary-supers lookup: delegates to
  // MacroAssembler::lookup_secondary_supers_table_slow_path, using the
  // register contract below. The answer is left in 'result' (r5).
  address generate_lookup_secondary_supers_table_slow_path_stub() {
    StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
    StubCodeMark mark(this, stub_id);

    address start = __ pc();
    const Register
      r_super_klass  = r0,        // argument
      r_array_base   = r1,        // argument
      temp1          = r2,        // temp
      r_array_index  = r3,        // argument
      r_bitmap       = rscratch2, // argument
      result         = r5;        // argument

    __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
    __ ret(lr);

    return start;
  }
10285 
10286 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10287 
10288   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10289   //
10290   // If LSE is in use, generate LSE versions of all the stubs. The
10291   // non-LSE versions are in atomic_aarch64.S.
10292 
10293   // class AtomicStubMark records the entry point of a stub and the
10294   // stub pointer which will point to it. The stub pointer is set to
10295   // the entry point when ~AtomicStubMark() is called, which must be
10296   // after ICache::invalidate_range. This ensures safe publication of
10297   // the generated code.
10298   class AtomicStubMark {
10299     address _entry_point;
10300     aarch64_atomic_stub_t *_stub;
10301     MacroAssembler *_masm;
10302   public:
10303     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10304       _masm = masm;
10305       __ align(32);
10306       _entry_point = __ pc();
10307       _stub = stub;
10308     }
10309     ~AtomicStubMark() {
10310       *_stub = (aarch64_atomic_stub_t)_entry_point;
10311     }
10312   };
10313 
10314   // NB: For memory_order_conservative we need a trailing membar after
10315   // LSE atomic operations but not a leading membar.
10316   //
10317   // We don't need a leading membar because a clause in the Arm ARM
10318   // says:
10319   //
10320   //   Barrier-ordered-before
10321   //
10322   //   Barrier instructions order prior Memory effects before subsequent
10323   //   Memory effects generated by the same Observer. A read or a write
10324   //   RW1 is Barrier-ordered-before a read or a write RW 2 from the same
10325   //   Observer if and only if RW1 appears in program order before RW 2
10326   //   and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
10327   //   instruction with both Acquire and Release semantics.
10328   //
10329   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10330   // and Release semantics, therefore we don't need a leading
10331   // barrier. However, there is no corresponding Barrier-ordered-after
10332   // relationship, therefore we need a trailing membar to prevent a
10333   // later store or load from being reordered with the store in an
10334   // atomic instruction.
10335   //
10336   // This was checked by using the herd7 consistency model simulator
10337   // (http://diy.inria.fr/) with this test case:
10338   //
10339   // AArch64 LseCas
10340   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10341   // P0 | P1;
10342   // LDR W4, [X2] | MOV W3, #0;
10343   // DMB LD       | MOV W4, #1;
10344   // LDR W3, [X1] | CASAL W3, W4, [X1];
10345   //              | DMB ISH;
10346   //              | STR W4, [X2];
10347   // exists
10348   // (0:X3=0 /\ 0:X4=1)
10349   //
10350   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10351   // with the store to x in P1. Without the DMB in P1 this may happen.
10352   //
10353   // At the time of writing we don't know of any AArch64 hardware that
10354   // reorders stores in this way, but the Reference Manual permits it.
10355 
10356   void gen_cas_entry(Assembler::operand_size size,
10357                      atomic_memory_order order) {
10358     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10359       exchange_val = c_rarg2;
10360     bool acquire, release;
10361     switch (order) {
10362       case memory_order_relaxed:
10363         acquire = false;
10364         release = false;
10365         break;
10366       case memory_order_release:
10367         acquire = false;
10368         release = true;
10369         break;
10370       default:
10371         acquire = true;
10372         release = true;
10373         break;
10374     }
10375     __ mov(prev, compare_val);
10376     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10377     if (order == memory_order_conservative) {
10378       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10379     }
10380     if (size == Assembler::xword) {
10381       __ mov(r0, prev);
10382     } else {
10383       __ movw(r0, prev);
10384     }
10385     __ ret(lr);
10386   }
10387 
10388   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10389     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10390     // If not relaxed, then default to conservative.  Relaxed is the only
10391     // case we use enough to be worth specializing.
10392     if (order == memory_order_relaxed) {
10393       __ ldadd(size, incr, prev, addr);
10394     } else {
10395       __ ldaddal(size, incr, prev, addr);
10396       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10397     }
10398     if (size == Assembler::xword) {
10399       __ mov(r0, prev);
10400     } else {
10401       __ movw(r0, prev);
10402     }
10403     __ ret(lr);
10404   }
10405 
10406   void gen_swpal_entry(Assembler::operand_size size) {
10407     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10408     __ swpal(size, incr, prev, addr);
10409     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10410     if (size == Assembler::xword) {
10411       __ mov(r0, prev);
10412     } else {
10413       __ movw(r0, prev);
10414     }
10415     __ ret(lr);
10416   }
10417 
  // Generate LSE implementations of the atomic stubs and publish their
  // entry points through the aarch64_atomic_*_impl pointers (via the
  // AtomicStubMark destructors). When UseLSE is off nothing is emitted
  // and the non-LSE fallbacks in atomic_aarch64.S remain in place.
  void generate_atomic_entry_points() {
    if (! UseLSE) {
      return;
    }
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_atomic_entry_points_id;
    StubCodeMark mark(this, stub_id);
    address first_entry = __ pc();

    // ADD, memory_order_conservative
    AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
    gen_ldadd_entry(Assembler::word, memory_order_conservative);
    AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
    gen_ldadd_entry(Assembler::xword, memory_order_conservative);

    // ADD, memory_order_relaxed
    AtomicStubMark mark_fetch_add_4_relaxed
      (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
    gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
    AtomicStubMark mark_fetch_add_8_relaxed
      (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
    gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);

    // XCHG, memory_order_conservative
    AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
    gen_swpal_entry(Assembler::word);
    AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
    gen_swpal_entry(Assembler::xword);

    // CAS, memory_order_conservative
    AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
    gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
    AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_conservative);
    AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_conservative);

    // CAS, memory_order_relaxed
    AtomicStubMark mark_cmpxchg_1_relaxed
      (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
    gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
    AtomicStubMark mark_cmpxchg_4_relaxed
      (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
    AtomicStubMark mark_cmpxchg_8_relaxed
      (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);

    // CAS, memory_order_release
    AtomicStubMark mark_cmpxchg_4_release
      (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_release);
    AtomicStubMark mark_cmpxchg_8_release
      (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_release);

    // CAS, memory_order_seq_cst
    AtomicStubMark mark_cmpxchg_4_seq_cst
      (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
    AtomicStubMark mark_cmpxchg_8_seq_cst
      (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);

    // The AtomicStubMark destructors run after this flush, so the code
    // is guaranteed visible before any stub pointer is updated.
    ICache::invalidate_range(first_entry, __ pc() - first_entry);
  }
10482 #endif // LINUX
10483 
  // Save the (possibly multi-register) method return value on the
  // stack around a runtime call. With InlineTypeReturnedAsFields a
  // value may be returned spread across r0..r7 and v0..v7, so all of
  // those are saved; otherwise only r0 and v0 can carry a return value.
  // Must be kept in sync with restore_return_registers below.
  static void save_return_registers(MacroAssembler* masm) {
    if (InlineTypeReturnedAsFields) {
      masm->push(RegSet::range(r0, r7), sp);
      masm->sub(sp, sp, 4 * wordSize);
      masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp));
      masm->sub(sp, sp, 4 * wordSize);
      masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp));
    } else {
      masm->fmovd(rscratch1, v0);
      masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize)));
    }
  }
10496 
  // Restore the return value saved by save_return_registers. The order
  // exactly mirrors the save: v4..v7 were stored last (lowest on the
  // stack), so they are reloaded first.
  static void restore_return_registers(MacroAssembler* masm) {
    if (InlineTypeReturnedAsFields) {
      masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
      masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
      masm->pop(RegSet::range(r0, r7), sp);
    } else {
      masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize)));
      masm->fmovd(v0, rscratch1);
    }
  }
10507 
  // Emit the continuation-thaw stub for the given kind (top thaw, return
  // barrier, or exceptional return barrier).  The stub asks the runtime how
  // much stack the frames to be thawed need, makes room, calls the thaw
  // entry, and then "returns" into the topmost thawed frame.  Returns the
  // entry address of the generated code.
  address generate_cont_thaw(Continuation::thaw_kind kind) {
    bool return_barrier = Continuation::is_thaw_return_barrier(kind);
    bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);

    address start = __ pc();

    if (return_barrier) {
      // Reset sp to the continuation entry recorded in the thread; the
      // thawed frames will be laid out below it.
      __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
      __ mov(sp, rscratch1);
    }
    assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");

    if (return_barrier) {
      // preserve possible return value from a method returning to the return barrier
      save_return_registers(_masm);
    }

    __ movw(c_rarg1, (return_barrier ? 1 : 0));
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
    __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      restore_return_registers(_masm);
    }
    assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");


    Label thaw_success;
    // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
    __ cbnz(rscratch2, thaw_success);
    // Zero size: tail-call into the StackOverflowError throw stub.
    __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
    __ br(rscratch1);
    __ bind(thaw_success);

    // make room for the thawed frames
    __ sub(rscratch1, sp, rscratch2);
    __ andr(rscratch1, rscratch1, -16); // align
    __ mov(sp, rscratch1);

    if (return_barrier) {
      // save original return value -- again
      save_return_registers(_masm);
    }

    // If we want, we can templatize thaw by kind, and have three different entries
    __ movw(c_rarg1, (uint32_t)kind);

    __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
    __ mov(rscratch2, r0); // r0 is the sp of the yielding frame

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      restore_return_registers(_masm);
    } else {
      __ mov(r0, zr); // return 0 (success) from doYield
    }

    // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
    __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
    __ mov(rfp, sp);

    if (return_barrier_exception) {
      __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
      __ authenticate_return_address(c_rarg1);
      __ verify_oop(r0);
      // save return value containing the exception oop in callee-saved R19
      __ mov(r19, r0);

      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);

      // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
      // __ reinitialize_ptrue();

      // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc

      __ mov(r1, r0); // the exception handler
      __ mov(r0, r19); // restore return value containing the exception oop
      __ verify_oop(r0);

      __ leave();
      __ mov(r3, lr);
      __ br(r1); // the exception handler
    } else {
      // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
      __ leave();
      __ ret(lr);
    }

    return start;
  }
10599 
10600   address generate_cont_thaw() {
10601     if (!Continuations::enabled()) return nullptr;
10602 
10603     StubId stub_id = StubId::stubgen_cont_thaw_id;
10604     StubCodeMark mark(this, stub_id);
10605     address start = __ pc();
10606     generate_cont_thaw(Continuation::thaw_top);
10607     return start;
10608   }
10609 
10610   address generate_cont_returnBarrier() {
10611     if (!Continuations::enabled()) return nullptr;
10612 
10613     // TODO: will probably need multiple return barriers depending on return type
10614     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10615     StubCodeMark mark(this, stub_id);
10616     address start = __ pc();
10617 
10618     generate_cont_thaw(Continuation::thaw_return_barrier);
10619 
10620     return start;
10621   }
10622 
10623   address generate_cont_returnBarrier_exception() {
10624     if (!Continuations::enabled()) return nullptr;
10625 
10626     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10627     StubCodeMark mark(this, stub_id);
10628     address start = __ pc();
10629 
10630     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10631 
10632     return start;
10633   }
10634 
  // Stub run when a continuation was preempted (frozen from outside).  It
  // discards the frames that were copied to the heap and either returns to
  // Continuation.run() to unmount, or — if the preemption was cancelled —
  // jumps to the thaw call to resume execution.
  address generate_cont_preempt_stub() {
    if (!Continuations::enabled()) return nullptr;
    StubId stub_id = StubId::stubgen_cont_preempt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    __ reset_last_Java_frame(true);

    // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
    __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
    __ mov(sp, rscratch2);

    Label preemption_cancelled;
    __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
    __ cbnz(rscratch1, preemption_cancelled);

    // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
    SharedRuntime::continuation_enter_cleanup(_masm);
    __ leave();
    __ ret(lr);

    // We acquired the monitor after freezing the frames so call thaw to continue execution.
    __ bind(preemption_cancelled);
    // Clear the cancelled flag and rebuild rfp before jumping to the shared
    // thaw call pc.
    __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
    __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
    __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
    __ ldr(rscratch1, Address(rscratch1));
    __ br(rscratch1);

    return start;
  }
10666 
  // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
  // are represented as long[5], with BITS_PER_LIMB = 26.
  // Pack five 26-bit limbs into three 64-bit registers: dest0 gets limbs
  // 0-2 (the low 64 bits), dest1 gets limbs 2-4 (the next 64 bits), and
  // dest2 — if valid — receives the top 2 bits.  When dest2 is noreg the
  // top bits are asserted (in debug builds) to be zero instead.
  // Clobbers rscratch1 and rscratch2.
  void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
    __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
    __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
    __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
    __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits

    __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
    __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
    __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
    __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits

    if (dest2->is_valid()) {
      __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
    } else {
#ifdef ASSERT
      Label OK;
      __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
      __ br(__ EQ, OK);
      __ stop("high bits of Poly1305 integer should be zero");
      __ should_not_reach_here();
      __ bind(OK);
#endif
    }
  }
10694 
  // As above, but return only a 128-bit integer, packed into two
  // 64-bit registers (the high bits must be zero; this is checked in
  // debug builds).
  void pack_26(Register dest0, Register dest1, Register src) {
    pack_26(dest0, dest1, noreg, src);
  }
10700 
  // Full 64x64 -> 128-bit unsigned multiply: prod_hi:prod_lo = n * m.
  void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
    __ mul(prod_lo, n, m);
    __ umulh(prod_hi, n, m);
  }
  // 128-bit multiply-accumulate: sum_hi:sum_lo += n * m, propagating the
  // carry from the low half into the high half.  Clobbers rscratch1/2.
  void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
    wide_mul(rscratch1, rscratch2, n, m);
    __ adds(sum_lo, sum_lo, rscratch1);
    __ adc(sum_hi, sum_hi, rscratch2);
  }
10711 
10712   // Poly1305, RFC 7539
10713 
10714   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10715   // description of the tricks used to simplify and accelerate this
10716   // computation.
10717 
  // Poly1305 block processing stub.
  // Arguments (in c_rarg0..c_rarg3):
  //   input_start - pointer to the message bytes
  //   length      - number of message bytes
  //   acc_start   - the accumulator, long[5] with 26-bit limbs
  //   r_start     - the key R, long[5] with 26-bit limbs
  // Consumes 16-byte blocks from the input, updating the accumulator, and
  // writes the (partially reduced) accumulator back in 26-bit-limb form.
  address generate_poly1305_processBlocks() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    Label here;
    __ enter();
    RegSet callee_saved = RegSet::range(r19, r28);
    __ push(callee_saved, sp);

    // Allocate working registers from the argument/temporary range,
    // excluding the platform register and the scratch registers.
    RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();

    // Arguments
    const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;

    // R_n is the 128-bit randomly-generated key, packed into two
    // registers.  The caller passes this key to us as long[5], with
    // BITS_PER_LIMB = 26.
    const Register R_0 = *++regs, R_1 = *++regs;
    pack_26(R_0, R_1, r_start);

    // RR_n is (R_n >> 2) * 5
    const Register RR_0 = *++regs, RR_1 = *++regs;
    __ lsr(RR_0, R_0, 2);
    __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
    __ lsr(RR_1, R_1, 2);
    __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);

    // U_n is the current checksum
    const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
    pack_26(U_0, U_1, U_2, acc_start);

    static constexpr int BLOCK_LENGTH = 16;
    Label DONE, LOOP;

    __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
    __ br(Assembler::LT, DONE); {
      __ bind(LOOP);

      // S_n is to be the sum of U_n and the next block of data
      const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
      __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
      __ adds(S_0, U_0, S_0);
      __ adcs(S_1, U_1, S_1);
      __ adc(S_2, U_2, zr);
      __ add(S_2, S_2, 1);  // the high "1" bit appended to each block

      const Register U_0HI = *++regs, U_1HI = *++regs;

      // NB: this logic depends on some of the special properties of
      // Poly1305 keys. In particular, because we know that the top
      // four bits of R_0 and R_1 are zero, we can add together
      // partial products without any risk of needing to propagate a
      // carry out.
      wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
      wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
      __ andr(U_2, R_0, 3);
      __ mul(U_2, S_2, U_2);

      // Recycle registers S_0, S_1, S_2
      regs = (regs.remaining() + S_0 + S_1 + S_2).begin();

      // Partial reduction mod 2**130 - 5
      __ adds(U_1, U_0HI, U_1);
      __ adc(U_2, U_1HI, U_2);
      // Sum now in U_2:U_1:U_0.
      // Dead: U_0HI, U_1HI.
      regs = (regs.remaining() + U_0HI + U_1HI).begin();

      // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps

      // First, U_2:U_1:U_0 += (U_2 >> 2)
      __ lsr(rscratch1, U_2, 2);
      __ andr(U_2, U_2, (u8)3);
      __ adds(U_0, U_0, rscratch1);
      __ adcs(U_1, U_1, zr);
      __ adc(U_2, U_2, zr);
      // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
      __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
      __ adcs(U_1, U_1, zr);
      __ adc(U_2, U_2, zr);

      __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
      __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
      __ br(~ Assembler::LT, LOOP);  // loop while another full block remains
    }

    // Further reduce modulo 2^130 - 5
    __ lsr(rscratch1, U_2, 2);
    __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
    __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
    __ adcs(U_1, U_1, zr);
    __ andr(U_2, U_2, (u1)3);
    __ adc(U_2, U_2, zr);

    // Unpack the sum into five 26-bit limbs and write to memory.
    __ ubfiz(rscratch1, U_0, 0, 26);
    __ ubfx(rscratch2, U_0, 26, 26);
    __ stp(rscratch1, rscratch2, Address(acc_start));
    __ ubfx(rscratch1, U_0, 52, 12);
    __ bfi(rscratch1, U_1, 12, 14);
    __ ubfx(rscratch2, U_1, 14, 26);
    __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
    __ ubfx(rscratch1, U_1, 40, 24);
    __ bfi(rscratch1, U_2, 24, 3);
    __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));

    __ bind(DONE);
    __ pop(callee_saved, sp);
    __ leave();
    __ ret(lr);

    return start;
  }
10832 
  // Exception handler for upcall stubs.  On entry r0 holds the exception
  // oop.
  address generate_upcall_stub_exception_handler() {
    StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Native caller has no idea how to handle exceptions,
    // so we just crash here. Up to callee to catch exceptions.
    __ verify_oop(r0);
    __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
    __ blr(rscratch1);
    __ should_not_reach_here();

    return start;
  }
10848 
  // load Method* target of MethodHandle
  // j_rarg0 = jobject receiver
  // rmethod = result
  // Walks receiver.form.vmentry.method.vmtarget, i.e. the standard
  // MethodHandle -> LambdaForm -> MemberName -> ResolvedMethodName chain.
  address generate_upcall_stub_load_target() {
    StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
    // Load target method from receiver
    __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
    __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
    __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
    __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
                      Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
                      noreg, noreg);
    __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized

    __ ret(lr);

    return start;
  }
10871 
10872 #undef __
10873 #define __ masm->
10874 
10875   class MontgomeryMultiplyGenerator : public MacroAssembler {
10876 
10877     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10878       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10879 
10880     RegSet _toSave;
10881     bool _squaring;
10882 
10883   public:
    // Construct the generator and perform static register allocation.
    // When squaring, Pb_base aliases Pa_base (only one input array).
    // All registers are drawn from r0..r26, skipping the platform
    // register r18.
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
      Pa_base = *regs;       // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = *++regs;
      Pn_base = *++regs;
      Rlen= *++regs;
      inv = *++regs;
      Pm_base = *++regs;

      // Working registers:
      Ra =  *++regs;        // The current digit of a, b, n, and m.
      Rb =  *++regs;
      Rm =  *++regs;
      Rn =  *++regs;

      Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
      Pb =  *++regs;
      Pm =  *++regs;
      Pn =  *++regs;

      t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
      t2 =  *++regs;

      Ri =  *++regs;        // Inner and outer loop indexes.
      Rj =  *++regs;

      Rhi_ab = *++regs;     // Product registers: low and high parts
      Rlo_ab = *++regs;     // of a*b and m*n.
      Rhi_mn = *++regs;
      Rlo_mn = *++regs;

      // r19 and up are callee-saved.
      _toSave = RegSet::range(r19, *regs) + Pm_base;
    }
10926 
10927   private:
    // Push the callee-saved registers (and Pm_base) computed in the
    // constructor.
    void save_regs() {
      push(_toSave, sp);
    }
10931 
    // Pop the registers pushed by save_regs().
    void restore_regs() {
      pop(_toSave, sp);
    }
10935 
    // Emit `block` so that at run time it executes `count` times, with
    // the loop body unrolled twice.  If count is odd we enter at the
    // second copy (label `odd`) so each subsequent iteration of the loop
    // runs the body exactly twice.
    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);   // odd count: start with a single execution
      cbz(count, end);       // zero count: skip entirely
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }
10950 
    // As above, but for a member function taking three register
    // arguments (d, s, tmp), forwarded to each invocation.
    template <typename T>
    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)(d, s, tmp);
      bind(odd);
      (this->*block)(d, s, tmp);
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }
10965 
    // Set up the digit pointers and load the first digits for iteration
    // i of the first (lower-triangle) outer loop, and clear the pending
    // m*n product.
    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      ldr(Ra, Address(Pa_base));
      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rm, Address(Pm_base));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }
10989 
    // The core multiply-accumulate step of a Montgomery
    // multiplication.  The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used.  This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
    // Note: the m*n product started here is accumulated one step later
    // (or in post1/post2), which is why each step accumulates the
    // *previous* Rhi_mn:Rlo_mn.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }
11016 
    // Finish iteration i of the first outer loop: accumulate the final
    // a*b and pending m*n products, compute and store the new digit
    // m[i] = t0 * inv, fold in m[i]*n[0], and shift the accumulator
    // down one digit.
    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }
11061 
    // Set up the digit pointers and load the first digits for iteration
    // i of the second (upper-triangle) outer loop, where i >= len, and
    // clear the pending m*n product.
    void pre2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      if (i.is_register()) {
        sub(Rj, i.as_register(), len);
      } else {
        mov(Rj, i.as_constant());
        sub(Rj, Rj, len);
      }
      // Rj == i-len

      lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));

      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }
11094 
    // Finish iteration i of the second outer loop: fold in the pending
    // m*n product, store result digit Pm_base[i-len], and shift the
    // accumulator down one digit.
    void post2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("post2");
      if (i.is_constant()) {
        mov(Rj, i.as_constant()-len.as_constant());
      } else {
        sub(Rj, i.as_register(), len);
      }

      adds(t0, t0, Rlo_mn); // The pending m*n, low part

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = t0;
      str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

      // t0 = t1; t1 = t2; t2 = 0;
      adcs(t0, t1, Rhi_mn); // The pending m*n, high part
      adc(t1, t2, zr);
      mov(t2, zr);
    }
11115 
    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m.  We'll
    // keep doing that until there is no carry.  The inner loop performs
    // a full multi-digit subtract-with-borrow of n from m; t0 is then
    // decremented by the final borrow.
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          sbc(t0, t0, zr);
        } cbnz(t0, again);
      } bind(post);
    }
11145 
    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    // Copies `len` 64-bit words from the end of s backwards, rotating
    // each by 32 bits (see reverse1), which reverses the order of the
    // original 32-bit int elements.
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      // Temps must be caller-saved registers (below r19).
      assert(tmp1->encoding() < r19->encoding(), "register corruption");
      assert(tmp2->encoding() < r19->encoding(), "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // Helper for reverse(): copy one 64-bit word from *--s to *d++,
    // swapping its two 32-bit halves with a 32-bit rotate.
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }
11166 
    // Squaring variant of step(): each off-diagonal product counts
    // twice, so accumulate the a*b product a second time.
    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }
11172 
    // When squaring, the diagonal product (present only for even i) is
    // accumulated once, not twice; skip it entirely when i is odd.
    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        umulh(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
      } bind(dont);
    }
11185 
    // Squaring variant of a reduction-only step: accumulate the pending
    // m*n product and start the next one, advancing only the m and n
    // pointers.
    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }
11197 
    // Squaring variant of post1(): no final a*b product to add, but
    // otherwise identical — compute and store m[i] = t0 * inv, fold in
    // m[i]*n[0], and shift the accumulator down one digit.
    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }
11234 
    // Add the 128-bit product Rhi:Rlo into the triple-precision
    // accumulator t2:t1:t0.
    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }
11241 
11242   public:
11243     /**
11244      * Fast Montgomery multiplication.  The derivation of the
11245      * algorithm is in A Cryptographic Library for the Motorola
11246      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11247      *
11248      * Arguments:
11249      *
11250      * Inputs for multiplication:
11251      *   c_rarg0   - int array elements a
11252      *   c_rarg1   - int array elements b
11253      *   c_rarg2   - int array elements n (the modulus)
11254      *   c_rarg3   - int length
11255      *   c_rarg4   - int inv
11256      *   c_rarg5   - int array elements m (the result)
11257      *
11258      * Inputs for squaring:
11259      *   c_rarg0   - int array elements a
11260      *   c_rarg1   - int array elements n (the modulus)
11261      *   c_rarg2   - int length
11262      *   c_rarg3   - int inv
11263      *   c_rarg4   - int array elements m (the result)
11264      *
11265      */
11266     address generate_multiply() {
11267       Label argh, nothing;
11268       bind(argh);
11269       stop("MontgomeryMultiply total_allocation must be <= 8192");
11270 
11271       align(CodeEntryAlignment);
11272       address entry = pc();
11273 
11274       cbzw(Rlen, nothing);
11275 
11276       enter();
11277 
11278       // Make room.
11279       cmpw(Rlen, 512);
11280       br(Assembler::HI, argh);
11281       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11282       andr(sp, Ra, -2 * wordSize);
11283 
11284       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11285 
11286       {
11287         // Copy input args, reversing as we go.  We use Ra as a
11288         // temporary variable.
11289         reverse(Ra, Pa_base, Rlen, t0, t1);
11290         if (!_squaring)
11291           reverse(Ra, Pb_base, Rlen, t0, t1);
11292         reverse(Ra, Pn_base, Rlen, t0, t1);
11293       }
11294 
11295       // Push all call-saved registers and also Pm_base which we'll need
11296       // at the end.
11297       save_regs();
11298 
11299 #ifndef PRODUCT
11300       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11301       {
11302         ldr(Rn, Address(Pn_base, 0));
11303         mul(Rlo_mn, Rn, inv);
11304         subs(zr, Rlo_mn, -1);
11305         Label ok;
11306         br(EQ, ok); {
11307           stop("broken inverse in Montgomery multiply");
11308         } bind(ok);
11309       }
11310 #endif
11311 
11312       mov(Pm_base, Ra);
11313 
11314       mov(t0, zr);
11315       mov(t1, zr);
11316       mov(t2, zr);
11317 
11318       block_comment("for (int i = 0; i < len; i++) {");
11319       mov(Ri, zr); {
11320         Label loop, end;
11321         cmpw(Ri, Rlen);
11322         br(Assembler::GE, end);
11323 
11324         bind(loop);
11325         pre1(Ri);
11326 
11327         block_comment("  for (j = i; j; j--) {"); {
11328           movw(Rj, Ri);
11329           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11330         } block_comment("  } // j");
11331 
11332         post1();
11333         addw(Ri, Ri, 1);
11334         cmpw(Ri, Rlen);
11335         br(Assembler::LT, loop);
11336         bind(end);
11337         block_comment("} // i");
11338       }
11339 
11340       block_comment("for (int i = len; i < 2*len; i++) {");
11341       mov(Ri, Rlen); {
11342         Label loop, end;
11343         cmpw(Ri, Rlen, Assembler::LSL, 1);
11344         br(Assembler::GE, end);
11345 
11346         bind(loop);
11347         pre2(Ri, Rlen);
11348 
11349         block_comment("  for (j = len*2-i-1; j; j--) {"); {
11350           lslw(Rj, Rlen, 1);
11351           subw(Rj, Rj, Ri);
11352           subw(Rj, Rj, 1);
11353           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11354         } block_comment("  } // j");
11355 
11356         post2(Ri, Rlen);
11357         addw(Ri, Ri, 1);
11358         cmpw(Ri, Rlen, Assembler::LSL, 1);
11359         br(Assembler::LT, loop);
11360         bind(end);
11361       }
11362       block_comment("} // i");
11363 
11364       normalize(Rlen);
11365 
11366       mov(Ra, Pm_base);  // Save Pm_base in Ra
11367       restore_regs();  // Restore caller's Pm_base
11368 
11369       // Copy our result into caller's Pm_base
11370       reverse(Pm_base, Ra, Rlen, t0, t1);
11371 
11372       leave();
11373       bind(nothing);
11374       ret(lr);
11375 
11376       return entry;
11377     }
11378     // In C, approximately:
11379 
11380     // void
11381     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11382     //                     julong Pn_base[], julong Pm_base[],
11383     //                     julong inv, int len) {
11384     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11385     //   julong *Pa, *Pb, *Pn, *Pm;
11386     //   julong Ra, Rb, Rn, Rm;
11387 
11388     //   int i;
11389 
11390     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11391 
11392     //   for (i = 0; i < len; i++) {
11393     //     int j;
11394 
11395     //     Pa = Pa_base;
11396     //     Pb = Pb_base + i;
11397     //     Pm = Pm_base;
11398     //     Pn = Pn_base + i;
11399 
11400     //     Ra = *Pa;
11401     //     Rb = *Pb;
11402     //     Rm = *Pm;
11403     //     Rn = *Pn;
11404 
11405     //     int iters = i;
11406     //     for (j = 0; iters--; j++) {
11407     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11408     //       MACC(Ra, Rb, t0, t1, t2);
11409     //       Ra = *++Pa;
11410     //       Rb = *--Pb;
11411     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11412     //       MACC(Rm, Rn, t0, t1, t2);
11413     //       Rm = *++Pm;
11414     //       Rn = *--Pn;
11415     //     }
11416 
11417     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11418     //     MACC(Ra, Rb, t0, t1, t2);
11419     //     *Pm = Rm = t0 * inv;
11420     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11421     //     MACC(Rm, Rn, t0, t1, t2);
11422 
11423     //     assert(t0 == 0, "broken Montgomery multiply");
11424 
11425     //     t0 = t1; t1 = t2; t2 = 0;
11426     //   }
11427 
11428     //   for (i = len; i < 2*len; i++) {
11429     //     int j;
11430 
11431     //     Pa = Pa_base + i-len;
11432     //     Pb = Pb_base + len;
11433     //     Pm = Pm_base + i-len;
11434     //     Pn = Pn_base + len;
11435 
11436     //     Ra = *++Pa;
11437     //     Rb = *--Pb;
11438     //     Rm = *++Pm;
11439     //     Rn = *--Pn;
11440 
11441     //     int iters = len*2-i-1;
11442     //     for (j = i-len+1; iters--; j++) {
11443     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11444     //       MACC(Ra, Rb, t0, t1, t2);
11445     //       Ra = *++Pa;
11446     //       Rb = *--Pb;
11447     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11448     //       MACC(Rm, Rn, t0, t1, t2);
11449     //       Rm = *++Pm;
11450     //       Rn = *--Pn;
11451     //     }
11452 
11453     //     Pm_base[i-len] = t0;
11454     //     t0 = t1; t1 = t2; t2 = 0;
11455     //   }
11456 
11457     //   while (t0)
11458     //     t0 = sub(Pm_base, Pn_base, t0, len);
11459     // }
11460 
11461     /**
11462      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
11463      * multiplies than Montgomery multiplication so it should be up to
11464      * 25% faster.  However, its loop control is more complex and it
11465      * may actually run slower on some machines.
11466      *
11467      * Arguments:
11468      *
11469      * Inputs:
11470      *   c_rarg0   - int array elements a
11471      *   c_rarg1   - int array elements n (the modulus)
11472      *   c_rarg2   - int length
11473      *   c_rarg3   - int inv
11474      *   c_rarg4   - int array elements m (the result)
11475      *
11476      */
    address generate_square() {
      // Out-of-line failure path.  It is emitted *before* the entry
      // point (before 'entry = pc()') so that the forward branch in
      // the size check below has a target; it is only reached when the
      // requested allocation would be too large.
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      // Rlen is the operand length in jints.  We allocate
      // Rlen * 4 * sizeof(jint) == Rlen * 16 bytes of scratch below sp,
      // so Rlen must be <= 512 to stay within the 8192-byte budget
      // checked by the stop() above.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);   // round sp down to a 16-byte boundary

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      // Our working copy of the product lives in the scratch area just
      // allocated; its (aligned) address was computed into Ra above.
      mov(Pm_base, Ra);

      // Zero the triple-precision accumulator t2:t1:t0.
      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      // First half: columns 0 .. len-1 of the product.
      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        // Squaring needs only about half the multiplies: the products
        // Pa[j]*Pa[i-j] and Pa[i-j]*Pa[j] are equal, so step_squaring
        // accumulates each distinct pair once, doubled (see the
        // MACC2 calls in the C sketch below).
        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        // Remaining i/2 iterations accumulate only the modulus terms.
        block_comment("  for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      // Second half: columns len .. 2*len-1; the window of partial
      // products shrinks as i grows.
      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);   // compare Ri with 2*len
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        // Rj = (2*len - i - 1) / 2 iterations of the doubled step.
        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        // Rj = (2*len - i) / 2 iterations of the modulus-only step.
        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      // Conditionally subtract the modulus until the result is fully
      // reduced, cf. "while (t0) t0 = sub(...)" in the C sketch below.
      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
11591     // In C, approximately:
11592 
11593     // void
11594     // montgomery_square(julong Pa_base[], julong Pn_base[],
11595     //                   julong Pm_base[], julong inv, int len) {
11596     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11597     //   julong *Pa, *Pb, *Pn, *Pm;
11598     //   julong Ra, Rb, Rn, Rm;
11599 
11600     //   int i;
11601 
11602     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11603 
11604     //   for (i = 0; i < len; i++) {
11605     //     int j;
11606 
11607     //     Pa = Pa_base;
11608     //     Pb = Pa_base + i;
11609     //     Pm = Pm_base;
11610     //     Pn = Pn_base + i;
11611 
11612     //     Ra = *Pa;
11613     //     Rb = *Pb;
11614     //     Rm = *Pm;
11615     //     Rn = *Pn;
11616 
11617     //     int iters = (i+1)/2;
11618     //     for (j = 0; iters--; j++) {
11619     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11620     //       MACC2(Ra, Rb, t0, t1, t2);
11621     //       Ra = *++Pa;
11622     //       Rb = *--Pb;
11623     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11624     //       MACC(Rm, Rn, t0, t1, t2);
11625     //       Rm = *++Pm;
11626     //       Rn = *--Pn;
11627     //     }
11628     //     if ((i & 1) == 0) {
11629     //       assert(Ra == Pa_base[j], "must be");
11630     //       MACC(Ra, Ra, t0, t1, t2);
11631     //     }
11632     //     iters = i/2;
11633     //     assert(iters == i-j, "must be");
11634     //     for (; iters--; j++) {
11635     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11636     //       MACC(Rm, Rn, t0, t1, t2);
11637     //       Rm = *++Pm;
11638     //       Rn = *--Pn;
11639     //     }
11640 
11641     //     *Pm = Rm = t0 * inv;
11642     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11643     //     MACC(Rm, Rn, t0, t1, t2);
11644 
11645     //     assert(t0 == 0, "broken Montgomery multiply");
11646 
11647     //     t0 = t1; t1 = t2; t2 = 0;
11648     //   }
11649 
11650     //   for (i = len; i < 2*len; i++) {
11651     //     int start = i-len+1;
11652     //     int end = start + (len - start)/2;
11653     //     int j;
11654 
11655     //     Pa = Pa_base + i-len;
11656     //     Pb = Pa_base + len;
11657     //     Pm = Pm_base + i-len;
11658     //     Pn = Pn_base + len;
11659 
11660     //     Ra = *++Pa;
11661     //     Rb = *--Pb;
11662     //     Rm = *++Pm;
11663     //     Rn = *--Pn;
11664 
11665     //     int iters = (2*len-i-1)/2;
11666     //     assert(iters == end-start, "must be");
11667     //     for (j = start; iters--; j++) {
11668     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11669     //       MACC2(Ra, Rb, t0, t1, t2);
11670     //       Ra = *++Pa;
11671     //       Rb = *--Pb;
11672     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11673     //       MACC(Rm, Rn, t0, t1, t2);
11674     //       Rm = *++Pm;
11675     //       Rn = *--Pn;
11676     //     }
11677     //     if ((i & 1) == 0) {
11678     //       assert(Ra == Pa_base[j], "must be");
11679     //       MACC(Ra, Ra, t0, t1, t2);
11680     //     }
11681     //     iters =  (2*len-i)/2;
11682     //     assert(iters == len-j, "must be");
11683     //     for (; iters--; j++) {
11684     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11685     //       MACC(Rm, Rn, t0, t1, t2);
11686     //       Rm = *++Pm;
11687     //       Rn = *--Pn;
11688     //     }
11689     //     Pm_base[i-len] = t0;
11690     //     t0 = t1; t1 = t2; t2 = 0;
11691     //   }
11692 
11693     //   while (t0)
11694     //     t0 = sub(Pm_base, Pn_base, t0, len);
11695     // }
11696   };
11697 
  // Call here from the interpreter or compiled code to either load
  // multiple returned values from the inline type instance being
  // returned to registers or to store returned values to a newly
  // allocated inline type instance.
  //
  //   destination - SharedRuntime entry, invoked as
  //                 destination(current thread, r0)
  //   name        - stub name (used for the CodeBuffer and RuntimeStub)
  //   has_res     - if true, load the oop result left by the runtime
  //                 call into r0 before returning
  //
  // Returns the entry point of the newly created RuntimeStub.
  address generate_return_value_stub(address destination, const char* name, bool has_res) {
    // We need to save all registers the calling convention may use so
    // the runtime calls read or update those registers. This needs to
    // be in sync with SharedRuntime::java_return_convention().
    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    //
    // Frame layout in 32-bit VMReg slots, lowest address first; each
    // 64-bit register occupies two slots (<reg>_off, <reg>_2).  The
    // stp/ldp sequences below must mirror this layout exactly.
    enum layout {
      j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
      j_rarg6_off, j_rarg6_2,
      j_rarg5_off, j_rarg5_2,
      j_rarg4_off, j_rarg4_2,
      j_rarg3_off, j_rarg3_2,
      j_rarg2_off, j_rarg2_2,
      j_rarg1_off, j_rarg1_2,
      j_rarg0_off, j_rarg0_2,

      j_farg7_off, j_farg7_2,
      j_farg6_off, j_farg6_2,
      j_farg5_off, j_farg5_2,
      j_farg4_off, j_farg4_2,
      j_farg3_off, j_farg3_2,
      j_farg2_off, j_farg2_2,
      j_farg1_off, j_farg1_2,
      j_farg0_off, j_farg0_2,

      rfp_off, rfp_off2,
      return_off, return_off2,

      framesize // inclusive of return address
    };

    CodeBuffer code(name, 512, 64);
    MacroAssembler* masm = new MacroAssembler(&code);

    // framesize is even (each register uses two slots), so this is
    // already 16-byte aligned; the assert double-checks.
    int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
    assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
    int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
    int frame_size_in_words = frame_size_in_bytes / wordSize;

    // Record where each saved Java return register lives in the frame
    // so the GC can find/update any oops held in them during the call.
    OopMapSet* oop_maps = new OopMapSet();
    OopMap* map = new OopMap(frame_size_in_slots, 0);

    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());

    map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());

    address start = __ pc();

    __ enter(); // Save FP and LR before call

    // Push the Java FP return registers, then the integer ones.  stp
    // stores its first operand at the lower address, so after these
    // eight pushes j_rarg7 sits at [sp] -- matching j_rarg7_off == 0
    // in the layout enum above.
    __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));

    __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));

    int frame_complete = __ offset();

    // Set up last_Java_sp and last_Java_fp
    // the_pc doubles as last_Java_pc and as the PC the GC map is
    // registered against below.
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);

    // Call runtime
    // Invoked as destination(thread, r0); r0 holds the incoming
    // inline-type value/buffer from the Java return convention.
    __ mov(c_rarg1, r0);
    __ mov(c_rarg0, rthread);

    __ mov(rscratch1, destination);
    __ blr(rscratch1);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(false);

    // Pop in exact reverse order of the pushes above.
    __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));

    __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));

    __ leave();

    // check for pending exceptions
    Label pending;
    __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
    __ cbnz(rscratch1, pending);

    // Fetch the oop result the runtime call stashed on the thread.
    if (has_res) {
      __ get_vm_result_oop(r0, rthread);
    }

    __ ret(lr);

    // An exception is pending: hand off to the shared forwarding stub.
    __ bind(pending);
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

    // -------------
    // make sure all code is generated
    masm->flush();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
    return stub->entry_point();
  }
11825 
11826   // Initialization
11827   void generate_preuniverse_stubs() {
11828     // preuniverse stubs are not needed for aarch64
11829   }
11830 
  void generate_initial_stubs() {
    // Generate initial stubs and initializes the entry points

    // entry points that exist in all platforms Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    // Generated first: later stubs (e.g. generate_return_value_stub)
    // far_jump to this entry to propagate pending exceptions.
    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Initialize table for copy memory (arraycopy) check.
    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    // Checksum intrinsics.
    if (UseCRC32Intrinsics) {
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // sin and cos share one generator, parameterized on isCos.
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    // The float16 conversion stubs are only useful as a pair, so both
    // intrinsics must be available before either stub is generated.
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }

    // Stubs used when inline-type values are returned in registers
    // rather than as a heap-allocated instance.
    if (InlineTypeReturnedAsFields) {
      StubRoutines::_load_inline_type_fields_in_regs =
         generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
      StubRoutines::_store_inline_type_fields_to_buf =
         generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
    }

  }
11883 
  void generate_continuation_stubs() {
    // Continuation stubs: thaw, the return barriers (normal and
    // exceptional) and the preemption stub.  All are generated
    // unconditionally on aarch64.
    StubRoutines::_cont_thaw          = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
  }
11891 
  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    // Upcall (native-to-Java) support stubs.
    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

    // On Linux builds that do not advertise the ARMv8.1 atomics
    // extension (__ARM_FEATURE_ATOMICS), generate out-of-line atomic
    // entry points; see the DEFAULT_ATOMIC_OP definitions at the end
    // of this file for the startup-time defaults.
#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (! InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_setMemory)) {
      StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
    }

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }
11929 
  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    // Vector iota-index table; only generated when SVE is not in use.
    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // arrays_hascode stub for large arrays.
    // One stub per element type that Arrays.hashCode handles.
    StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
    StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
    StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
    StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
    StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    // String comparison and indexOf helper stubs.
    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    // BigInteger arithmetic intrinsics (C2 only).
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubId stub_id = StubId::stubgen_montgomerySquare_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    // Post-quantum crypto: ML-KEM (Kyber) intrinsics.
    if (UseKyberIntrinsics) {
      StubRoutines::_kyberNtt = generate_kyberNtt();
      StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
      StubRoutines::_kyberNttMult = generate_kyberNttMult();
      StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
      StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
      StubRoutines::_kyber12To16 = generate_kyber12To16();
      StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
    }

    // Post-quantum crypto: ML-DSA (Dilithium) intrinsics.
    if (UseDilithiumIntrinsics) {
      StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
      StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
      StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
      StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
      StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
    }

    if (UseBASE64Intrinsics) {
        StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
        StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    // AES-GCM needs both the AES and the GHASH primitives.
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    // Message digests: each flavor gets a single-block and a
    // multi-block ("MB") compress stub from one parameterized generator.
    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
      StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
    }
    if (UseSHA3Intrinsics) {

      StubRoutines::_double_keccak         = generate_double_keccak();
      // SHA3 has both a SIMD and a general-purpose-register generator.
      if (UseSIMDForSHA3Intrinsic) {
         StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
         StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
      } else {
         StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
         StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
      }
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

#endif // COMPILER2_OR_JVMCI
  }
12080 
12081  public:
12082   StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
12083     switch(blob_id) {
12084     case BlobId::stubgen_preuniverse_id:
12085       generate_preuniverse_stubs();
12086       break;
12087     case BlobId::stubgen_initial_id:
12088       generate_initial_stubs();
12089       break;
12090      case BlobId::stubgen_continuation_id:
12091       generate_continuation_stubs();
12092       break;
12093     case BlobId::stubgen_compiler_id:
12094       generate_compiler_stubs();
12095       break;
12096     case BlobId::stubgen_final_id:
12097       generate_final_stubs();
12098       break;
12099     default:
12100       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12101       break;
12102     };
12103   }
12104 }; // end class declaration
12105 
12106 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
12107   StubGenerator g(code, blob_id);
12108 }
12109 
12110 
#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

// For each (operation, size, memory-order) triple, this macro
//   1) declares the hand-written assembly fallback
//      aarch64_atomic_<op>_<size><order>_default_impl, and
//   2) defines the function pointer aarch64_atomic_<op>_<size><order>_impl
//      initialized to that fallback.
// The pointers keep atomics usable from VM startup;
// generate_atomic_entry_points() (called from generate_final_stubs on
// Linux builds without __ARM_FEATURE_ATOMICS) can later install
// generated code through these same pointers.
#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

// Arguments: operation name, operand size in bytes, memory-order
// suffix (empty = default ordering).
DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX