1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "code/aotCodeCache.hpp"
   31 #include "compiler/oopMap.hpp"
   32 #include "gc/shared/barrierSet.hpp"
   33 #include "gc/shared/barrierSetAssembler.hpp"
   34 #include "gc/shared/gc_globals.hpp"
   35 #include "gc/shared/tlab_globals.hpp"
   36 #include "interpreter/interpreter.hpp"
   37 #include "memory/universe.hpp"
   38 #include "nativeInst_aarch64.hpp"
   39 #include "oops/instanceOop.hpp"
   40 #include "oops/method.hpp"
   41 #include "oops/objArrayKlass.hpp"
   42 #include "oops/oop.inline.hpp"
   43 #include "prims/methodHandles.hpp"
   44 #include "prims/upcallLinker.hpp"
   45 #include "runtime/arguments.hpp"
   46 #include "runtime/atomicAccess.hpp"
   47 #include "runtime/continuation.hpp"
   48 #include "runtime/continuationEntry.inline.hpp"
   49 #include "runtime/frame.inline.hpp"
   50 #include "runtime/handles.inline.hpp"
   51 #include "runtime/javaThread.hpp"
   52 #include "runtime/sharedRuntime.hpp"
   53 #include "runtime/stubCodeGenerator.hpp"
   54 #include "runtime/stubRoutines.hpp"
   55 #include "utilities/align.hpp"
   56 #include "utilities/checkedCast.hpp"
   57 #include "utilities/debug.hpp"
   58 #include "utilities/globalDefinitions.hpp"
   59 #include "utilities/intpow.hpp"
   60 #include "utilities/powerOfTwo.hpp"
   61 #ifdef COMPILER2
   62 #include "opto/runtime.hpp"
   63 #endif
   64 #if INCLUDE_ZGC
   65 #include "gc/z/zThreadLocalData.hpp"
   66 #endif
   67 
   68 // Declaration and definition of StubGenerator (no .hpp file).
   69 // For a more detailed description of the stub routine structure
   70 // see the comment in stubRoutines.hpp
   71 
   72 #undef __
   73 #define __ _masm->
   74 
   75 #ifdef PRODUCT
   76 #define BLOCK_COMMENT(str) /* nothing */
   77 #else
   78 #define BLOCK_COMMENT(str) __ block_comment(str)
   79 #endif
   80 
   81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   82 
   83 // Stub Code definitions
   84 
   85 class StubGenerator: public StubCodeGenerator {
   86  private:
   87 
   88 #ifdef PRODUCT
   89 #define inc_counter_np(counter) ((void)0)
   90 #else
   91   void inc_counter_np_(uint& counter) {
   92     __ incrementw(ExternalAddress((address)&counter));
   93   }
   94 #define inc_counter_np(counter) \
   95   BLOCK_COMMENT("inc_counter " #counter); \
   96   inc_counter_np_(counter);
   97 #endif
   98 
   99   // Call stubs are used to call Java from C
  100   //
  101   // Arguments:
  102   //    c_rarg0:   call wrapper address                   address
  103   //    c_rarg1:   result                                 address
  104   //    c_rarg2:   result type                            BasicType
  105   //    c_rarg3:   method                                 Method*
  106   //    c_rarg4:   (interpreter) entry point              address
  107   //    c_rarg5:   parameters                             intptr_t*
  108   //    c_rarg6:   parameter size (in words)              int
  109   //    c_rarg7:   thread                                 Thread*
  110   //
  111   // There is no return from the stub itself as any Java result
  112   // is written to result
  113   //
  114   // we save r30 (lr) as the return PC at the base of the frame and
  115   // link r29 (fp) below it as the frame pointer, installing sp (r31)
  116   // into fp.
  117   //
  118   // we save r0-r7, which accounts for all the c arguments.
  119   //
  120   // TODO: strictly do we need to save them all? they are treated as
  121   // volatile by C so could we omit saving the ones we are going to
  122   // place in global registers (thread? method?) or those we only use
  123   // during setup of the Java call?
  124   //
  125   // we don't need to save r8 which C uses as an indirect result location
  126   // return register.
  127   //
  128   // we don't need to save r9-r15 which both C and Java treat as
  129   // volatile
  130   //
  131   // we don't need to save r16-18 because Java does not use them
  132   //
  133   // we save r19-r28 which Java uses as scratch registers and C
  134   // expects to be callee-save
  135   //
  136   // we save the bottom 64 bits of each value stored in v8-v15; it is
  137   // the responsibility of the caller to preserve larger values.
  138   //
  139   // so the stub frame looks like this when we enter Java code
  140   //
  141   //     [ return_from_Java     ] <--- sp
  142   //     [ argument word n      ]
  143   //      ...
  144   // -29 [ argument word 1      ]
  145   // -28 [ saved Floating-point Control Register ]
  146   // -26 [ saved v15            ] <--- sp_after_call
  147   // -25 [ saved v14            ]
  148   // -24 [ saved v13            ]
  149   // -23 [ saved v12            ]
  150   // -22 [ saved v11            ]
  151   // -21 [ saved v10            ]
  152   // -20 [ saved v9             ]
  153   // -19 [ saved v8             ]
  154   // -18 [ saved r28            ]
  155   // -17 [ saved r27            ]
  156   // -16 [ saved r26            ]
  157   // -15 [ saved r25            ]
  158   // -14 [ saved r24            ]
  159   // -13 [ saved r23            ]
  160   // -12 [ saved r22            ]
  161   // -11 [ saved r21            ]
  162   // -10 [ saved r20            ]
  163   //  -9 [ saved r19            ]
  164   //  -8 [ call wrapper    (r0) ]
  165   //  -7 [ result          (r1) ]
  166   //  -6 [ result type     (r2) ]
  167   //  -5 [ method          (r3) ]
  168   //  -4 [ entry point     (r4) ]
  169   //  -3 [ parameters      (r5) ]
  170   //  -2 [ parameter size  (r6) ]
  171   //  -1 [ thread (r7)          ]
  172   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  173   //   1 [ saved lr       (r30) ]
  174 
  175   // Call stub stack layout word offsets from fp
  176   enum call_stub_layout {
  177     sp_after_call_off  = -28,
  178 
  179     fpcr_off           = sp_after_call_off,
  180     d15_off            = -26,
  181     d13_off            = -24,
  182     d11_off            = -22,
  183     d9_off             = -20,
  184 
  185     r28_off            = -18,
  186     r26_off            = -16,
  187     r24_off            = -14,
  188     r22_off            = -12,
  189     r20_off            = -10,
  190     call_wrapper_off   =  -8,
  191     result_off         =  -7,
  192     result_type_off    =  -6,
  193     method_off         =  -5,
  194     entry_point_off    =  -4,
  195     parameter_size_off =  -2,
  196     thread_off         =  -1,
  197     fp_f               =   0,
  198     retaddr_off        =   1,
  199   };
  200 
  201   address generate_call_stub(address& return_address) {
  202     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  203            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  204            "adjust this code");
  205 
  206     StubId stub_id = StubId::stubgen_call_stub_id;
  207     StubCodeMark mark(this, stub_id);
  208     address start = __ pc();
  209 
  210     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  211 
  212     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  213     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  214     const Address result        (rfp, result_off         * wordSize);
  215     const Address result_type   (rfp, result_type_off    * wordSize);
  216     const Address method        (rfp, method_off         * wordSize);
  217     const Address entry_point   (rfp, entry_point_off    * wordSize);
  218     const Address parameter_size(rfp, parameter_size_off * wordSize);
  219 
  220     const Address thread        (rfp, thread_off         * wordSize);
  221 
  222     const Address d15_save      (rfp, d15_off * wordSize);
  223     const Address d13_save      (rfp, d13_off * wordSize);
  224     const Address d11_save      (rfp, d11_off * wordSize);
  225     const Address d9_save       (rfp, d9_off * wordSize);
  226 
  227     const Address r28_save      (rfp, r28_off * wordSize);
  228     const Address r26_save      (rfp, r26_off * wordSize);
  229     const Address r24_save      (rfp, r24_off * wordSize);
  230     const Address r22_save      (rfp, r22_off * wordSize);
  231     const Address r20_save      (rfp, r20_off * wordSize);
  232 
  233     // stub code
  234 
  235     address aarch64_entry = __ pc();
  236 
  237     // set up frame and move sp to end of save area
  238     __ enter();
  239     __ sub(sp, rfp, -sp_after_call_off * wordSize);
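          // sp now points at the bottom of the register save area (the sp_after_call slot)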
  240 
  241     // save register parameters and Java scratch/global registers
  242     // n.b. we save thread even though it gets installed in
  243     // rthread because we want to sanity check rthread later
  244     __ str(c_rarg7,  thread);
  245     __ strw(c_rarg6, parameter_size);
  246     __ stp(c_rarg4, c_rarg5,  entry_point);
  247     __ stp(c_rarg2, c_rarg3,  result_type);
  248     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  249 
  250     __ stp(r20, r19,   r20_save);
  251     __ stp(r22, r21,   r22_save);
  252     __ stp(r24, r23,   r24_save);
  253     __ stp(r26, r25,   r26_save);
  254     __ stp(r28, r27,   r28_save);
  255 
  256     __ stpd(v9,  v8,   d9_save);
  257     __ stpd(v11, v10,  d11_save);
  258     __ stpd(v13, v12,  d13_save);
  259     __ stpd(v15, v14,  d15_save);
  260 
  261     __ get_fpcr(rscratch1);
  262     __ str(rscratch1, fpcr_save);
  263     // Set FPCR to the state we need. We do want Round to Nearest. We
  264     // don't want non-IEEE rounding modes or floating-point traps.
  265     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  266     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  267     __ set_fpcr(rscratch1);
  268 
  269     // install Java thread in global register now we have saved
  270     // whatever value it held
  271     __ mov(rthread, c_rarg7);
  272     // And method
  273     __ mov(rmethod, c_rarg3);
  274 
  275     // set up the heapbase register
  276     __ reinit_heapbase();
  277 
  278 #ifdef ASSERT
  279     // make sure we have no pending exceptions
  280     {
  281       Label L;
  282       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  283       __ cmp(rscratch1, (u1)NULL_WORD);
  284       __ br(Assembler::EQ, L);
  285       __ stop("StubRoutines::call_stub: entered with pending exception");
  286       __ BIND(L);
  287     }
  288 #endif
  289     // pass parameters if any
  290     __ mov(esp, sp);
  291     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  292     __ andr(sp, rscratch1, -2 * wordSize);
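          // sp is now rounded down to a 16-byte boundary with space reserved
          // for the parameters (the AArch64 sp must stay 16-byte aligned)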
  293 
  294     BLOCK_COMMENT("pass parameters if any");
  295     Label parameters_done;
  296     // parameter count is still in c_rarg6
  297     // and parameter pointer identifying param 1 is in c_rarg5
  298     __ cbzw(c_rarg6, parameters_done);
  299 
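          // copy each parameter word onto the stack: c_rarg5 walks up through
          // the parameter array while c_rarg6 counts down to zero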
  300     address loop = __ pc();
  301     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  302     __ subsw(c_rarg6, c_rarg6, 1);
  303     __ push(rscratch1);
  304     __ br(Assembler::GT, loop);
  305 
  306     __ BIND(parameters_done);
  307 
  308     // call Java entry -- passing Method* and current sp
  309     //      rmethod: Method*
  310     //      r19_sender_sp: sender sp
  311     BLOCK_COMMENT("call Java function");
  312     __ mov(r19_sender_sp, sp);
  313     __ blr(c_rarg4);
  314 
  315     // we do this here because the notify will already have been done
  316     // if we get to the next instruction via an exception
  317     //
  318     // n.b. adding this instruction here affects the calculation of
  319     // whether or not a routine returns to the call stub (used when
  320     // doing stack walks) since the normal test is to check the return
  321     // pc against the address saved below. so we may need to allow for
  322     // this extra instruction in the check.
  323 
  324     // save current address for use by exception handling code
  325 
  326     return_address = __ pc();
  327 
  328     // store result depending on type (everything that is not
  329     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  330     // n.b. this assumes Java returns an integral result in r0
  331     // and a floating result in j_farg0
  332     __ ldr(j_rarg2, result);
  333     Label is_long, is_float, is_double, exit;
  334     __ ldr(j_rarg1, result_type);
  335     __ cmp(j_rarg1, (u1)T_OBJECT);
  336     __ br(Assembler::EQ, is_long);
  337     __ cmp(j_rarg1, (u1)T_LONG);
  338     __ br(Assembler::EQ, is_long);
  339     __ cmp(j_rarg1, (u1)T_FLOAT);
  340     __ br(Assembler::EQ, is_float);
  341     __ cmp(j_rarg1, (u1)T_DOUBLE);
  342     __ br(Assembler::EQ, is_double);
  343 
  344     // handle T_INT case
  345     __ strw(r0, Address(j_rarg2));
  346 
  347     __ BIND(exit);
  348 
  349     // pop parameters
  350     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  351 
  352 #ifdef ASSERT
  353     // verify that threads correspond
  354     {
  355       Label L, S;
  356       __ ldr(rscratch1, thread);
  357       __ cmp(rthread, rscratch1);
  358       __ br(Assembler::NE, S);
  359       __ get_thread(rscratch1);
  360       __ cmp(rthread, rscratch1);
  361       __ br(Assembler::EQ, L);
  362       __ BIND(S);
  363       __ stop("StubRoutines::call_stub: threads must correspond");
  364       __ BIND(L);
  365     }
  366 #endif
  367 
  368     __ pop_cont_fastpath(rthread);
  369 
  370     // restore callee-save registers
  371     __ ldpd(v15, v14,  d15_save);
  372     __ ldpd(v13, v12,  d13_save);
  373     __ ldpd(v11, v10,  d11_save);
  374     __ ldpd(v9,  v8,   d9_save);
  375 
  376     __ ldp(r28, r27,   r28_save);
  377     __ ldp(r26, r25,   r26_save);
  378     __ ldp(r24, r23,   r24_save);
  379     __ ldp(r22, r21,   r22_save);
  380     __ ldp(r20, r19,   r20_save);
  381 
  382     // restore fpcr
  383     __ ldr(rscratch1,  fpcr_save);
  384     __ set_fpcr(rscratch1);
  385 
  386     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  387     __ ldrw(c_rarg2, result_type);
  388     __ ldr(c_rarg3,  method);
  389     __ ldp(c_rarg4, c_rarg5,  entry_point);
  390     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  391 
  392     // leave frame and return to caller
  393     __ leave();
  394     __ ret(lr);
  395 
  396     // handle return types different from T_INT
  397 
  398     __ BIND(is_long);
  399     __ str(r0, Address(j_rarg2, 0));
  400     __ br(Assembler::AL, exit);
  401 
  402     __ BIND(is_float);
  403     __ strs(j_farg0, Address(j_rarg2, 0));
  404     __ br(Assembler::AL, exit);
  405 
  406     __ BIND(is_double);
  407     __ strd(j_farg0, Address(j_rarg2, 0));
  408     __ br(Assembler::AL, exit);
  409 
  410     return start;
  411   }
  412 
  413   // Return point for a Java call if there's an exception thrown in
  414   // Java code.  The exception is caught and transformed into a
  415   // pending exception stored in JavaThread that can be tested from
  416   // within the VM.
  417   //
  418   // Note: Usually the parameters are removed by the callee. In case
  419   // of an exception crossing an activation frame boundary, that is
  420   // not the case if the callee is compiled code => need to set up the
  421   // sp.
  422   //
  423   // r0: exception oop
  424 
  425   address generate_catch_exception() {
  426     StubId stub_id = StubId::stubgen_catch_exception_id;
  427     StubCodeMark mark(this, stub_id);
  428     address start = __ pc();
  429 
  430     // same as in generate_call_stub():
  431     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  432     const Address thread        (rfp, thread_off         * wordSize);
  433 
  434 #ifdef ASSERT
  435     // verify that threads correspond
  436     {
  437       Label L, S;
  438       __ ldr(rscratch1, thread);
  439       __ cmp(rthread, rscratch1);
  440       __ br(Assembler::NE, S);
  441       __ get_thread(rscratch1);
  442       __ cmp(rthread, rscratch1);
  443       __ br(Assembler::EQ, L);
  444       __ bind(S);
  445       __ stop("StubRoutines::catch_exception: threads must correspond");
  446       __ bind(L);
  447     }
  448 #endif
  449 
  450     // set pending exception
  451     __ verify_oop(r0);
  452 
  453     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  454     __ mov(rscratch1, (address)__FILE__);
  455     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  456     __ movw(rscratch1, (int)__LINE__);
  457     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  458 
  459     // complete return to VM
  460     assert(StubRoutines::_call_stub_return_address != nullptr,
  461            "_call_stub_return_address must have been generated before");
  462     __ b(StubRoutines::_call_stub_return_address);
  463 
  464     return start;
  465   }
  466 
  467   // Continuation point for runtime calls returning with a pending
  468   // exception.  The pending exception check happened in the runtime
  469   // or native call stub.  The pending exception in Thread is
  470   // converted into a Java-level exception.
  471   //
  472   // Contract with Java-level exception handlers:
  473   // r0: exception
  474   // r3: throwing pc
  475   //
  476   // NOTE: At entry of this stub, exception-pc must be in LR !!
  477 
  478   // NOTE: this is always used as a jump target within generated code
  479   // so it just needs to be generated code with no prolog
  480 
  481   address generate_forward_exception() {
  482     StubId stub_id = StubId::stubgen_forward_exception_id;
  483     StubCodeMark mark(this, stub_id);
  484     address start = __ pc();
  485 
  486     // Upon entry, LR points to the return address returning into
  487     // Java (interpreted or compiled) code; i.e., the return address
  488     // becomes the throwing pc.
  489     //
  490     // Arguments pushed before the runtime call are still on the stack
  491     // but the exception handler will reset the stack pointer ->
  492     // ignore them.  A potential result in registers can be ignored as
  493     // well.
  494 
  495 #ifdef ASSERT
  496     // make sure this code is only executed if there is a pending exception
  497     {
  498       Label L;
  499       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  500       __ cbnz(rscratch1, L);
  501       __ stop("StubRoutines::forward exception: no pending exception (1)");
  502       __ bind(L);
  503     }
  504 #endif
  505 
  506     // compute exception handler into r19
  507 
  508     // call the VM to find the handler address associated with the
  509     // caller address. pass thread in r0 and caller pc (ret address)
  510     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  511     // the stack.
  512     __ mov(c_rarg1, lr);
  513     // lr will be trashed by the VM call so we move it to R19
  514     // (callee-saved) because we also need to pass it to the handler
  515     // returned by this call.
  516     __ mov(r19, lr);
  517     BLOCK_COMMENT("call exception_handler_for_return_address");
  518     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  519                          SharedRuntime::exception_handler_for_return_address),
  520                     rthread, c_rarg1);
  521     // Reinitialize the ptrue predicate register, in case the external runtime
  522     // call clobbers ptrue reg, as we may return to SVE compiled code.
  523     __ reinitialize_ptrue();
  524 
  525     // we should not really care that lr is no longer the callee
  526     // address. we saved the value the handler needs in r19 so we can
  527     // just copy it to r3. however, the C2 handler will push its own
  528     // frame and then call into the VM, and the VM code asserts that
  529     // the PC for the frame above the handler belongs to a compiled
  530     // Java method. So, we restore lr here to satisfy that assert.
  531     __ mov(lr, r19);
  532     // setup r0 & r3 & clear pending exception
  533     __ mov(r3, r19);
  534     __ mov(r19, r0);
  535     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  536     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  537 
  538 #ifdef ASSERT
  539     // make sure exception is set
  540     {
  541       Label L;
  542       __ cbnz(r0, L);
  543       __ stop("StubRoutines::forward exception: no pending exception (2)");
  544       __ bind(L);
  545     }
  546 #endif
  547 
  548     // continue at exception handler
  549     // r0: exception
  550     // r3: throwing pc
  551     // r19: exception handler
  552     __ verify_oop(r0);
  553     __ br(r19);
  554 
  555     return start;
  556   }
  557 
  558   // Non-destructive plausibility checks for oops
  559   //
  560   // Arguments:
  561   //    r0: oop to verify
  562   //    rscratch1: error message
  563   //
  564   // Stack after saving c_rarg3:
  565   //    [tos + 0]: saved c_rarg3
  566   //    [tos + 1]: saved c_rarg2
  567   //    [tos + 2]: saved lr
  568   //    [tos + 3]: saved rscratch2
  569   //    [tos + 4]: saved r0
  570   //    [tos + 5]: saved rscratch1
  571   address generate_verify_oop() {
  572     StubId stub_id = StubId::stubgen_verify_oop_id;
  573     StubCodeMark mark(this, stub_id);
  574     address start = __ pc();
  575 
  576     Label exit, error;
  577 
  578     // save c_rarg2 and c_rarg3
  579     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  580 
  581     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  582     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  583     __ ldr(c_rarg3, Address(c_rarg2));
  584     __ add(c_rarg3, c_rarg3, 1);
  585     __ str(c_rarg3, Address(c_rarg2));
  586 
  587     // object is in r0
  588     // make sure object is 'reasonable'
  589     __ cbz(r0, exit); // if obj is null it is OK
  590 
  591     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  592     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  593 
  594     // return if everything seems ok
  595     __ bind(exit);
  596 
  597     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  598     __ ret(lr);
  599 
  600     // handle errors
  601     __ bind(error);
  602     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  603 
  604     __ push(RegSet::range(r0, r29), sp);
  605     // debug(char* msg, int64_t pc, int64_t regs[])
  606     __ mov(c_rarg0, rscratch1);      // pass address of error message
  607     __ mov(c_rarg1, lr);             // pass return address
  608     __ mov(c_rarg2, sp);             // pass address of regs on stack
  609 #ifndef PRODUCT
  610     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  611 #endif
  612     BLOCK_COMMENT("call MacroAssembler::debug");
  613     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  614     __ blr(rscratch1);
  615     __ hlt(0);
  616 
  617     return start;
  618   }
  619 
  620   // Generate indices for iota vector.
  621   address generate_iota_indices(StubId stub_id) {
  622     __ align(CodeEntryAlignment);
  623     StubCodeMark mark(this, stub_id);
  624     address start = __ pc();
  625     // B
  626     __ emit_data64(0x0706050403020100, relocInfo::none);
  627     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  628     // H
  629     __ emit_data64(0x0003000200010000, relocInfo::none);
  630     __ emit_data64(0x0007000600050004, relocInfo::none);
  631     // S
  632     __ emit_data64(0x0000000100000000, relocInfo::none);
  633     __ emit_data64(0x0000000300000002, relocInfo::none);
  634     // D
  635     __ emit_data64(0x0000000000000000, relocInfo::none);
  636     __ emit_data64(0x0000000000000001, relocInfo::none);
  637     // S - FP
  638     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  639     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  640     // D - FP
  641     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  642     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  643     return start;
  644   }
  645 
  646   // The inner part of zero_words().  This is the bulk operation,
  647   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  648   // caller is responsible for zeroing the last few words.
  649   //
  650   // Inputs:
  651   // r10: the HeapWord-aligned base address of an array to zero.
  652   // r11: the count in HeapWords, r11 > 0.
  653   //
  654   // Returns r10 and r11, adjusted for the caller to clear.
  655   // r10: the base address of the tail of words left to clear.
  656   // r11: the number of words in the tail.
  657   //      r11 < MacroAssembler::zero_words_block_size.
  658 
  659   address generate_zero_blocks() {
  660     Label done;
  661     Label base_aligned;
  662 
  663     Register base = r10, cnt = r11;
  664 
  665     __ align(CodeEntryAlignment);
  666     StubId stub_id = StubId::stubgen_zero_blocks_id;
  667     StubCodeMark mark(this, stub_id);
  668     address start = __ pc();
  669 
  670     if (UseBlockZeroing) {
  671       int zva_length = VM_Version::zva_length();
  672 
  673       // Ensure ZVA length can be divided by 16. This is required by
  674       // the subsequent operations.
  675       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  676 
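            // base is only guaranteed 8-byte aligned on entry; if it is not
            // 16-byte aligned, zero one word so the block-zeroing code below
            // starts from a 16-byte-aligned base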
  677       __ tbz(base, 3, base_aligned);
  678       __ str(zr, Address(__ post(base, 8)));
  679       __ sub(cnt, cnt, 1);
  680       __ bind(base_aligned);
  681 
  682       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  683       // alignment.
  684       Label small;
  685       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
  686       __ subs(rscratch1, cnt, low_limit >> 3);
  687       __ br(Assembler::LT, small);
  688       __ zero_dcache_blocks(base, cnt);
  689       __ bind(small);
  690     }
  691 
  692     {
  693       // Number of stp instructions we'll unroll
  694       const int unroll =
  695         MacroAssembler::zero_words_block_size / 2;
  696       // Clear the remaining blocks.
  697       Label loop;
  698       __ subs(cnt, cnt, unroll * 2);
  699       __ br(Assembler::LT, done);
  700       __ bind(loop);
  701       for (int i = 0; i < unroll; i++)
  702         __ stp(zr, zr, __ post(base, 16));
  703       __ subs(cnt, cnt, unroll * 2);
  704       __ br(Assembler::GE, loop);
  705       __ bind(done);
  706       __ add(cnt, cnt, unroll * 2);
  707     }
  708 
  709     __ ret(lr);
  710 
  711     return start;
  712   }
  713 
  714 
  715   typedef enum {
  716     copy_forwards = 1,
  717     copy_backwards = -1
  718   } copy_direction;
  719 
  720   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  721   // for arraycopy stubs.
  722   class ArrayCopyBarrierSetHelper : StackObj {
  723     BarrierSetAssembler* _bs_asm;
  724     MacroAssembler* _masm;
  725     DecoratorSet _decorators;
  726     BasicType _type;
  727     Register _gct1;
  728     Register _gct2;
  729     Register _gct3;
  730     FloatRegister _gcvt1;
  731     FloatRegister _gcvt2;
  732     FloatRegister _gcvt3;
  733 
  734   public:
  735     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  736                               DecoratorSet decorators,
  737                               BasicType type,
  738                               Register gct1,
  739                               Register gct2,
  740                               Register gct3,
  741                               FloatRegister gcvt1,
  742                               FloatRegister gcvt2,
  743                               FloatRegister gcvt3)
  744       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  745         _masm(masm),
  746         _decorators(decorators),
  747         _type(type),
  748         _gct1(gct1),
  749         _gct2(gct2),
  750         _gct3(gct3),
  751         _gcvt1(gcvt1),
  752         _gcvt2(gcvt2),
  753         _gcvt3(gcvt3) {
  754     }
  755 
  756     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  757       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  758                             dst1, dst2, src,
  759                             _gct1, _gct2, _gcvt1);
  760     }
  761 
  762     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  763       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  764                              dst, src1, src2,
  765                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  766     }
  767 
  768     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  769       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  770                             dst1, dst2, src,
  771                             _gct1);
  772     }
  773 
  774     void copy_store_at_16(Address dst, Register src1, Register src2) {
  775       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  776                              dst, src1, src2,
  777                              _gct1, _gct2, _gct3);
  778     }
  779 
  780     void copy_load_at_8(Register dst, Address src) {
  781       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  782                             dst, noreg, src,
  783                             _gct1);
  784     }
  785 
  786     void copy_store_at_8(Address dst, Register src) {
  787       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  788                              dst, src, noreg,
  789                              _gct1, _gct2, _gct3);
  790     }
  791   };
  792 
  793   // Bulk copy of blocks of 8 words.
  794   //
  795   // count is a count of words.
  796   //
  797   // Precondition: count >= 8
  798   //
  799   // Postconditions:
  800   //
  801   // The least significant bit of count contains the remaining count
  802   // of words to copy.  The rest of count is trash.
  803   //
  804   // s and d are adjusted to point to the remaining words to copy
  805   //
  806   address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
  807     BasicType type;
  808     copy_direction direction;
  809 
  810     switch (stub_id) {
  811     case StubId::stubgen_copy_byte_f_id:
  812       direction = copy_forwards;
  813       type = T_BYTE;
  814       break;
  815     case StubId::stubgen_copy_byte_b_id:
  816       direction = copy_backwards;
  817       type = T_BYTE;
  818       break;
  819     case StubId::stubgen_copy_oop_f_id:
  820       direction = copy_forwards;
  821       type = T_OBJECT;
  822       break;
  823     case StubId::stubgen_copy_oop_b_id:
  824       direction = copy_backwards;
  825       type = T_OBJECT;
  826       break;
  827     case StubId::stubgen_copy_oop_uninit_f_id:
  828       direction = copy_forwards;
  829       type = T_OBJECT;
  830       break;
  831     case StubId::stubgen_copy_oop_uninit_b_id:
  832       direction = copy_backwards;
  833       type = T_OBJECT;
  834       break;
  835     default:
  836       ShouldNotReachHere();
  837     }
  838 
  839     int unit = wordSize * direction;
  840     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
  841 
  842     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  843       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  844     const Register stride = r14;
  845     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  846     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  847     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  848 
  849     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  850     assert_different_registers(s, d, count, rscratch1, rscratch2);
  851 
  852     Label again, drain;
  853 
  854     __ align(CodeEntryAlignment);
  855 
  856     StubCodeMark mark(this, stub_id);
  857 
  858     address start = __ pc();
  859 
  860     Label unaligned_copy_long;
  861     if (AvoidUnalignedAccesses) {
  862       __ tbnz(d, 3, unaligned_copy_long);
  863     }
  864 
  865     if (direction == copy_forwards) {
  866       __ sub(s, s, bias);
  867       __ sub(d, d, bias);
  868     }
  869 
  870 #ifdef ASSERT
  871     // Make sure we are never given < 8 words
  872     {
  873       Label L;
  874       __ cmp(count, (u1)8);
  875       __ br(Assembler::GE, L);
  876       __ stop("generate_copy_longs called with < 8 words");
  877       __ bind(L);
  878     }
  879 #endif
  880 
  881     // Fill 8 registers
  882     if (UseSIMDForMemoryOps) {
  883       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  884       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  885     } else {
  886       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  887       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  888       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  889       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  890     }
  891 
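          // eight words are now held in registers; the main loop stores those
          // and loads the next eight, so go straight to the drain code when
          // fewer than 16 words remain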
  892     __ subs(count, count, 16);
  893     __ br(Assembler::LO, drain);
  894 
  895     int prefetch = PrefetchCopyIntervalInBytes;
  896     bool use_stride = false;
  897     if (direction == copy_backwards) {
  898       use_stride = prefetch > 256;
  899       prefetch = -prefetch;
  900       if (use_stride) __ mov(stride, prefetch);
  901     }
  902 
  903     __ bind(again);
  904 
  905     if (PrefetchCopyIntervalInBytes > 0)
  906       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  907 
  908     if (UseSIMDForMemoryOps) {
  909       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  910       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  911       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  912       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  913     } else {
  914       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  915       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  916       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  917       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  918       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  919       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  920       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  921       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  922     }
  923 
  924     __ subs(count, count, 8);
  925     __ br(Assembler::HS, again);
  926 
  927     // Drain
  928     __ bind(drain);
  929     if (UseSIMDForMemoryOps) {
  930       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  931       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  932     } else {
  933       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  934       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  935       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  936       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  937     }
  938 
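          // copy any remaining 4-word and/or 2-word subblock; bits 2 and 1 of
          // count say which are present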
  939     {
  940       Label L1, L2;
  941       __ tbz(count, exact_log2(4), L1);
  942       if (UseSIMDForMemoryOps) {
  943         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  944         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  945       } else {
  946         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  947         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  948         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  949         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  950       }
  951       __ bind(L1);
  952 
  953       if (direction == copy_forwards) {
  954         __ add(s, s, bias);
  955         __ add(d, d, bias);
  956       }
  957 
  958       __ tbz(count, 1, L2);
  959       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  960       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  961       __ bind(L2);
  962     }
  963 
  964     __ ret(lr);
  965 
  966     if (AvoidUnalignedAccesses) {
  967       Label drain, again;
  968       // Register order for storing. Order is different for backward copy.
  969 
  970       __ bind(unaligned_copy_long);
  971 
  972       // source address is even aligned, target odd aligned
  973       //
  974       // when forward copying word pairs we read long pairs at offsets
  975       // {0, 2, 4, 6} (in long words). when backwards copying we read
  976       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  977       // address by -2 in the forwards case so we can compute the
  978       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  979       // or -1.
  980       //
  981       // when forward copying we need to store 1 word, 3 pairs and
  982       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
  983       // zero offset we adjust the destination by -1, which means we
  984       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
  985       //
  986       // When backwards copying we need to store 1 word, 3 pairs and
  987       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
  988       // offsets {1, 3, 5, 7, 8} * unit.
  989 
  990       if (direction == copy_forwards) {
  991         __ sub(s, s, 16);
  992         __ sub(d, d, 8);
  993       }
  994 
  995       // Fill 8 registers
  996       //
  997       // for forwards copy s was offset by -16 from the original input
  998       // value of s so the register contents are at these offsets
  999       // relative to the 64 byte block addressed by that original input
 1000       // and so on for each successive 64 byte block when s is updated
 1001       //
 1002       // t0 at offset 0,  t1 at offset 8
 1003       // t2 at offset 16, t3 at offset 24
 1004       // t4 at offset 32, t5 at offset 40
 1005       // t6 at offset 48, t7 at offset 56
 1006 
 1007       // for backwards copy s was not offset so the register contents
 1008       // are at these offsets into the preceding 64 byte block
 1009       // relative to that original input and so on for each successive
 1010       // preceding 64 byte block when s is updated. this explains the
 1011       // slightly counter-intuitive looking pattern of register usage
 1012       // in the stp instructions for backwards copy.
 1013       //
 1014       // t0 at offset -16, t1 at offset -8
 1015       // t2 at offset -32, t3 at offset -24
 1016       // t4 at offset -48, t5 at offset -40
 1017       // t6 at offset -64, t7 at offset -56
 1018 
 1019       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1020       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1021       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1022       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1023 
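            // as in the aligned case: eight words are pre-loaded, so go
            // straight to the drain code when fewer than 16 words remain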
 1024       __ subs(count, count, 16);
 1025       __ br(Assembler::LO, drain);
 1026 
 1027       int prefetch = PrefetchCopyIntervalInBytes;
 1028       bool use_stride = false;
 1029       if (direction == copy_backwards) {
 1030         use_stride = prefetch > 256;
 1031         prefetch = -prefetch;
 1032         if (use_stride) __ mov(stride, prefetch);
 1033       }
 1034 
 1035       __ bind(again);
 1036 
 1037       if (PrefetchCopyIntervalInBytes > 0)
 1038         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1039 
 1040       if (direction == copy_forwards) {
 1041         // allowing for the offset of -8 the store instructions place
 1042         // registers into the target 64 byte block at the following
 1043         // offsets
 1044         //
 1045         // t0 at offset 0
 1046         // t1 at offset 8,  t2 at offset 16
 1047         // t3 at offset 24, t4 at offset 32
 1048         // t5 at offset 40, t6 at offset 48
 1049         // t7 at offset 56
 1050 
 1051         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1052         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1053         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1054         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1055         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1056         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1057         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1058         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1059         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1060       } else {
 1061         // d was not offset when we started so the registers are
 1062         // written into the 64 byte block preceding d with the following
 1063         // offsets
 1064         //
 1065         // t1 at offset -8
 1066         // t3 at offset -24, t0 at offset -16
 1067         // t5 at offset -40, t2 at offset -32
 1068         // t7 at offset -56, t4 at offset -48
 1069         //                   t6 at offset -64
 1070         //
 1071         // note that this matches the offsets previously noted for the
 1072         // loads
 1073 
 1074         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1075         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1076         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1077         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1078         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1079         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1080         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1081         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1082         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1083       }
 1084 
 1085       __ subs(count, count, 8);
 1086       __ br(Assembler::HS, again);
 1087 
 1088       // Drain
 1089       //
 1090       // this uses the same pattern of offsets and register arguments
 1091       // as above
 1092       __ bind(drain);
 1093       if (direction == copy_forwards) {
 1094         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1095         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1096         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1097         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1098         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1099       } else {
 1100         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1101         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1102         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1103         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1104         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1105       }
 1106       // now we need to copy any remaining part block which may
 1107       // include a 4 word subblock and/or a 2 word subblock.
 1108       // bits 2 and 1 in the count are the tell-tale for whether we
 1109       // have each such subblock
 1110       {
 1111         Label L1, L2;
 1112         __ tbz(count, exact_log2(4), L1);
 1113         // this is the same as above but copying only 4 longs hence
 1114         // with only one intervening stp between the str instructions
 1115         // but note that the offsets and registers still follow the
 1116         // same pattern
 1117         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1118         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1119         if (direction == copy_forwards) {
 1120           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1121           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1122           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1123         } else {
 1124           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1125           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1126           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1127         }
 1128         __ bind(L1);
 1129 
 1130         __ tbz(count, 1, L2);
 1131         // this is the same as above but copying only 2 longs hence
 1132         // there is no intervening stp between the str instructions
 1133         // but note that the offset and register patterns are still
 1134         // the same
 1135         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1136         if (direction == copy_forwards) {
 1137           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1138           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1139         } else {
 1140           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1141           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1142         }
 1143         __ bind(L2);
 1144 
 1145         // for forwards copy we need to re-adjust the offsets we
 1146         // applied so that s and d follow the last words written
 1147 
 1148         if (direction == copy_forwards) {
 1149           __ add(s, s, 16);
 1150           __ add(d, d, 8);
 1151         }
 1152 
 1153       }
 1154 
 1155       __ ret(lr);
 1156     }
 1157 
 1158     return start;
 1159   }
 1160 
 1161   // Small copy: less than 16 bytes.
 1162   //
 1163   // NB: Ignores all of the bits of count which represent more than 15
 1164   // bytes, so a caller doesn't have to mask them.
 1165 
 1166   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1167     bool is_backwards = step < 0;
 1168     size_t granularity = g_uabs(step);
 1169     int direction = is_backwards ? -1 : 1;
 1170 
 1171     Label Lword, Lint, Lshort, Lbyte;
 1172 
 1173     assert(granularity
 1174            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1175 
 1176     const Register t0 = r3;
 1177     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1178     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1179 
 1180     // ??? I don't know if this bit-test-and-branch is the right thing
 1181     // to do.  It does a lot of jumping, resulting in several
 1182     // mispredicted branches.  It might make more sense to do this
 1183     // with something like Duff's device with a single computed branch.
 1184 
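          // with at most 15 bytes significant, bit (3 - log2(granularity)) of
          // count is set exactly when a full 8-byte word remains to copy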
 1185     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1186     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1187     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1188     __ bind(Lword);
 1189 
 1190     if (granularity <= sizeof (jint)) {
 1191       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1192       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1193       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1194       __ bind(Lint);
 1195     }
 1196 
 1197     if (granularity <= sizeof (jshort)) {
 1198       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1199       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1200       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1201       __ bind(Lshort);
 1202     }
 1203 
 1204     if (granularity <= sizeof (jbyte)) {
 1205       __ tbz(count, 0, Lbyte);
 1206       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1207       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1208       __ bind(Lbyte);
 1209     }
 1210   }
 1211 
 1212   // All-singing all-dancing memory copy.
 1213   //
 1214   // Copy count units of memory from s to d.  The size of a unit is
 1215   // step, which can be positive or negative depending on the direction
 1216   // of copy.  If is_aligned is false, we align the source address.
 1217   //
 1218 
 1219   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1220                    Register s, Register d, Register count, int step) {
 1221     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1222     bool is_backwards = step < 0;
 1223     unsigned int granularity = g_uabs(step);
 1224     const Register t0 = r3, t1 = r4;
 1225 
 1226     // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
 1227     // load all the data before writing anything
 1228     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1229     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1230     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1231     const Register send = r17, dend = r16;
 1232     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1233     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1234     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1235 
 1236     if (PrefetchCopyIntervalInBytes > 0)
 1237       __ prfm(Address(s, 0), PLDL1KEEP);
 1238     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1239     __ br(Assembler::HI, copy_big);
 1240 
 1241     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1242     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
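          // send and dend point just past the last source and destination
          // elements, letting the cases below copy the tail with overlapping
          // accesses from the end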
 1243 
 1244     __ cmp(count, u1(16/granularity));
 1245     __ br(Assembler::LS, copy16);
 1246 
 1247     __ cmp(count, u1(64/granularity));
 1248     __ br(Assembler::HI, copy80);
 1249 
 1250     __ cmp(count, u1(32/granularity));
 1251     __ br(Assembler::LS, copy32);
 1252 
 1253     // 33..64 bytes
 1254     if (UseSIMDForMemoryOps) {
 1255       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1256       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1257       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1258       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1259     } else {
 1260       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1261       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1262       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1263       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1264 
 1265       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1266       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1267       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1268       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1269     }
 1270     __ b(finish);
 1271 
 1272     // 17..32 bytes
 1273     __ bind(copy32);
 1274     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1275     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1276 
 1277     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1278     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1279     __ b(finish);
 1280 
 1281     // 65..80/96 bytes
 1282     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1283     __ bind(copy80);
 1284     if (UseSIMDForMemoryOps) {
 1285       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1286       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1287       // Unaligned pointers can be an issue for copying.
 1288       // The issue has more chances to happen when granularity of data is
 1289       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
 1290       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1291       // The most performance drop has been seen for the range 65-80 bytes.
 1292       // For such cases using the pair of ldp/stp instead of the third pair of
 1293       // ldpq/stpq fixes the performance issue.
 1294       if (granularity < sizeof (jint)) {
 1295         Label copy96;
 1296         __ cmp(count, u1(80/granularity));
 1297         __ br(Assembler::HI, copy96);
 1298         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1299 
 1300         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1301         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1302 
 1303         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1304         __ b(finish);
 1305 
 1306         __ bind(copy96);
 1307       }
 1308       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1309 
 1310       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1311       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1312 
 1313       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1314     } else {
 1315       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1316       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1317       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1318       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1319       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1320 
 1321       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1322       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1323       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1324       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1325       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1326     }
 1327     __ b(finish);
 1328 
 1329     // 0..16 bytes
 1330     __ bind(copy16);
 1331     __ cmp(count, u1(8/granularity));
 1332     __ br(Assembler::LO, copy8);
 1333 
 1334     // 8..16 bytes
 1335     bs.copy_load_at_8(t0, Address(s, 0));
 1336     bs.copy_load_at_8(t1, Address(send, -8));
 1337     bs.copy_store_at_8(Address(d, 0), t0);
 1338     bs.copy_store_at_8(Address(dend, -8), t1);
 1339     __ b(finish);
 1340 
 1341     if (granularity < 8) {
 1342       // 4..7 bytes
 1343       __ bind(copy8);
 1344       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1345       __ ldrw(t0, Address(s, 0));
 1346       __ ldrw(t1, Address(send, -4));
 1347       __ strw(t0, Address(d, 0));
 1348       __ strw(t1, Address(dend, -4));
 1349       __ b(finish);
 1350       if (granularity < 4) {
 1351         // 0..3 bytes
 1352         __ bind(copy4);
 1353         __ cbz(count, finish); // get rid of 0 case
 1354         if (granularity == 2) {
 1355           __ ldrh(t0, Address(s, 0));
 1356           __ strh(t0, Address(d, 0));
 1357         } else { // granularity == 1
 1358           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1359           // the first and last byte.
 1360           // Handle the 3 byte case by loading and storing base + count/2
 1361           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1362           // This means in the 1 byte case we load/store the same
 1363           // byte 3 times.
 1364           __ lsr(count, count, 1);
 1365           __ ldrb(t0, Address(s, 0));
 1366           __ ldrb(t1, Address(send, -1));
 1367           __ ldrb(t2, Address(s, count));
 1368           __ strb(t0, Address(d, 0));
 1369           __ strb(t1, Address(dend, -1));
 1370           __ strb(t2, Address(d, count));
 1371         }
 1372         __ b(finish);
 1373       }
 1374     }
 1375 
 1376     __ bind(copy_big);
 1377     if (is_backwards) {
 1378       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1379       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1380     }
 1381 
 1382     // Now we've got the small case out of the way we can align the
 1383     // source address on a 2-word boundary.
 1384 
    // Here we materialize a count in r15, which is used by copy_memory_small
    // and by the generate_copy_longs stubs that perform the 2-word-aligned bulk copy.
    // Up until here we have used t9, which aliases r15, but from here on that
    // register cannot be used as a temp register, as it contains the count.
 1389 
 1390     Label aligned;
 1391 
 1392     if (is_aligned) {
 1393       // We may have to adjust by 1 word to get s 2-word-aligned.
 1394       __ tbz(s, exact_log2(wordSize), aligned);
 1395       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1396       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1397       __ sub(count, count, wordSize/granularity);
 1398     } else {
 1399       if (is_backwards) {
 1400         __ andr(r15, s, 2 * wordSize - 1);
 1401       } else {
 1402         __ neg(r15, s);
 1403         __ andr(r15, r15, 2 * wordSize - 1);
 1404       }
 1405       // r15 is the byte adjustment needed to align s.
 1406       __ cbz(r15, aligned);
 1407       int shift = exact_log2(granularity);
 1408       if (shift > 0) {
 1409         __ lsr(r15, r15, shift);
 1410       }
 1411       __ sub(count, count, r15);
 1412 
 1413 #if 0
 1414       // ?? This code is only correct for a disjoint copy.  It may or
 1415       // may not make sense to use it in that case.
 1416 
 1417       // Copy the first pair; s and d may not be aligned.
 1418       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1419       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1420 
 1421       // Align s and d, adjust count
 1422       if (is_backwards) {
 1423         __ sub(s, s, r15);
 1424         __ sub(d, d, r15);
 1425       } else {
 1426         __ add(s, s, r15);
 1427         __ add(d, d, r15);
 1428       }
 1429 #else
 1430       copy_memory_small(decorators, type, s, d, r15, step);
 1431 #endif
 1432     }
 1433 
 1434     __ bind(aligned);
 1435 
 1436     // s is now 2-word-aligned.
 1437 
 1438     // We have a count of units and some trailing bytes. Adjust the
 1439     // count and do a bulk copy of words. If the shift is zero
 1440     // perform a move instead to benefit from zero latency moves.
 1441     int shift = exact_log2(wordSize/granularity);
 1442     if (shift > 0) {
 1443       __ lsr(r15, count, shift);
 1444     } else {
 1445       __ mov(r15, count);
 1446     }
 1447     if (direction == copy_forwards) {
 1448       if (type != T_OBJECT) {
 1449         __ bl(StubRoutines::aarch64::copy_byte_f());
 1450       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1451         __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
 1452       } else {
 1453         __ bl(StubRoutines::aarch64::copy_oop_f());
 1454       }
 1455     } else {
 1456       if (type != T_OBJECT) {
 1457         __ bl(StubRoutines::aarch64::copy_byte_b());
 1458       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1459         __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
 1460       } else {
 1461         __ bl(StubRoutines::aarch64::copy_oop_b());
 1462       }
 1463     }
 1464 
 1465     // And the tail.
 1466     copy_memory_small(decorators, type, s, d, count, step);
 1467 
 1468     if (granularity >= 8) __ bind(copy8);
 1469     if (granularity >= 4) __ bind(copy4);
 1470     __ bind(finish);
 1471   }
 1472 
 1473 
 1474   void clobber_registers() {
 1475 #ifdef ASSERT
 1476     RegSet clobbered
 1477       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1478     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1479     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1480     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1481       __ mov(*it, rscratch1);
 1482     }
 1483 #endif
 1484 
 1485   }
 1486 
 1487   // Scan over array at a for count oops, verifying each one.
 1488   // Preserves a and count, clobbers rscratch1 and rscratch2.
 1489   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1490     Label loop, end;
 1491     __ mov(rscratch1, a);
 1492     __ mov(rscratch2, zr);
 1493     __ bind(loop);
 1494     __ cmp(rscratch2, count);
 1495     __ br(Assembler::HS, end);
 1496     if (size == wordSize) {
 1497       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1498       __ verify_oop(temp);
 1499     } else {
 1500       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1501       __ decode_heap_oop(temp); // calls verify_oop
 1502     }
 1503     __ add(rscratch2, rscratch2, 1);
 1504     __ b(loop);
 1505     __ bind(end);
 1506   }
 1507 
 1508   // Arguments:
 1509   //   stub_id - is used to name the stub and identify all details of
 1510   //             how to perform the copy.
 1511   //
  //   nopush_entry - is assigned the stub's post push entry point unless
  //                  it is null
 1514   //
 1515   // Inputs:
 1516   //   c_rarg0   - source array address
 1517   //   c_rarg1   - destination array address
 1518   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1519   //
 1520   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1521   // the hardware handle it.  The two dwords within qwords that span
 1522   // cache line boundaries will still be loaded and stored atomically.
 1523   //
 1524   // Side Effects: nopush_entry is set to the (post push) entry point
 1525   //               so it can be used by the corresponding conjoint
 1526   //               copy method
 1527   //
 1528   address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
 1529     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1530     RegSet saved_reg = RegSet::of(s, d, count);
 1531     int size;
 1532     bool aligned;
 1533     bool is_oop;
 1534     bool dest_uninitialized;
 1535     switch (stub_id) {
 1536     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1537       size = sizeof(jbyte);
 1538       aligned = false;
 1539       is_oop = false;
 1540       dest_uninitialized = false;
 1541       break;
 1542     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1543       size = sizeof(jbyte);
 1544       aligned = true;
 1545       is_oop = false;
 1546       dest_uninitialized = false;
 1547       break;
 1548     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1549       size = sizeof(jshort);
 1550       aligned = false;
 1551       is_oop = false;
 1552       dest_uninitialized = false;
 1553       break;
 1554     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1555       size = sizeof(jshort);
 1556       aligned = true;
 1557       is_oop = false;
 1558       dest_uninitialized = false;
 1559       break;
 1560     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1561       size = sizeof(jint);
 1562       aligned = false;
 1563       is_oop = false;
 1564       dest_uninitialized = false;
 1565       break;
 1566     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1567       size = sizeof(jint);
 1568       aligned = true;
 1569       is_oop = false;
 1570       dest_uninitialized = false;
 1571       break;
 1572     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1573       // since this is always aligned we can (should!) use the same
 1574       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1575       ShouldNotReachHere();
 1576       break;
 1577     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1578       size = sizeof(jlong);
 1579       aligned = true;
 1580       is_oop = false;
 1581       dest_uninitialized = false;
 1582       break;
 1583     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1584       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1585       aligned = !UseCompressedOops;
 1586       is_oop = true;
 1587       dest_uninitialized = false;
 1588       break;
 1589     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1590       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1591       aligned = !UseCompressedOops;
 1592       is_oop = true;
 1593       dest_uninitialized = false;
 1594       break;
 1595     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1596       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1597       aligned = !UseCompressedOops;
 1598       is_oop = true;
 1599       dest_uninitialized = true;
 1600       break;
 1601     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1602       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1603       aligned = !UseCompressedOops;
 1604       is_oop = true;
 1605       dest_uninitialized = true;
 1606       break;
 1607     default:
 1608       ShouldNotReachHere();
 1609       break;
 1610     }
 1611 
 1612     __ align(CodeEntryAlignment);
 1613     StubCodeMark mark(this, stub_id);
 1614     address start = __ pc();
 1615     __ enter();
 1616 
 1617     if (nopush_entry != nullptr) {
 1618       *nopush_entry = __ pc();
 1619       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1620       BLOCK_COMMENT("Entry:");
 1621     }
 1622 
 1623     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1624     if (dest_uninitialized) {
 1625       decorators |= IS_DEST_UNINITIALIZED;
 1626     }
 1627     if (aligned) {
 1628       decorators |= ARRAYCOPY_ALIGNED;
 1629     }
 1630 
 1631     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1632     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1633 
 1634     if (is_oop) {
 1635       // save regs before copy_memory
 1636       __ push(RegSet::of(d, count), sp);
 1637     }
 1638     {
 1639       // UnsafeMemoryAccess page error: continue after unsafe access
 1640       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1641       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1642       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1643     }
 1644 
 1645     if (is_oop) {
 1646       __ pop(RegSet::of(d, count), sp);
 1647       if (VerifyOops)
 1648         verify_oop_array(size, d, count, r16);
 1649     }
 1650 
 1651     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1652 
 1653     __ leave();
 1654     __ mov(r0, zr); // return 0
 1655     __ ret(lr);
 1656     return start;
 1657   }
 1658 
 1659   // Arguments:
 1660   //   stub_id - is used to name the stub and identify all details of
 1661   //             how to perform the copy.
 1662   //
  //   nooverlap_target - identifies the (post push) entry for the
 1664   //             corresponding disjoint copy routine which can be
 1665   //             jumped to if the ranges do not actually overlap
 1666   //
  //   nopush_entry - is assigned the stub's post push entry point unless
  //                  it is null
 1669   //
 1670   //
 1671   // Inputs:
 1672   //   c_rarg0   - source array address
 1673   //   c_rarg1   - destination array address
 1674   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1675   //
 1676   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1677   // the hardware handle it.  The two dwords within qwords that span
 1678   // cache line boundaries will still be loaded and stored atomically.
 1679   //
 1680   // Side Effects:
 1681   //   nopush_entry is set to the no-overlap entry point so it can be
 1682   //   used by some other conjoint copy method
 1683   //
 1684   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
 1685     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1686     RegSet saved_regs = RegSet::of(s, d, count);
 1687     int size;
 1688     bool aligned;
 1689     bool is_oop;
 1690     bool dest_uninitialized;
 1691     switch (stub_id) {
 1692     case StubId::stubgen_jbyte_arraycopy_id:
 1693       size = sizeof(jbyte);
 1694       aligned = false;
 1695       is_oop = false;
 1696       dest_uninitialized = false;
 1697       break;
 1698     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 1699       size = sizeof(jbyte);
 1700       aligned = true;
 1701       is_oop = false;
 1702       dest_uninitialized = false;
 1703       break;
 1704     case StubId::stubgen_jshort_arraycopy_id:
 1705       size = sizeof(jshort);
 1706       aligned = false;
 1707       is_oop = false;
 1708       dest_uninitialized = false;
 1709       break;
 1710     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 1711       size = sizeof(jshort);
 1712       aligned = true;
 1713       is_oop = false;
 1714       dest_uninitialized = false;
 1715       break;
 1716     case StubId::stubgen_jint_arraycopy_id:
 1717       size = sizeof(jint);
 1718       aligned = false;
 1719       is_oop = false;
 1720       dest_uninitialized = false;
 1721       break;
 1722     case StubId::stubgen_arrayof_jint_arraycopy_id:
 1723       size = sizeof(jint);
 1724       aligned = true;
 1725       is_oop = false;
 1726       dest_uninitialized = false;
 1727       break;
 1728     case StubId::stubgen_jlong_arraycopy_id:
 1729       // since this is always aligned we can (should!) use the same
 1730       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1731       ShouldNotReachHere();
 1732       break;
 1733     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 1734       size = sizeof(jlong);
 1735       aligned = true;
 1736       is_oop = false;
 1737       dest_uninitialized = false;
 1738       break;
 1739     case StubId::stubgen_oop_arraycopy_id:
 1740       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1741       aligned = !UseCompressedOops;
 1742       is_oop = true;
 1743       dest_uninitialized = false;
 1744       break;
 1745     case StubId::stubgen_arrayof_oop_arraycopy_id:
 1746       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1747       aligned = !UseCompressedOops;
 1748       is_oop = true;
 1749       dest_uninitialized = false;
 1750       break;
 1751     case StubId::stubgen_oop_arraycopy_uninit_id:
 1752       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1753       aligned = !UseCompressedOops;
 1754       is_oop = true;
 1755       dest_uninitialized = true;
 1756       break;
 1757     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 1758       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1759       aligned = !UseCompressedOops;
 1760       is_oop = true;
 1761       dest_uninitialized = true;
 1762       break;
 1763     default:
 1764       ShouldNotReachHere();
 1765     }
 1766 
 1767     StubCodeMark mark(this, stub_id);
 1768     address start = __ pc();
 1769     __ enter();
 1770 
 1771     if (nopush_entry != nullptr) {
 1772       *nopush_entry = __ pc();
 1773       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1774       BLOCK_COMMENT("Entry:");
 1775     }
 1776 
 1777     // use fwd copy when (d-s) above_equal (count*size)
 1778     Label L_overlapping;
 1779     __ sub(rscratch1, d, s);
 1780     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1781     __ br(Assembler::LO, L_overlapping);
 1782     __ b(RuntimeAddress(nooverlap_target));
 1783     __ bind(L_overlapping);
 1784 
 1785     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1786     if (dest_uninitialized) {
 1787       decorators |= IS_DEST_UNINITIALIZED;
 1788     }
 1789     if (aligned) {
 1790       decorators |= ARRAYCOPY_ALIGNED;
 1791     }
 1792 
 1793     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1794     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1795 
 1796     if (is_oop) {
 1797       // save regs before copy_memory
 1798       __ push(RegSet::of(d, count), sp);
 1799     }
 1800     {
 1801       // UnsafeMemoryAccess page error: continue after unsafe access
 1802       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1803       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1804       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1805     }
 1806     if (is_oop) {
 1807       __ pop(RegSet::of(d, count), sp);
 1808       if (VerifyOops)
 1809         verify_oop_array(size, d, count, r16);
 1810     }
 1811     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1812     __ leave();
 1813     __ mov(r0, zr); // return 0
 1814     __ ret(lr);
 1815     return start;
 1816   }
 1817 
 1818   // Helper for generating a dynamic type check.
 1819   // Smashes rscratch1, rscratch2.
 1820   void generate_type_check(Register sub_klass,
 1821                            Register super_check_offset,
 1822                            Register super_klass,
 1823                            Register temp1,
 1824                            Register temp2,
 1825                            Register result,
 1826                            Label& L_success) {
 1827     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1828 
 1829     BLOCK_COMMENT("type_check:");
 1830 
 1831     Label L_miss;
 1832 
 1833     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1834                                      super_check_offset);
 1835     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1836 
 1837     // Fall through on failure!
 1838     __ BIND(L_miss);
 1839   }
 1840 
 1841   //
 1842   //  Generate checkcasting array copy stub
 1843   //
 1844   //  Input:
 1845   //    c_rarg0   - source array address
 1846   //    c_rarg1   - destination array address
 1847   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1848   //    c_rarg3   - size_t ckoff (super_check_offset)
 1849   //    c_rarg4   - oop ckval (super_klass)
 1850   //
 1851   //  Output:
 1852   //    r0 ==  0  -  success
 1853   //    r0 == -1^K - failure, where K is partial transfer count
 1854   //
 1855   address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
 1856     bool dest_uninitialized;
 1857     switch (stub_id) {
 1858     case StubId::stubgen_checkcast_arraycopy_id:
 1859       dest_uninitialized = false;
 1860       break;
 1861     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 1862       dest_uninitialized = true;
 1863       break;
 1864     default:
 1865       ShouldNotReachHere();
 1866     }
 1867 
 1868     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1869 
 1870     // Input registers (after setup_arg_regs)
 1871     const Register from        = c_rarg0;   // source array address
 1872     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
 1874     const Register ckoff       = c_rarg3;   // super_check_offset
 1875     const Register ckval       = c_rarg4;   // super_klass
 1876 
 1877     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1878 
 1879     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1880     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // original elements count
 1882     const Register start_to    = r20;       // destination array start address
 1883     const Register r19_klass   = r19;       // oop._klass
 1884 
 1885     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1886     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1887 
 1888     //---------------------------------------------------------------
 1889     // Assembler stub will be used for this call to arraycopy
 1890     // if the two arrays are subtypes of Object[] but the
 1891     // destination array type is not equal to or a supertype
 1892     // of the source type.  Each element must be separately
 1893     // checked.
 1894 
 1895     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1896                                copied_oop, r19_klass, count_save);
 1897 
 1898     __ align(CodeEntryAlignment);
 1899     StubCodeMark mark(this, stub_id);
 1900     address start = __ pc();
 1901 
 1902     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1903 
 1904 #ifdef ASSERT
 1905     // caller guarantees that the arrays really are different
 1906     // otherwise, we would have to make conjoint checks
 1907     { Label L;
 1908       __ b(L);                  // conjoint check not yet implemented
 1909       __ stop("checkcast_copy within a single array");
 1910       __ bind(L);
 1911     }
 1912 #endif //ASSERT
 1913 
 1914     // Caller of this entry point must set up the argument registers.
 1915     if (nopush_entry != nullptr) {
 1916       *nopush_entry = __ pc();
 1917       BLOCK_COMMENT("Entry:");
 1918     }
 1919 
    // Empty array: nothing to do.
 1921     __ cbz(count, L_done);
 1922     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1923 
 1924 #ifdef ASSERT
 1925     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1926     // The ckoff and ckval must be mutually consistent,
 1927     // even though caller generates both.
 1928     { Label L;
 1929       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1930       __ ldrw(start_to, Address(ckval, sco_offset));
 1931       __ cmpw(ckoff, start_to);
 1932       __ br(Assembler::EQ, L);
 1933       __ stop("super_check_offset inconsistent");
 1934       __ bind(L);
 1935     }
 1936 #endif //ASSERT
 1937 
 1938     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1939     bool is_oop = true;
 1940     int element_size = UseCompressedOops ? 4 : 8;
 1941     if (dest_uninitialized) {
 1942       decorators |= IS_DEST_UNINITIALIZED;
 1943     }
 1944 
 1945     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1946     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1947 
 1948     // save the original count
 1949     __ mov(count_save, count);
 1950 
 1951     // Copy from low to high addresses
 1952     __ mov(start_to, to);              // Save destination array start address
 1953     __ b(L_load_element);
 1954 
 1955     // ======== begin loop ========
 1956     // (Loop is rotated; its entry is L_load_element.)
 1957     // Loop control:
 1958     //   for (; count != 0; count--) {
 1959     //     copied_oop = load_heap_oop(from++);
 1960     //     ... generate_type_check ...;
 1961     //     store_heap_oop(to++, copied_oop);
 1962     //   }
 1963     __ align(OptoLoopAlignment);
 1964 
 1965     __ BIND(L_store_element);
 1966     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1967                       __ post(to, element_size), copied_oop, noreg,
 1968                       gct1, gct2, gct3);
 1969     __ sub(count, count, 1);
 1970     __ cbz(count, L_do_card_marks);
 1971 
 1972     // ======== loop entry is here ========
 1973     __ BIND(L_load_element);
 1974     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1975                      copied_oop, noreg, __ post(from, element_size),
 1976                      gct1);
 1977     __ cbz(copied_oop, L_store_element);
 1978 
 1979     __ load_klass(r19_klass, copied_oop);// query the object klass
 1980 
 1981     BLOCK_COMMENT("type_check:");
 1982     generate_type_check(/*sub_klass*/r19_klass,
 1983                         /*super_check_offset*/ckoff,
 1984                         /*super_klass*/ckval,
 1985                         /*r_array_base*/gct1,
 1986                         /*temp2*/gct2,
 1987                         /*result*/r10, L_store_element);
 1988 
 1989     // Fall through on failure!
 1990 
 1991     // ======== end loop ========
 1992 
 1993     // It was a real error; we must depend on the caller to finish the job.
 1994     // Register count = remaining oops, count_orig = total oops.
 1995     // Emit GC store barriers for the oops we have copied and report
 1996     // their number to the caller.
 1997 
 1998     __ subs(count, count_save, count);     // K = partially copied oop count
 1999     __ eon(count, count, zr);              // report (-1^K) to caller
 2000     __ br(Assembler::EQ, L_done_pop);
 2001 
 2002     __ BIND(L_do_card_marks);
 2003     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
 2004 
 2005     __ bind(L_done_pop);
 2006     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2007     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2008 
 2009     __ bind(L_done);
 2010     __ mov(r0, count);
 2011     __ leave();
 2012     __ ret(lr);
 2013 
 2014     return start;
 2015   }
 2016 
 2017   // Perform range checks on the proposed arraycopy.
 2018   // Kills temp, but nothing else.
 2019   // Also, clean the sign bits of src_pos and dst_pos.
 2020   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2021                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
 2023                               Register dst_pos, // destination position (c_rarg3)
 2024                               Register length,
 2025                               Register temp,
 2026                               Label& L_failed) {
 2027     BLOCK_COMMENT("arraycopy_range_checks:");
 2028 
 2029     assert_different_registers(rscratch1, temp);
 2030 
 2031     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2032     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2033     __ addw(temp, length, src_pos);
 2034     __ cmpw(temp, rscratch1);
 2035     __ br(Assembler::HI, L_failed);
 2036 
 2037     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2038     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2039     __ addw(temp, length, dst_pos);
 2040     __ cmpw(temp, rscratch1);
 2041     __ br(Assembler::HI, L_failed);
 2042 
 2043     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2044     __ movw(src_pos, src_pos);
 2045     __ movw(dst_pos, dst_pos);
 2046 
 2047     BLOCK_COMMENT("arraycopy_range_checks done");
 2048   }
 2049 
 2050   // These stubs get called from some dumb test routine.
 2051   // I'll write them properly when they're called from
 2052   // something that's actually doing something.
 2053   static void fake_arraycopy_stub(address src, address dst, int count) {
 2054     assert(count == 0, "huh?");
 2055   }
 2056 
 2057 
 2058   //
 2059   //  Generate 'unsafe' array copy stub
 2060   //  Though just as safe as the other stubs, it takes an unscaled
 2061   //  size_t argument instead of an element count.
 2062   //
 2063   //  Input:
 2064   //    c_rarg0   - source array address
 2065   //    c_rarg1   - destination array address
 2066   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2067   //
 2068   // Examines the alignment of the operands and dispatches
 2069   // to a long, int, short, or byte copy loop.
 2070   //
 2071   address generate_unsafe_copy(address byte_copy_entry,
 2072                                address short_copy_entry,
 2073                                address int_copy_entry,
 2074                                address long_copy_entry) {
 2075     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2076 
 2077     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2078     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2079 
 2080     __ align(CodeEntryAlignment);
 2081     StubCodeMark mark(this, stub_id);
 2082     address start = __ pc();
 2083     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2084 
 2085     // bump this on entry, not on exit:
 2086     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2087 
 2088     __ orr(rscratch1, s, d);
 2089     __ orr(rscratch1, rscratch1, count);
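    // rscratch1 = s | d | count; its low bits give the coarsest alignment
    // common to both addresses and the byte count.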
 2090 
 2091     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2092     __ cbz(rscratch1, L_long_aligned);
 2093     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2094     __ cbz(rscratch1, L_int_aligned);
 2095     __ tbz(rscratch1, 0, L_short_aligned);
 2096     __ b(RuntimeAddress(byte_copy_entry));
 2097 
 2098     __ BIND(L_short_aligned);
 2099     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2100     __ b(RuntimeAddress(short_copy_entry));
 2101     __ BIND(L_int_aligned);
 2102     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2103     __ b(RuntimeAddress(int_copy_entry));
 2104     __ BIND(L_long_aligned);
 2105     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2106     __ b(RuntimeAddress(long_copy_entry));
 2107 
 2108     return start;
 2109   }
 2110 
 2111   //
 2112   //  Generate generic array copy stubs
 2113   //
 2114   //  Input:
 2115   //    c_rarg0    -  src oop
 2116   //    c_rarg1    -  src_pos (32-bits)
 2117   //    c_rarg2    -  dst oop
 2118   //    c_rarg3    -  dst_pos (32-bits)
 2119   //    c_rarg4    -  element count (32-bits)
 2120   //
 2121   //  Output:
 2122   //    r0 ==  0  -  success
 2123   //    r0 == -1^K - failure, where K is partial transfer count
 2124   //
 2125   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2126                                 address int_copy_entry, address oop_copy_entry,
 2127                                 address long_copy_entry, address checkcast_copy_entry) {
 2128     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2129 
 2130     Label L_failed, L_objArray;
 2131     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2132 
 2133     // Input registers
 2134     const Register src        = c_rarg0;  // source array oop
 2135     const Register src_pos    = c_rarg1;  // source position
 2136     const Register dst        = c_rarg2;  // destination array oop
 2137     const Register dst_pos    = c_rarg3;  // destination position
 2138     const Register length     = c_rarg4;
 2139 
 2140 
 2141     // Registers used as temps
 2142     const Register dst_klass  = c_rarg5;
 2143 
 2144     __ align(CodeEntryAlignment);
 2145 
 2146     StubCodeMark mark(this, stub_id);
 2147 
 2148     address start = __ pc();
 2149 
 2150     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2151 
 2152     // bump this on entry, not on exit:
 2153     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2154 
 2155     //-----------------------------------------------------------------------
 2156     // Assembler stub will be used for this call to arraycopy
 2157     // if the following conditions are met:
 2158     //
 2159     // (1) src and dst must not be null.
 2160     // (2) src_pos must not be negative.
 2161     // (3) dst_pos must not be negative.
 2162     // (4) length  must not be negative.
 2163     // (5) src klass and dst klass should be the same and not null.
 2164     // (6) src and dst should be arrays.
 2165     // (7) src_pos + length must not exceed length of src.
 2166     // (8) dst_pos + length must not exceed length of dst.
 2167     //
 2168 
 2169     //  if (src == nullptr) return -1;
 2170     __ cbz(src, L_failed);
 2171 
 2172     //  if (src_pos < 0) return -1;
 2173     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2174 
 2175     //  if (dst == nullptr) return -1;
 2176     __ cbz(dst, L_failed);
 2177 
 2178     //  if (dst_pos < 0) return -1;
 2179     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2180 
 2181     // registers used as temp
 2182     const Register scratch_length    = r16; // elements count to copy
 2183     const Register scratch_src_klass = r17; // array klass
 2184     const Register lh                = r15; // layout helper
 2185 
 2186     //  if (length < 0) return -1;
 2187     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2188     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2189 
 2190     __ load_klass(scratch_src_klass, src);
 2191 #ifdef ASSERT
 2192     //  assert(src->klass() != nullptr);
 2193     {
 2194       BLOCK_COMMENT("assert klasses not null {");
 2195       Label L1, L2;
 2196       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2197       __ bind(L1);
 2198       __ stop("broken null klass");
 2199       __ bind(L2);
 2200       __ load_klass(rscratch1, dst);
 2201       __ cbz(rscratch1, L1);     // this would be broken also
 2202       BLOCK_COMMENT("} assert klasses not null done");
 2203     }
 2204 #endif
 2205 
 2206     // Load layout helper (32-bits)
 2207     //
 2208     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2209     // 32        30    24            16              8     2                 0
 2210     //
 2211     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2212     //
 2213 
 2214     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2215 
 2216     // Handle objArrays completely differently...
 2217     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2218     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2219     __ movw(rscratch1, objArray_lh);
 2220     __ eorw(rscratch2, lh, rscratch1);
 2221     __ cbzw(rscratch2, L_objArray);
 2222 
 2223     //  if (src->klass() != dst->klass()) return -1;
 2224     __ load_klass(rscratch2, dst);
 2225     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2226     __ cbnz(rscratch2, L_failed);
 2227 
 2228     //  if (!src->is_Array()) return -1;
 2229     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2230 
 2231     // At this point, it is known to be a typeArray (array_tag 0x3).
 2232 #ifdef ASSERT
 2233     {
 2234       BLOCK_COMMENT("assert primitive array {");
 2235       Label L;
 2236       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2237       __ cmpw(lh, rscratch2);
 2238       __ br(Assembler::GE, L);
 2239       __ stop("must be a primitive array");
 2240       __ bind(L);
 2241       BLOCK_COMMENT("} assert primitive array done");
 2242     }
 2243 #endif
 2244 
 2245     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2246                            rscratch2, L_failed);
 2247 
 2248     // TypeArrayKlass
 2249     //
 2250     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2251     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2252     //
 2253 
 2254     const Register rscratch1_offset = rscratch1;    // array offset
 2255     const Register r15_elsize = lh; // element size
 2256 
 2257     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2258            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2259     __ add(src, src, rscratch1_offset);           // src array offset
 2260     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2261     BLOCK_COMMENT("choose copy loop based on element size");
 2262 
 2263     // next registers should be set before the jump to corresponding stub
 2264     const Register from     = c_rarg0;  // source array address
 2265     const Register to       = c_rarg1;  // destination array address
 2266     const Register count    = c_rarg2;  // elements count
 2267 
    // 'from', 'to' and 'count' must be written in this order because they
    // alias 'src', 'src_pos' and 'dst' respectively; each may only be
    // written once the register it aliases is no longer needed.
 2270 
 2271     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2272 
 2273     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2274     // size in bytes).  We do a simple bitwise binary search.
 2275   __ BIND(L_copy_bytes);
 2276     __ tbnz(r15_elsize, 1, L_copy_ints);
 2277     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2278     __ lea(from, Address(src, src_pos));// src_addr
 2279     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2280     __ movw(count, scratch_length); // length
 2281     __ b(RuntimeAddress(byte_copy_entry));
 2282 
 2283   __ BIND(L_copy_shorts);
 2284     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2285     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2286     __ movw(count, scratch_length); // length
 2287     __ b(RuntimeAddress(short_copy_entry));
 2288 
 2289   __ BIND(L_copy_ints);
 2290     __ tbnz(r15_elsize, 0, L_copy_longs);
 2291     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2292     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2293     __ movw(count, scratch_length); // length
 2294     __ b(RuntimeAddress(int_copy_entry));
 2295 
 2296   __ BIND(L_copy_longs);
 2297 #ifdef ASSERT
 2298     {
 2299       BLOCK_COMMENT("assert long copy {");
 2300       Label L;
 2301       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2302       __ cmpw(r15_elsize, LogBytesPerLong);
 2303       __ br(Assembler::EQ, L);
 2304       __ stop("must be long copy, but elsize is wrong");
 2305       __ bind(L);
 2306       BLOCK_COMMENT("} assert long copy done");
 2307     }
 2308 #endif
 2309     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2310     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2311     __ movw(count, scratch_length); // length
 2312     __ b(RuntimeAddress(long_copy_entry));
 2313 
 2314     // ObjArrayKlass
 2315   __ BIND(L_objArray);
 2316     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2317 
 2318     Label L_plain_copy, L_checkcast_copy;
 2319     //  test array classes for subtyping
 2320     __ load_klass(r15, dst);
 2321     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2322     __ br(Assembler::NE, L_checkcast_copy);
 2323 
 2324     // Identically typed arrays can be copied without element-wise checks.
 2325     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2326                            rscratch2, L_failed);
 2327 
 2328     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2329     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2330     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2331     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2332     __ movw(count, scratch_length); // length
 2333   __ BIND(L_plain_copy);
 2334     __ b(RuntimeAddress(oop_copy_entry));
 2335 
 2336   __ BIND(L_checkcast_copy);
 2337     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2338     {
 2339       // Before looking at dst.length, make sure dst is also an objArray.
 2340       __ ldrw(rscratch1, Address(r15, lh_offset));
 2341       __ movw(rscratch2, objArray_lh);
 2342       __ eorw(rscratch1, rscratch1, rscratch2);
 2343       __ cbnzw(rscratch1, L_failed);
 2344 
 2345       // It is safe to examine both src.length and dst.length.
 2346       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2347                              r15, L_failed);
 2348 
 2349       __ load_klass(dst_klass, dst); // reload
 2350 
 2351       // Marshal the base address arguments now, freeing registers.
 2352       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2353       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2354       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2355       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2356       __ movw(count, length);           // length (reloaded)
 2357       Register sco_temp = c_rarg3;      // this register is free now
 2358       assert_different_registers(from, to, count, sco_temp,
 2359                                  dst_klass, scratch_src_klass);
 2360       // assert_clean_int(count, sco_temp);
 2361 
 2362       // Generate the type check.
 2363       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2364       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2365 
 2366       // Smashes rscratch1, rscratch2
 2367       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2368                           L_plain_copy);
 2369 
 2370       // Fetch destination element klass from the ObjArrayKlass header.
 2371       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2372       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2373       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2374 
 2375       // the checkcast_copy loop needs two extra arguments:
 2376       assert(c_rarg3 == sco_temp, "#3 already in place");
 2377       // Set up arguments for checkcast_copy_entry.
 2378       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2379       __ b(RuntimeAddress(checkcast_copy_entry));
 2380     }
 2381 
 2382   __ BIND(L_failed);
 2383     __ mov(r0, -1);
 2384     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2385     __ ret(lr);
 2386 
 2387     return start;
 2388   }
 2389 
 2390   //
 2391   // Generate stub for array fill. If "aligned" is true, the
 2392   // "to" address is assumed to be heapword aligned.
 2393   //
 2394   // Arguments for generated stub:
 2395   //   to:    c_rarg0
 2396   //   value: c_rarg1
 2397   //   count: c_rarg2 treated as signed
 2398   //
 2399   address generate_fill(StubId stub_id) {
 2400     BasicType t;
 2401     bool aligned;
 2402 
 2403     switch (stub_id) {
 2404     case StubId::stubgen_jbyte_fill_id:
 2405       t = T_BYTE;
 2406       aligned = false;
 2407       break;
 2408     case StubId::stubgen_jshort_fill_id:
 2409       t = T_SHORT;
 2410       aligned = false;
 2411       break;
 2412     case StubId::stubgen_jint_fill_id:
 2413       t = T_INT;
 2414       aligned = false;
 2415       break;
 2416     case StubId::stubgen_arrayof_jbyte_fill_id:
 2417       t = T_BYTE;
 2418       aligned = true;
 2419       break;
 2420     case StubId::stubgen_arrayof_jshort_fill_id:
 2421       t = T_SHORT;
 2422       aligned = true;
 2423       break;
 2424     case StubId::stubgen_arrayof_jint_fill_id:
 2425       t = T_INT;
 2426       aligned = true;
 2427       break;
 2428     default:
 2429       ShouldNotReachHere();
 2430     };
 2431 
 2432     __ align(CodeEntryAlignment);
 2433     StubCodeMark mark(this, stub_id);
 2434     address start = __ pc();
 2435 
 2436     BLOCK_COMMENT("Entry:");
 2437 
    const Register to        = c_rarg0;  // destination array address
 2439     const Register value     = c_rarg1;  // value
 2440     const Register count     = c_rarg2;  // elements count
 2441 
 2442     const Register bz_base = r10;        // base for block_zero routine
 2443     const Register cnt_words = r11;      // temp register
 2444 
 2445     __ enter();
 2446 
 2447     Label L_fill_elements, L_exit1;
 2448 
 2449     int shift = -1;
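    // Widen the fill value to 32 bits and route fills of fewer than 8 bytes
    // to the element-by-element path below.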
 2450     switch (t) {
 2451       case T_BYTE:
 2452         shift = 0;
 2453         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2454         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2455         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2456         __ br(Assembler::LO, L_fill_elements);
 2457         break;
 2458       case T_SHORT:
 2459         shift = 1;
 2460         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2461         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2462         __ br(Assembler::LO, L_fill_elements);
 2463         break;
 2464       case T_INT:
 2465         shift = 2;
 2466         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2467         __ br(Assembler::LO, L_fill_elements);
 2468         break;
 2469       default: ShouldNotReachHere();
 2470     }
 2471 
    // Align the destination address to an 8-byte boundary.
 2473     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2474     if (!aligned) {
 2475       switch (t) {
 2476         case T_BYTE:
 2477           // One byte misalignment happens only for byte arrays.
 2478           __ tbz(to, 0, L_skip_align1);
 2479           __ strb(value, Address(__ post(to, 1)));
 2480           __ subw(count, count, 1);
 2481           __ bind(L_skip_align1);
 2482           // Fallthrough
 2483         case T_SHORT:
 2484           // Two bytes misalignment happens only for byte and short (char) arrays.
 2485           __ tbz(to, 1, L_skip_align2);
 2486           __ strh(value, Address(__ post(to, 2)));
 2487           __ subw(count, count, 2 >> shift);
 2488           __ bind(L_skip_align2);
 2489           // Fallthrough
 2490         case T_INT:
 2491           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2492           __ tbz(to, 2, L_skip_align4);
 2493           __ strw(value, Address(__ post(to, 4)));
 2494           __ subw(count, count, 4 >> shift);
 2495           __ bind(L_skip_align4);
 2496           break;
 2497         default: ShouldNotReachHere();
 2498       }
 2499     }
 2500 
 2501     //
 2502     //  Fill large chunks
 2503     //
 2504     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2505     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2506     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
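    // cnt_words is the number of 8-byte words to fill; count now holds only
    // the trailing elements (less than 8 bytes' worth).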
 2507     if (UseBlockZeroing) {
 2508       Label non_block_zeroing, rest;
 2509       // If the fill value is zero we can use the fast zero_words().
 2510       __ cbnz(value, non_block_zeroing);
 2511       __ mov(bz_base, to);
 2512       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2513       address tpc = __ zero_words(bz_base, cnt_words);
 2514       if (tpc == nullptr) {
 2515         fatal("CodeCache is full at generate_fill");
 2516       }
 2517       __ b(rest);
 2518       __ bind(non_block_zeroing);
 2519       __ fill_words(to, cnt_words, value);
 2520       __ bind(rest);
 2521     } else {
 2522       __ fill_words(to, cnt_words, value);
 2523     }
 2524 
 2525     // Remaining count is less than 8 bytes. Fill it by a single store.
 2526     // Note that the total length is no less than 8 bytes.
 2527     if (t == T_BYTE || t == T_SHORT) {
 2528       Label L_exit1;
 2529       __ cbzw(count, L_exit1);
 2530       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2531       __ str(value, Address(to, -8));    // overwrite some elements
 2532       __ bind(L_exit1);
 2533       __ leave();
 2534       __ ret(lr);
 2535     }
 2536 
    // Handle fills of fewer than 8 bytes.
 2538     Label L_fill_2, L_fill_4, L_exit2;
 2539     __ bind(L_fill_elements);
 2540     switch (t) {
 2541       case T_BYTE:
 2542         __ tbz(count, 0, L_fill_2);
 2543         __ strb(value, Address(__ post(to, 1)));
 2544         __ bind(L_fill_2);
 2545         __ tbz(count, 1, L_fill_4);
 2546         __ strh(value, Address(__ post(to, 2)));
 2547         __ bind(L_fill_4);
 2548         __ tbz(count, 2, L_exit2);
 2549         __ strw(value, Address(to));
 2550         break;
 2551       case T_SHORT:
 2552         __ tbz(count, 0, L_fill_4);
 2553         __ strh(value, Address(__ post(to, 2)));
 2554         __ bind(L_fill_4);
 2555         __ tbz(count, 1, L_exit2);
 2556         __ strw(value, Address(to));
 2557         break;
 2558       case T_INT:
 2559         __ cbzw(count, L_exit2);
 2560         __ strw(value, Address(to));
 2561         break;
 2562       default: ShouldNotReachHere();
 2563     }
 2564     __ bind(L_exit2);
 2565     __ leave();
 2566     __ ret(lr);
 2567     return start;
 2568   }
 2569 
  address generate_unsafecopy_common_error_exit() {
    address start_pc = __ pc();
    __ leave();
    __ mov(r0, 0);
    __ ret(lr);
    return start_pc;
  }
 2577 
 2578   //
 2579   //  Generate 'unsafe' set memory stub
 2580   //  Though just as safe as the other stubs, it takes an unscaled
 2581   //  size_t (# bytes) argument instead of an element count.
 2582   //
 2583   //  This fill operation is atomicity preserving: as long as the
 2584   //  address supplied is sufficiently aligned, all writes of up to 64
 2585   //  bits in size are single-copy atomic.
 2586   //
 2587   //  Input:
 2588   //    c_rarg0   - destination array address
 2589   //    c_rarg1   - byte count (size_t)
 2590   //    c_rarg2   - byte value
 2591   //
 2592   address generate_unsafe_setmemory() {
 2593     __ align(CodeEntryAlignment);
 2594     StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
 2595     address start = __ pc();
 2596 
 2597     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 2598     Label tail;
 2599 
 2600     UnsafeMemoryAccessMark umam(this, true, false);
 2601 
 2602     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2603 
 2604     __ dup(v0, __ T16B, value);
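    // v0 now holds the fill byte replicated across all 16 lanes.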
 2605 
 2606     if (AvoidUnalignedAccesses) {
 2607       __ cmp(count, (u1)16);
 2608       __ br(__ LO, tail);
 2609 
 2610       __ mov(rscratch1, 16);
 2611       __ andr(rscratch2, dest, 15);
 2612       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
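      // Store 16 bytes at the (possibly unaligned) start, then advance dest to
      // the next 16-byte boundary and reduce count to match.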
 2613       __ strq(v0, Address(dest));
 2614       __ sub(count, count, rscratch1);
 2615       __ add(dest, dest, rscratch1);
 2616     }
 2617 
 2618     __ subs(count, count, (u1)64);
 2619     __ br(__ LO, tail);
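    // Main loop: two store-pair instructions write 64 bytes per iteration
    // while at least 64 bytes remain.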
 2620     {
 2621       Label again;
 2622       __ bind(again);
 2623       __ stpq(v0, v0, Address(dest));
 2624       __ stpq(v0, v0, Address(dest, 32));
 2625 
 2626       __ subs(count, count, 64);
 2627       __ add(dest, dest, 64);
 2628       __ br(__ HS, again);
 2629     }
 2630 
 2631     __ bind(tail);
 2632     // The count of bytes is off by 64, but we don't need to correct
 2633     // it because we're only going to use the least-significant few
 2634     // count bits from here on.
 2635     // __ add(count, count, 64);
 2636 
 2637     {
 2638       Label dont;
 2639       __ tbz(count, exact_log2(32), dont);
 2640       __ stpq(v0, v0, __ post(dest, 32));
 2641       __ bind(dont);
 2642     }
 2643     {
 2644       Label dont;
 2645       __ tbz(count, exact_log2(16), dont);
 2646       __ strq(v0, __ post(dest, 16));
 2647       __ bind(dont);
 2648     }
 2649     {
 2650       Label dont;
 2651       __ tbz(count, exact_log2(8), dont);
 2652       __ strd(v0, __ post(dest, 8));
 2653       __ bind(dont);
 2654     }
 2655 
 2656     Label finished;
 2657     __ tst(count, 7);
 2658     __ br(__ EQ, finished);
 2659 
 2660     {
 2661       Label dont;
 2662       __ tbz(count, exact_log2(4), dont);
 2663       __ strs(v0, __ post(dest, 4));
 2664       __ bind(dont);
 2665     }
 2666     {
 2667       Label dont;
 2668       __ tbz(count, exact_log2(2), dont);
 2669       __ bfi(value, value, 8, 8);
 2670       __ strh(value, __ post(dest, 2));
 2671       __ bind(dont);
 2672     }
 2673     {
 2674       Label dont;
 2675       __ tbz(count, exact_log2(1), dont);
 2676       __ strb(value, Address(dest));
 2677       __ bind(dont);
 2678     }
 2679 
 2680     __ bind(finished);
 2681     __ leave();
 2682     __ ret(lr);
 2683 
 2684     return start;
 2685   }
 2686 
 2687   address generate_data_cache_writeback() {
 2688     const Register line        = c_rarg0;  // address of line to write back
 2689 
 2690     __ align(CodeEntryAlignment);
 2691 
 2692     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 2693     StubCodeMark mark(this, stub_id);
 2694 
 2695     address start = __ pc();
 2696     __ enter();
 2697     __ cache_wb(Address(line, 0));
 2698     __ leave();
 2699     __ ret(lr);
 2700 
 2701     return start;
 2702   }
 2703 
 2704   address generate_data_cache_writeback_sync() {
 2705     const Register is_pre     = c_rarg0;  // pre or post sync
 2706 
 2707     __ align(CodeEntryAlignment);
 2708 
 2709     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 2710     StubCodeMark mark(this, stub_id);
 2711 
    // pre wbsync is a no-op
    // post wbsync translates to a memory barrier
 2714 
 2715     Label skip;
 2716     address start = __ pc();
 2717     __ enter();
 2718     __ cbnz(is_pre, skip);
 2719     __ cache_wbsync(false);
 2720     __ bind(skip);
 2721     __ leave();
 2722     __ ret(lr);
 2723 
 2724     return start;
 2725   }
 2726 
 2727   void generate_arraycopy_stubs() {
 2728     // Some copy stubs publish a normal entry and then a 2nd 'fallback'
 2729     // entry immediately following their stack push. This can be used
 2730     // as a post-push branch target for compatible stubs when they
 2731     // identify a special case that can be handled by the fallback
    // stub, e.g. a disjoint copy stub may be used as a special-case
 2733     // fallback for its compatible conjoint copy stub.
 2734     //
    // A nopush entry is always returned in the following local and
 2736     // then published by assigning to the appropriate entry field in
 2737     // class StubRoutines. The entry value is then passed to the
 2738     // generator for the compatible stub. That means the entry must be
 2739     // listed when saving to/restoring from the AOT cache, ensuring
 2740     // that the inter-stub jumps are noted at AOT-cache save and
 2741     // relocated at AOT cache load.
 2742     address nopush_entry;
 2743 
 2744     // generate the common exit first so later stubs can rely on it if
 2745     // they want an UnsafeMemoryAccess exit non-local to the stub
 2746     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 2747     // register the stub as the default exit with class UnsafeMemoryAccess
 2748     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 2749 
    // generate and publish aarch64-specific bulk copy routines first
 2751     // so we can call them from other copy stubs
 2752     StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2753     StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2754 
 2755     StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2756     StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2757 
 2758     StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2759     StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2760 
 2761     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2762 
 2763     //*** jbyte
 2764     // Always need aligned and unaligned versions
 2765     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2766     // disjoint nopush entry is needed by conjoint copy
 2767     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2768     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
 2769     // conjoint nopush entry is needed by generic/unsafe copy
 2770     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
 2771     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2772     // disjoint arrayof nopush entry is needed by conjoint copy
 2773     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2774     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
 2775 
 2776     //*** jshort
 2777     // Always need aligned and unaligned versions
 2778     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
 2779     // disjoint nopush entry is needed by conjoint copy
 2780     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
 2781     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
 2782     // conjoint nopush entry is used by generic/unsafe copy
 2783     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
 2784     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
 2785     // disjoint arrayof nopush entry is needed by conjoint copy
 2786     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
 2787     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
 2788 
 2789     //*** jint
 2790     // Aligned versions
 2791     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
 2792     // disjoint arrayof nopush entry is needed by conjoint copy
 2793     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
 2794     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
 2795     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2796     // jint_arraycopy_nopush always points to the unaligned version
 2797     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
 2798     // disjoint nopush entry is needed by conjoint copy
 2799     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
 2800     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
 2801     // conjoint nopush entry is needed by generic/unsafe copy
 2802     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
 2803 
 2804     //*** jlong
 2805     // It is always aligned
 2806     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
 2807     // disjoint arrayof nopush entry is needed by conjoint copy
 2808     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
 2809     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
 2810     // conjoint nopush entry is needed by generic/unsafe copy
 2811     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
 2812     // disjoint normal/nopush and conjoint normal entries are not
 2813     // generated since the arrayof versions are the same
 2814     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2815     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
 2816     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2817 
 2818     //*** oops
 2819     {
 2820       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2821         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
 2822       // disjoint arrayof nopush entry is needed by conjoint copy
 2823       StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
 2824       StubRoutines::_arrayof_oop_arraycopy
 2825         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
 2826       // conjoint arrayof nopush entry is needed by generic/unsafe copy
 2827       StubRoutines::_oop_arraycopy_nopush = nopush_entry;
 2828       // Aligned versions without pre-barriers
 2829       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2830         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
 2831       // disjoint arrayof+uninit nopush entry is needed by conjoint copy
 2832       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
 2833       // note that we don't need a returned nopush entry because the
 2834       // generic/unsafe copy does not cater for uninit arrays.
 2835       StubRoutines::_arrayof_oop_arraycopy_uninit
 2836         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
 2837     }
 2838 
 2839     // for oop copies reuse arrayof entries for non-arrayof cases
 2840     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2841     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
 2842     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2843     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2844     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
 2845     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2846 
 2847     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
 2848     // checkcast nopush entry is needed by generic copy
 2849     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
 2850     // note that we don't need a returned nopush entry because the
 2851     // generic copy does not cater for uninit arrays.
 2852     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 2853 
    // unsafe arraycopy may fall back on conjoint stubs
 2855     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2856                                                               StubRoutines::_jshort_arraycopy_nopush,
 2857                                                               StubRoutines::_jint_arraycopy_nopush,
 2858                                                               StubRoutines::_jlong_arraycopy_nopush);
 2859 
    // generic arraycopy may fall back on conjoint stubs
 2861     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2862                                                                StubRoutines::_jshort_arraycopy_nopush,
 2863                                                                StubRoutines::_jint_arraycopy_nopush,
 2864                                                                StubRoutines::_oop_arraycopy_nopush,
 2865                                                                StubRoutines::_jlong_arraycopy_nopush,
 2866                                                                StubRoutines::_checkcast_arraycopy_nopush);
 2867 
 2868     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 2869     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 2870     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 2871     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 2872     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 2873     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 2874   }
 2875 
 2876   void generate_math_stubs() { Unimplemented(); }
 2877 
 2878   // Arguments:
 2879   //
 2880   // Inputs:
 2881   //   c_rarg0   - source byte array address
 2882   //   c_rarg1   - destination byte array address
  //   c_rarg2   - sessionKe (encryption key schedule) in little endian int array
 2884   //
  address generate_aescrypt_encryptBlock() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
 2887     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 2888     StubCodeMark mark(this, stub_id);
 2889 
 2890     const Register from        = c_rarg0;  // source array address
 2891     const Register to          = c_rarg1;  // destination array address
 2892     const Register key         = c_rarg2;  // key array address
 2893     const Register keylen      = rscratch1;
 2894 
 2895     address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame
 2897 
 2898     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2899 
 2900     __ aesenc_loadkeys(key, keylen);
 2901     __ aesecb_encrypt(from, to, keylen);
 2902 
 2903     __ mov(r0, 0);
 2904 
 2905     __ leave();
 2906     __ ret(lr);
 2907 
 2908     return start;
 2909   }
 2910 
 2911   // Arguments:
 2912   //
 2913   // Inputs:
 2914   //   c_rarg0   - source byte array address
 2915   //   c_rarg1   - destination byte array address
  //   c_rarg2   - sessionKd (decryption key schedule) in little endian int array
 2917   //
 2918   address generate_aescrypt_decryptBlock() {
 2919     assert(UseAES, "need AES cryptographic extension support");
 2920     __ align(CodeEntryAlignment);
 2921     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 2922     StubCodeMark mark(this, stub_id);
 2923     Label L_doLast;
 2924 
 2925     const Register from        = c_rarg0;  // source array address
 2926     const Register to          = c_rarg1;  // destination array address
 2927     const Register key         = c_rarg2;  // key array address
 2928     const Register keylen      = rscratch1;
 2929 
 2930     address start = __ pc();
 2931     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2932 
 2933     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2934 
 2935     __ aesecb_decrypt(from, to, key, keylen);
 2936 
 2937     __ mov(r0, 0);
 2938 
 2939     __ leave();
 2940     __ ret(lr);
 2941 
 2942     return start;
 2943   }
 2944 
 2945   // Arguments:
 2946   //
 2947   // Inputs:
 2948   //   c_rarg0   - source byte array address
 2949   //   c_rarg1   - destination byte array address
  //   c_rarg2   - sessionKe (encryption key schedule) in little endian int array
 2951   //   c_rarg3   - r vector byte array address
 2952   //   c_rarg4   - input length
 2953   //
 2954   // Output:
  //   r0        - input length
 2956   //
 2957   address generate_cipherBlockChaining_encryptAESCrypt() {
 2958     assert(UseAES, "need AES cryptographic extension support");
 2959     __ align(CodeEntryAlignment);
 2960     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 2961     StubCodeMark mark(this, stub_id);
 2962 
 2963     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2964 
 2965     const Register from        = c_rarg0;  // source array address
 2966     const Register to          = c_rarg1;  // destination array address
 2967     const Register key         = c_rarg2;  // key array address
 2968     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 2969                                            // and left with the results of the last encryption block
 2970     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2971     const Register keylen      = rscratch1;
 2972 
 2973     address start = __ pc();
 2974 
 2975       __ enter();
 2976 
 2977       __ movw(rscratch2, len_reg);
 2978 
 2979       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2980 
 2981       __ ld1(v0, __ T16B, rvec);
 2982 
 2983       __ cmpw(keylen, 52);
 2984       __ br(Assembler::CC, L_loadkeys_44);
 2985       __ br(Assembler::EQ, L_loadkeys_52);
 2986 
 2987       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2988       __ rev32(v17, __ T16B, v17);
 2989       __ rev32(v18, __ T16B, v18);
 2990     __ BIND(L_loadkeys_52);
 2991       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2992       __ rev32(v19, __ T16B, v19);
 2993       __ rev32(v20, __ T16B, v20);
 2994     __ BIND(L_loadkeys_44);
 2995       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2996       __ rev32(v21, __ T16B, v21);
 2997       __ rev32(v22, __ T16B, v22);
 2998       __ rev32(v23, __ T16B, v23);
 2999       __ rev32(v24, __ T16B, v24);
 3000       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3001       __ rev32(v25, __ T16B, v25);
 3002       __ rev32(v26, __ T16B, v26);
 3003       __ rev32(v27, __ T16B, v27);
 3004       __ rev32(v28, __ T16B, v28);
 3005       __ ld1(v29, v30, v31, __ T16B, key);
 3006       __ rev32(v29, __ T16B, v29);
 3007       __ rev32(v30, __ T16B, v30);
 3008       __ rev32(v31, __ T16B, v31);
 3009 
 3010     __ BIND(L_aes_loop);
 3011       __ ld1(v1, __ T16B, __ post(from, 16));
 3012       __ eor(v0, __ T16B, v0, v1);
 3013 
 3014       __ br(Assembler::CC, L_rounds_44);
 3015       __ br(Assembler::EQ, L_rounds_52);
 3016 
 3017       __ aese(v0, v17); __ aesmc(v0, v0);
 3018       __ aese(v0, v18); __ aesmc(v0, v0);
 3019     __ BIND(L_rounds_52);
 3020       __ aese(v0, v19); __ aesmc(v0, v0);
 3021       __ aese(v0, v20); __ aesmc(v0, v0);
 3022     __ BIND(L_rounds_44);
 3023       __ aese(v0, v21); __ aesmc(v0, v0);
 3024       __ aese(v0, v22); __ aesmc(v0, v0);
 3025       __ aese(v0, v23); __ aesmc(v0, v0);
 3026       __ aese(v0, v24); __ aesmc(v0, v0);
 3027       __ aese(v0, v25); __ aesmc(v0, v0);
 3028       __ aese(v0, v26); __ aesmc(v0, v0);
 3029       __ aese(v0, v27); __ aesmc(v0, v0);
 3030       __ aese(v0, v28); __ aesmc(v0, v0);
 3031       __ aese(v0, v29); __ aesmc(v0, v0);
 3032       __ aese(v0, v30);
 3033       __ eor(v0, __ T16B, v0, v31);
 3034 
 3035       __ st1(v0, __ T16B, __ post(to, 16));
 3036 
 3037       __ subw(len_reg, len_reg, 16);
 3038       __ cbnzw(len_reg, L_aes_loop);
 3039 
 3040       __ st1(v0, __ T16B, rvec);
 3041 
 3042       __ mov(r0, rscratch2);
 3043 
 3044       __ leave();
 3045       __ ret(lr);
 3046 
    return start;
 3048   }
 3049 
 3050   // Arguments:
 3051   //
 3052   // Inputs:
 3053   //   c_rarg0   - source byte array address
 3054   //   c_rarg1   - destination byte array address
  //   c_rarg2   - sessionKd (decryption key schedule) in little endian int array
 3056   //   c_rarg3   - r vector byte array address
 3057   //   c_rarg4   - input length
 3058   //
 3059   // Output:
 3060   //   r0        - input length
 3061   //
 3062   address generate_cipherBlockChaining_decryptAESCrypt() {
 3063     assert(UseAES, "need AES cryptographic extension support");
 3064     __ align(CodeEntryAlignment);
 3065     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3066     StubCodeMark mark(this, stub_id);
 3067 
 3068     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3069 
 3070     const Register from        = c_rarg0;  // source array address
 3071     const Register to          = c_rarg1;  // destination array address
 3072     const Register key         = c_rarg2;  // key array address
 3073     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 3074                                            // and left with the results of the last encryption block
 3075     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3076     const Register keylen      = rscratch1;
 3077 
 3078     address start = __ pc();
 3079 
 3080       __ enter();
 3081 
 3082       __ movw(rscratch2, len_reg);
 3083 
 3084       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3085 
 3086       __ ld1(v2, __ T16B, rvec);
 3087 
 3088       __ ld1(v31, __ T16B, __ post(key, 16));
 3089       __ rev32(v31, __ T16B, v31);
 3090 
 3091       __ cmpw(keylen, 52);
 3092       __ br(Assembler::CC, L_loadkeys_44);
 3093       __ br(Assembler::EQ, L_loadkeys_52);
 3094 
 3095       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3096       __ rev32(v17, __ T16B, v17);
 3097       __ rev32(v18, __ T16B, v18);
 3098     __ BIND(L_loadkeys_52);
 3099       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3100       __ rev32(v19, __ T16B, v19);
 3101       __ rev32(v20, __ T16B, v20);
 3102     __ BIND(L_loadkeys_44);
 3103       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3104       __ rev32(v21, __ T16B, v21);
 3105       __ rev32(v22, __ T16B, v22);
 3106       __ rev32(v23, __ T16B, v23);
 3107       __ rev32(v24, __ T16B, v24);
 3108       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3109       __ rev32(v25, __ T16B, v25);
 3110       __ rev32(v26, __ T16B, v26);
 3111       __ rev32(v27, __ T16B, v27);
 3112       __ rev32(v28, __ T16B, v28);
 3113       __ ld1(v29, v30, __ T16B, key);
 3114       __ rev32(v29, __ T16B, v29);
 3115       __ rev32(v30, __ T16B, v30);
 3116 
 3117     __ BIND(L_aes_loop);
 3118       __ ld1(v0, __ T16B, __ post(from, 16));
 3119       __ orr(v1, __ T16B, v0, v0);
 3120 
 3121       __ br(Assembler::CC, L_rounds_44);
 3122       __ br(Assembler::EQ, L_rounds_52);
 3123 
 3124       __ aesd(v0, v17); __ aesimc(v0, v0);
 3125       __ aesd(v0, v18); __ aesimc(v0, v0);
 3126     __ BIND(L_rounds_52);
 3127       __ aesd(v0, v19); __ aesimc(v0, v0);
 3128       __ aesd(v0, v20); __ aesimc(v0, v0);
 3129     __ BIND(L_rounds_44);
 3130       __ aesd(v0, v21); __ aesimc(v0, v0);
 3131       __ aesd(v0, v22); __ aesimc(v0, v0);
 3132       __ aesd(v0, v23); __ aesimc(v0, v0);
 3133       __ aesd(v0, v24); __ aesimc(v0, v0);
 3134       __ aesd(v0, v25); __ aesimc(v0, v0);
 3135       __ aesd(v0, v26); __ aesimc(v0, v0);
 3136       __ aesd(v0, v27); __ aesimc(v0, v0);
 3137       __ aesd(v0, v28); __ aesimc(v0, v0);
 3138       __ aesd(v0, v29); __ aesimc(v0, v0);
 3139       __ aesd(v0, v30);
 3140       __ eor(v0, __ T16B, v0, v31);
 3141       __ eor(v0, __ T16B, v0, v2);
 3142 
 3143       __ st1(v0, __ T16B, __ post(to, 16));
 3144       __ orr(v2, __ T16B, v1, v1);
 3145 
 3146       __ subw(len_reg, len_reg, 16);
 3147       __ cbnzw(len_reg, L_aes_loop);
 3148 
 3149       __ st1(v2, __ T16B, rvec);
 3150 
 3151       __ mov(r0, rscratch2);
 3152 
 3153       __ leave();
 3154       __ ret(lr);
 3155 
 3156     return start;
 3157   }
 3158 
 3159   // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: in (the 128-bit value) and inc (the 64-bit increment); both are preserved.
  // The least-significant 64-bit word is held in the upper dword of each vector.
  // The lower dword of inc must be zero.
 3163   // Output: result
 3164   void be_add_128_64(FloatRegister result, FloatRegister in,
 3165                      FloatRegister inc, FloatRegister tmp) {
 3166     assert_different_registers(result, tmp, inc);
 3167 
 3168     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3169                                            // input
 3170     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
 3171     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3172                                            // MSD == 0 (must be!) to LSD
 3173     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3174   }
 3175 
 3176   // CTR AES crypt.
 3177   // Arguments:
 3178   //
 3179   // Inputs:
 3180   //   c_rarg0   - source byte array address
 3181   //   c_rarg1   - destination byte array address
  //   c_rarg2   - sessionKe (encryption key schedule) in little endian int array
 3183   //   c_rarg3   - counter vector byte array address
 3184   //   c_rarg4   - input length
 3185   //   c_rarg5   - saved encryptedCounter start
 3186   //   c_rarg6   - saved used length
 3187   //
 3188   // Output:
 3189   //   r0       - input length
 3190   //
 3191   address generate_counterMode_AESCrypt() {
 3192     const Register in = c_rarg0;
 3193     const Register out = c_rarg1;
 3194     const Register key = c_rarg2;
 3195     const Register counter = c_rarg3;
 3196     const Register saved_len = c_rarg4, len = r10;
 3197     const Register saved_encrypted_ctr = c_rarg5;
 3198     const Register used_ptr = c_rarg6, used = r12;
 3199 
 3200     const Register offset = r7;
 3201     const Register keylen = r11;
 3202 
 3203     const unsigned char block_size = 16;
 3204     const int bulk_width = 4;
 3205     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3206     // performance with larger data sizes, but it also means that the
 3207     // fast path isn't used until you have at least 8 blocks, and up
 3208     // to 127 bytes of data will be executed on the slow path. For
 3209     // that reason, and also so as not to blow away too much icache, 4
 3210     // blocks seems like a sensible compromise.
 3211 
 3212     // Algorithm:
 3213     //
 3214     //    if (len == 0) {
 3215     //        goto DONE;
 3216     //    }
 3217     //    int result = len;
 3218     //    do {
 3219     //        if (used >= blockSize) {
 3220     //            if (len >= bulk_width * blockSize) {
 3221     //                CTR_large_block();
 3222     //                if (len == 0)
 3223     //                    goto DONE;
 3224     //            }
 3225     //            for (;;) {
 3226     //                16ByteVector v0 = counter;
 3227     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3228     //                used = 0;
 3229     //                if (len < blockSize)
 3230     //                    break;    /* goto NEXT */
 3231     //                16ByteVector v1 = load16Bytes(in, offset);
 3232     //                v1 = v1 ^ encryptedCounter;
 3233     //                store16Bytes(out, offset);
 3234     //                used = blockSize;
 3235     //                offset += blockSize;
 3236     //                len -= blockSize;
 3237     //                if (len == 0)
 3238     //                    goto DONE;
 3239     //            }
 3240     //        }
 3241     //      NEXT:
 3242     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3243     //        len--;
 3244     //    } while (len != 0);
 3245     //  DONE:
 3246     //    return result;
 3247     //
 3248     // CTR_large_block()
 3249     //    Wide bulk encryption of whole blocks.
 3250 
 3251     __ align(CodeEntryAlignment);
 3252     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3253     StubCodeMark mark(this, stub_id);
 3254     const address start = __ pc();
 3255     __ enter();
 3256 
 3257     Label DONE, CTR_large_block, large_block_return;
 3258     __ ldrw(used, Address(used_ptr));
 3259     __ cbzw(saved_len, DONE);
 3260 
 3261     __ mov(len, saved_len);
 3262     __ mov(offset, 0);
 3263 
 3264     // Compute #rounds for AES based on the length of the key array
 3265     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3266 
 3267     __ aesenc_loadkeys(key, keylen);
 3268 
 3269     {
 3270       Label L_CTR_loop, NEXT;
 3271 
 3272       __ bind(L_CTR_loop);
 3273 
 3274       __ cmp(used, block_size);
 3275       __ br(__ LO, NEXT);
 3276 
 3277       // Maybe we have a lot of data
 3278       __ subsw(rscratch1, len, bulk_width * block_size);
 3279       __ br(__ HS, CTR_large_block);
 3280       __ BIND(large_block_return);
 3281       __ cbzw(len, DONE);
 3282 
 3283       // Setup the counter
 3284       __ movi(v4, __ T4S, 0);
 3285       __ movi(v5, __ T4S, 1);
 3286       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3287 
 3288       // 128-bit big-endian increment
 3289       __ ld1(v0, __ T16B, counter);
 3290       __ rev64(v16, __ T16B, v0);
 3291       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3292       __ rev64(v16, __ T16B, v16);
 3293       __ st1(v16, __ T16B, counter);
 3294       // Previous counter value is in v0
 3295       // v4 contains { 0, 1 }
 3296 
 3297       {
 3298         // We have fewer than bulk_width blocks of data left. Encrypt
 3299         // them one by one until there is less than a full block
 3300         // remaining, being careful to save both the encrypted counter
 3301         // and the counter.
 3302 
 3303         Label inner_loop;
 3304         __ bind(inner_loop);
 3305         // Counter to encrypt is in v0
 3306         __ aesecb_encrypt(noreg, noreg, keylen);
 3307         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3308 
 3309         // Do we have a remaining full block?
 3310 
 3311         __ mov(used, 0);
 3312         __ cmp(len, block_size);
 3313         __ br(__ LO, NEXT);
 3314 
 3315         // Yes, we have a full block
 3316         __ ldrq(v1, Address(in, offset));
 3317         __ eor(v1, __ T16B, v1, v0);
 3318         __ strq(v1, Address(out, offset));
 3319         __ mov(used, block_size);
 3320         __ add(offset, offset, block_size);
 3321 
 3322         __ subw(len, len, block_size);
 3323         __ cbzw(len, DONE);
 3324 
 3325         // Increment the counter, store it back
 3326         __ orr(v0, __ T16B, v16, v16);
 3327         __ rev64(v16, __ T16B, v16);
 3328         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3329         __ rev64(v16, __ T16B, v16);
 3330         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3331 
 3332         __ b(inner_loop);
 3333       }
 3334 
 3335       __ BIND(NEXT);
 3336 
 3337       // Encrypt a single byte, and loop.
 3338       // We expect this to be a rare event.
 3339       __ ldrb(rscratch1, Address(in, offset));
 3340       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3341       __ eor(rscratch1, rscratch1, rscratch2);
 3342       __ strb(rscratch1, Address(out, offset));
 3343       __ add(offset, offset, 1);
 3344       __ add(used, used, 1);
      __ subw(len, len, 1);
 3346       __ cbnzw(len, L_CTR_loop);
 3347     }
 3348 
 3349     __ bind(DONE);
 3350     __ strw(used, Address(used_ptr));
 3351     __ mov(r0, saved_len);
 3352 
 3353     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3354     __ ret(lr);
 3355 
 3356     // Bulk encryption
 3357 
    __ BIND(CTR_large_block);
 3359     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3360 
 3361     if (bulk_width == 8) {
 3362       __ sub(sp, sp, 4 * 16);
 3363       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3364     }
 3365     __ sub(sp, sp, 4 * 16);
 3366     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3367     RegSet saved_regs = (RegSet::of(in, out, offset)
 3368                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3369     __ push(saved_regs, sp);
 3370     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3371     __ add(in, in, offset);
 3372     __ add(out, out, offset);
 3373 
 3374     // Keys should already be loaded into the correct registers
 3375 
 3376     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3377     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3378 
 3379     // AES/CTR loop
 3380     {
 3381       Label L_CTR_loop;
 3382       __ BIND(L_CTR_loop);
 3383 
 3384       // Setup the counters
 3385       __ movi(v8, __ T4S, 0);
 3386       __ movi(v9, __ T4S, 1);
 3387       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3388 
 3389       for (int i = 0; i < bulk_width; i++) {
 3390         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3391         __ rev64(v0_ofs, __ T16B, v16);
 3392         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3393       }
 3394 
 3395       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3396 
 3397       // Encrypt the counters
 3398       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3399 
 3400       if (bulk_width == 8) {
 3401         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3402       }
 3403 
 3404       // XOR the encrypted counters with the inputs
 3405       for (int i = 0; i < bulk_width; i++) {
 3406         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3407         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3408         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3409       }
 3410 
 3411       // Write the encrypted data
 3412       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3413       if (bulk_width == 8) {
 3414         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3415       }
 3416 
 3417       __ subw(len, len, 16 * bulk_width);
 3418       __ cbnzw(len, L_CTR_loop);
 3419     }
 3420 
 3421     // Save the counter back where it goes
 3422     __ rev64(v16, __ T16B, v16);
 3423     __ st1(v16, __ T16B, counter);
 3424 
 3425     __ pop(saved_regs, sp);
 3426 
 3427     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3428     if (bulk_width == 8) {
 3429       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3430     }
 3431 
 3432     __ andr(rscratch1, len, -16 * bulk_width);
 3433     __ sub(len, len, rscratch1);
 3434     __ add(offset, offset, rscratch1);
 3435     __ mov(used, 16);
 3436     __ strw(used, Address(used_ptr));
 3437     __ b(large_block_return);
 3438 
 3439     return start;
 3440   }
 3441 
 3442   // Vector AES Galois Counter Mode implementation. Parameters:
 3443   //
 3444   // in = c_rarg0
 3445   // len = c_rarg1
 3446   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3447   // out = c_rarg3
 3448   // key = c_rarg4
 3449   // state = c_rarg5 - GHASH.state
 3450   // subkeyHtbl = c_rarg6 - powers of H
 3451   // counter = c_rarg7 - 16 bytes of CTR
 3452   // return - number of processed bytes
 3453   address generate_galoisCounterMode_AESCrypt() {
 3454     Label ghash_polynomial; // local data generated after code
 3455 
    __ align(CodeEntryAlignment);
 3457     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3458     StubCodeMark mark(this, stub_id);
 3459     address start = __ pc();
 3460     __ enter();
 3461 
 3462     const Register in = c_rarg0;
 3463     const Register len = c_rarg1;
 3464     const Register ct = c_rarg2;
 3465     const Register out = c_rarg3;
    // (the counter block passed in c_rarg7 is read on entry and updated
    // with the incremented counter on exit)
 3467 
 3468     const Register key = c_rarg4;
 3469     const Register state = c_rarg5;
 3470 
 3471     const Register subkeyHtbl = c_rarg6;
 3472 
 3473     const Register counter = c_rarg7;
 3474 
 3475     const Register keylen = r10;
 3476     // Save state before entering routine
 3477     __ sub(sp, sp, 4 * 16);
 3478     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3479     __ sub(sp, sp, 4 * 16);
 3480     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3481 
 3482     // __ andr(len, len, -512);
 3483     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3484     __ str(len, __ pre(sp, -2 * wordSize));
 3485 
 3486     Label DONE;
 3487     __ cbz(len, DONE);
 3488 
 3489     // Compute #rounds for AES based on the length of the key array
 3490     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3491 
 3492     __ aesenc_loadkeys(key, keylen);
 3493     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3494     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3495 
 3496     // AES/CTR loop
 3497     {
 3498       Label L_CTR_loop;
 3499       __ BIND(L_CTR_loop);
 3500 
 3501       // Setup the counters
 3502       __ movi(v8, __ T4S, 0);
 3503       __ movi(v9, __ T4S, 1);
 3504       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3505 
 3506       assert(v0->encoding() < v8->encoding(), "");
 3507       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3508         FloatRegister f = as_FloatRegister(i);
 3509         __ rev32(f, __ T16B, v16);
 3510         __ addv(v16, __ T4S, v16, v8);
 3511       }
 3512 
 3513       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3514 
 3515       // Encrypt the counters
 3516       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3517 
 3518       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3519 
 3520       // XOR the encrypted counters with the inputs
 3521       for (int i = 0; i < 8; i++) {
 3522         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3523         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3524         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3525       }
 3526       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3527       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3528 
 3529       __ subw(len, len, 16 * 8);
 3530       __ cbnzw(len, L_CTR_loop);
 3531     }
 3532 
 3533     __ rev32(v16, __ T16B, v16);
 3534     __ st1(v16, __ T16B, counter);
 3535 
 3536     __ ldr(len, Address(sp));
 3537     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3538 
 3539     // GHASH/CTR loop
 3540     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3541                                 len, /*unrolls*/4);
 3542 
 3543 #ifdef ASSERT
 3544     { Label L;
 3545       __ cmp(len, (unsigned char)0);
 3546       __ br(Assembler::EQ, L);
 3547       __ stop("stubGenerator: abort");
 3548       __ bind(L);
 3549   }
 3550 #endif
 3551 
 3552   __ bind(DONE);
 3553     // Return the number of bytes processed
 3554     __ ldr(r0, __ post(sp, 2 * wordSize));
 3555 
 3556     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3557     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3558 
 3559     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3560     __ ret(lr);
 3561 
 3562     // bind label and generate polynomial data
 3563     __ align(wordSize * 2);
 3564     __ bind(ghash_polynomial);
 3565     __ emit_int64(0x87);  // The low-order bits of the field
 3566                           // polynomial (i.e. p = z^7+z^2+z+1)
 3567                           // repeated in the low and high parts of a
 3568                           // 128-bit vector
 3569     __ emit_int64(0x87);
 3570 
 3571     return start;
 3572   }
 3573 
 3574   class Cached64Bytes {
 3575   private:
 3576     MacroAssembler *_masm;
 3577     Register _regs[8];
 3578 
 3579   public:
 3580     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3582       auto it = rs.begin();
 3583       for (auto &r: _regs) {
 3584         r = *it;
 3585         ++it;
 3586       }
 3587     }
 3588 
 3589     void gen_loads(Register base) {
 3590       for (int i = 0; i < 8; i += 2) {
 3591         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3592       }
 3593     }
 3594 
 3595     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3596     void extract_u32(Register dest, int i) {
 3597       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3598     }
 3599   };
 3600 
 3601   // Utility routines for md5.
 3602   // Clobbers r10 and r11.
 3603   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3604               int k, int s, int t) {
 3605     Register rscratch3 = r10;
 3606     Register rscratch4 = r11;
 3607 
 3608     __ eorw(rscratch3, r3, r4);
 3609     __ movw(rscratch2, t);
 3610     __ andw(rscratch3, rscratch3, r2);
 3611     __ addw(rscratch4, r1, rscratch2);
 3612     reg_cache.extract_u32(rscratch1, k);
 3613     __ eorw(rscratch3, rscratch3, r4);
 3614     __ addw(rscratch4, rscratch4, rscratch1);
 3615     __ addw(rscratch3, rscratch3, rscratch4);
 3616     __ rorw(rscratch2, rscratch3, 32 - s);
 3617     __ addw(r1, rscratch2, r2);
 3618   }
 3619 
 3620   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3621               int k, int s, int t) {
 3622     Register rscratch3 = r10;
 3623     Register rscratch4 = r11;
 3624 
 3625     reg_cache.extract_u32(rscratch1, k);
 3626     __ movw(rscratch2, t);
 3627     __ addw(rscratch4, r1, rscratch2);
 3628     __ addw(rscratch4, rscratch4, rscratch1);
 3629     __ bicw(rscratch2, r3, r4);
 3630     __ andw(rscratch3, r2, r4);
 3631     __ addw(rscratch2, rscratch2, rscratch4);
 3632     __ addw(rscratch2, rscratch2, rscratch3);
 3633     __ rorw(rscratch2, rscratch2, 32 - s);
 3634     __ addw(r1, rscratch2, r2);
 3635   }
 3636 
 3637   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3638               int k, int s, int t) {
 3639     Register rscratch3 = r10;
 3640     Register rscratch4 = r11;
 3641 
 3642     __ eorw(rscratch3, r3, r4);
 3643     __ movw(rscratch2, t);
 3644     __ addw(rscratch4, r1, rscratch2);
 3645     reg_cache.extract_u32(rscratch1, k);
 3646     __ eorw(rscratch3, rscratch3, r2);
 3647     __ addw(rscratch4, rscratch4, rscratch1);
 3648     __ addw(rscratch3, rscratch3, rscratch4);
 3649     __ rorw(rscratch2, rscratch3, 32 - s);
 3650     __ addw(r1, rscratch2, r2);
 3651   }
 3652 
 3653   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3654               int k, int s, int t) {
 3655     Register rscratch3 = r10;
 3656     Register rscratch4 = r11;
 3657 
 3658     __ movw(rscratch3, t);
 3659     __ ornw(rscratch2, r2, r4);
 3660     __ addw(rscratch4, r1, rscratch3);
 3661     reg_cache.extract_u32(rscratch1, k);
 3662     __ eorw(rscratch3, rscratch2, r3);
 3663     __ addw(rscratch4, rscratch4, rscratch1);
 3664     __ addw(rscratch3, rscratch3, rscratch4);
 3665     __ rorw(rscratch2, rscratch3, 32 - s);
 3666     __ addw(r1, rscratch2, r2);
 3667   }
 3668 
 3669   // Arguments:
 3670   //
 3671   // Inputs:
 3672   //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   MD5.state
 3674   //   c_rarg2   - int     offset
 3675   //   c_rarg3   - int     limit
 3676   //
 3677   address generate_md5_implCompress(StubId stub_id) {
 3678     bool multi_block;
 3679     switch (stub_id) {
 3680     case StubId::stubgen_md5_implCompress_id:
 3681       multi_block = false;
 3682       break;
 3683     case StubId::stubgen_md5_implCompressMB_id:
 3684       multi_block = true;
 3685       break;
 3686     default:
 3687       ShouldNotReachHere();
 3688     }
 3689     __ align(CodeEntryAlignment);
 3690 
 3691     StubCodeMark mark(this, stub_id);
 3692     address start = __ pc();
 3693 
 3694     Register buf       = c_rarg0;
 3695     Register state     = c_rarg1;
 3696     Register ofs       = c_rarg2;
 3697     Register limit     = c_rarg3;
 3698     Register a         = r4;
 3699     Register b         = r5;
 3700     Register c         = r6;
 3701     Register d         = r7;
 3702     Register rscratch3 = r10;
 3703     Register rscratch4 = r11;
 3704 
 3705     Register state_regs[2] = { r12, r13 };
 3706     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3707     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3708 
 3709     __ push(saved_regs, sp);
 3710 
 3711     __ ldp(state_regs[0], state_regs[1], Address(state));
 3712     __ ubfx(a, state_regs[0],  0, 32);
 3713     __ ubfx(b, state_regs[0], 32, 32);
 3714     __ ubfx(c, state_regs[1],  0, 32);
 3715     __ ubfx(d, state_regs[1], 32, 32);
 3716 
 3717     Label md5_loop;
 3718     __ BIND(md5_loop);
 3719 
 3720     reg_cache.gen_loads(buf);
 3721 
 3722     // Round 1
 3723     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3724     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3725     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3726     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3727     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3728     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3729     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3730     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3731     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3732     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3733     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3734     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3735     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3736     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3737     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3738     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3739 
 3740     // Round 2
 3741     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3742     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3743     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3744     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3745     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3746     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3747     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3748     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3749     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3750     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3751     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3752     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3753     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3754     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3755     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3756     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3757 
 3758     // Round 3
 3759     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3760     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3761     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3762     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3763     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3764     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3765     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3766     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3767     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3768     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3769     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3770     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3771     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3772     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3773     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3774     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3775 
 3776     // Round 4
 3777     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3778     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3779     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3780     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3781     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3782     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3783     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3784     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3785     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3786     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3787     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3788     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3789     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3790     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3791     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3792     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3793 
 3794     __ addw(a, state_regs[0], a);
 3795     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3796     __ addw(b, rscratch2, b);
 3797     __ addw(c, state_regs[1], c);
 3798     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3799     __ addw(d, rscratch4, d);
 3800 
 3801     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3802     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3803 
 3804     if (multi_block) {
 3805       __ add(buf, buf, 64);
 3806       __ add(ofs, ofs, 64);
 3807       __ cmp(ofs, limit);
 3808       __ br(Assembler::LE, md5_loop);
 3809       __ mov(c_rarg0, ofs); // return ofs
 3810     }
 3811 
 3812     // write hash values back in the correct order
 3813     __ stp(state_regs[0], state_regs[1], Address(state));
 3814 
 3815     __ pop(saved_regs, sp);
 3816 
 3817     __ ret(lr);
 3818 
 3819     return start;
 3820   }
 3821 
 3822   // Arguments:
 3823   //
 3824   // Inputs:
 3825   //   c_rarg0   - byte[]  source+offset
 3826   //   c_rarg1   - int[]   SHA.state
 3827   //   c_rarg2   - int     offset
 3828   //   c_rarg3   - int     limit
 3829   //
 3830   address generate_sha1_implCompress(StubId stub_id) {
 3831     bool multi_block;
 3832     switch (stub_id) {
 3833     case StubId::stubgen_sha1_implCompress_id:
 3834       multi_block = false;
 3835       break;
 3836     case StubId::stubgen_sha1_implCompressMB_id:
 3837       multi_block = true;
 3838       break;
 3839     default:
 3840       ShouldNotReachHere();
 3841     }
 3842 
 3843     __ align(CodeEntryAlignment);
 3844 
 3845     StubCodeMark mark(this, stub_id);
 3846     address start = __ pc();
 3847 
 3848     Register buf   = c_rarg0;
 3849     Register state = c_rarg1;
 3850     Register ofs   = c_rarg2;
 3851     Register limit = c_rarg3;
 3852 
 3853     Label keys;
 3854     Label sha1_loop;
 3855 
 3856     // load the keys into v0..v3
 3857     __ adr(rscratch1, keys);
 3858     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3859     // load 5 words state into v6, v7
 3860     __ ldrq(v6, Address(state, 0));
 3861     __ ldrs(v7, Address(state, 16));
 3862 
 3863 
 3864     __ BIND(sha1_loop);
 3865     // load 64 bytes of data into v16..v19
 3866     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3867     __ rev32(v16, __ T16B, v16);
 3868     __ rev32(v17, __ T16B, v17);
 3869     __ rev32(v18, __ T16B, v18);
 3870     __ rev32(v19, __ T16B, v19);
 3871 
 3872     // do the sha1
 3873     __ addv(v4, __ T4S, v16, v0);
 3874     __ orr(v20, __ T16B, v6, v6);
 3875 
 3876     FloatRegister d0 = v16;
 3877     FloatRegister d1 = v17;
 3878     FloatRegister d2 = v18;
 3879     FloatRegister d3 = v19;
 3880 
 3881     for (int round = 0; round < 20; round++) {
 3882       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3883       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3884       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3885       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3886       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3887 
 3888       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3889       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3890       __ sha1h(tmp2, __ T4S, v20);
 3891       if (round < 5)
 3892         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3893       else if (round < 10 || round >= 15)
 3894         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3895       else
 3896         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3897       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3898 
 3899       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3900     }
 3901 
 3902     __ addv(v7, __ T2S, v7, v21);
 3903     __ addv(v6, __ T4S, v6, v20);
 3904 
 3905     if (multi_block) {
 3906       __ add(ofs, ofs, 64);
 3907       __ cmp(ofs, limit);
 3908       __ br(Assembler::LE, sha1_loop);
 3909       __ mov(c_rarg0, ofs); // return ofs
 3910     }
 3911 
 3912     __ strq(v6, Address(state, 0));
 3913     __ strs(v7, Address(state, 16));
 3914 
 3915     __ ret(lr);
 3916 
 3917     __ bind(keys);
 3918     __ emit_int32(0x5a827999);
 3919     __ emit_int32(0x6ed9eba1);
 3920     __ emit_int32(0x8f1bbcdc);
 3921     __ emit_int32(0xca62c1d6);
 3922 
 3923     return start;
 3924   }
 3925 
 3926 
 3927   // Arguments:
 3928   //
 3929   // Inputs:
 3930   //   c_rarg0   - byte[]  source+offset
 3931   //   c_rarg1   - int[]   SHA.state
 3932   //   c_rarg2   - int     offset
 3933   //   c_rarg3   - int     limit
 3934   //
 3935   address generate_sha256_implCompress(StubId stub_id) {
 3936     bool multi_block;
 3937     switch (stub_id) {
 3938     case StubId::stubgen_sha256_implCompress_id:
 3939       multi_block = false;
 3940       break;
 3941     case StubId::stubgen_sha256_implCompressMB_id:
 3942       multi_block = true;
 3943       break;
 3944     default:
 3945       ShouldNotReachHere();
 3946     }
 3947 
 3948     static const uint32_t round_consts[64] = {
 3949       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3950       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3951       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3952       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3953       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3954       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3955       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3956       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3957       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3958       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3959       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3960       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3961       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3962       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3963       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3964       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3965     };
 3966 
 3967     __ align(CodeEntryAlignment);
 3968 
 3969     StubCodeMark mark(this, stub_id);
 3970     address start = __ pc();
 3971 
 3972     Register buf   = c_rarg0;
 3973     Register state = c_rarg1;
 3974     Register ofs   = c_rarg2;
 3975     Register limit = c_rarg3;
 3976 
    Label sha256_loop;
 3978 
 3979     __ stpd(v8, v9, __ pre(sp, -32));
 3980     __ stpd(v10, v11, Address(sp, 16));
 3981 
 3982 // dga == v0
 3983 // dgb == v1
 3984 // dg0 == v2
 3985 // dg1 == v3
 3986 // dg2 == v4
 3987 // t0 == v6
 3988 // t1 == v7
 3989 
 3990     // load 16 keys to v16..v31
 3991     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3992     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3993     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3994     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3995     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3996 
 3997     // load 8 words (256 bits) state
 3998     __ ldpq(v0, v1, state);
 3999 
    __ BIND(sha256_loop);
 4001     // load 64 bytes of data into v8..v11
 4002     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 4003     __ rev32(v8, __ T16B, v8);
 4004     __ rev32(v9, __ T16B, v9);
 4005     __ rev32(v10, __ T16B, v10);
 4006     __ rev32(v11, __ T16B, v11);
 4007 
 4008     __ addv(v6, __ T4S, v8, v16);
 4009     __ orr(v2, __ T16B, v0, v0);
 4010     __ orr(v3, __ T16B, v1, v1);
 4011 
 4012     FloatRegister d0 = v8;
 4013     FloatRegister d1 = v9;
 4014     FloatRegister d2 = v10;
 4015     FloatRegister d3 = v11;
 4016 
 4017 
 4018     for (int round = 0; round < 16; round++) {
 4019       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 4020       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 4021       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 4022       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 4023 
 4024       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 4025        __ orr(v4, __ T16B, v2, v2);
 4026       if (round < 15)
 4027         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 4028       __ sha256h(v2, __ T4S, v3, tmp2);
 4029       __ sha256h2(v3, __ T4S, v4, tmp2);
 4030       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 4031 
 4032       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4033     }
 4034 
 4035     __ addv(v0, __ T4S, v0, v2);
 4036     __ addv(v1, __ T4S, v1, v3);
 4037 
 4038     if (multi_block) {
 4039       __ add(ofs, ofs, 64);
 4040       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
 4042       __ mov(c_rarg0, ofs); // return ofs
 4043     }
 4044 
 4045     __ ldpd(v10, v11, Address(sp, 16));
 4046     __ ldpd(v8, v9, __ post(sp, 32));
 4047 
 4048     __ stpq(v0, v1, state);
 4049 
 4050     __ ret(lr);
 4051 
 4052     return start;
 4053   }
 4054 
 4055   // Double rounds for sha512.
 4056   void sha512_dround(int dr,
 4057                      FloatRegister vi0, FloatRegister vi1,
 4058                      FloatRegister vi2, FloatRegister vi3,
 4059                      FloatRegister vi4, FloatRegister vrc0,
 4060                      FloatRegister vrc1, FloatRegister vin0,
 4061                      FloatRegister vin1, FloatRegister vin2,
 4062                      FloatRegister vin3, FloatRegister vin4) {
 4063       if (dr < 36) {
 4064         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4065       }
 4066       __ addv(v5, __ T2D, vrc0, vin0);
 4067       __ ext(v6, __ T16B, vi2, vi3, 8);
 4068       __ ext(v5, __ T16B, v5, v5, 8);
 4069       __ ext(v7, __ T16B, vi1, vi2, 8);
 4070       __ addv(vi3, __ T2D, vi3, v5);
 4071       if (dr < 32) {
 4072         __ ext(v5, __ T16B, vin3, vin4, 8);
 4073         __ sha512su0(vin0, __ T2D, vin1);
 4074       }
 4075       __ sha512h(vi3, __ T2D, v6, v7);
 4076       if (dr < 32) {
 4077         __ sha512su1(vin0, __ T2D, vin2, v5);
 4078       }
 4079       __ addv(vi4, __ T2D, vi1, vi3);
 4080       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4081   }
 4082 
 4083   // Arguments:
 4084   //
 4085   // Inputs:
 4086   //   c_rarg0   - byte[]  source+offset
 4087   //   c_rarg1   - int[]   SHA.state
 4088   //   c_rarg2   - int     offset
 4089   //   c_rarg3   - int     limit
 4090   //
 4091   address generate_sha512_implCompress(StubId stub_id) {
 4092     bool multi_block;
 4093     switch (stub_id) {
 4094     case StubId::stubgen_sha512_implCompress_id:
 4095       multi_block = false;
 4096       break;
 4097     case StubId::stubgen_sha512_implCompressMB_id:
 4098       multi_block = true;
 4099       break;
 4100     default:
 4101       ShouldNotReachHere();
 4102     }
 4103 
 4104     static const uint64_t round_consts[80] = {
 4105       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 4106       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 4107       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 4108       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 4109       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 4110       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 4111       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 4112       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 4113       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 4114       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 4115       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 4116       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 4117       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 4118       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 4119       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 4120       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 4121       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 4122       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 4123       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 4124       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 4125       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 4126       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 4127       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 4128       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 4129       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 4130       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 4131       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 4132     };
 4133 
 4134     __ align(CodeEntryAlignment);
 4135 
 4136     StubCodeMark mark(this, stub_id);
 4137     address start = __ pc();
 4138 
 4139     Register buf   = c_rarg0;
 4140     Register state = c_rarg1;
 4141     Register ofs   = c_rarg2;
 4142     Register limit = c_rarg3;
 4143 
 4144     __ stpd(v8, v9, __ pre(sp, -64));
 4145     __ stpd(v10, v11, Address(sp, 16));
 4146     __ stpd(v12, v13, Address(sp, 32));
 4147     __ stpd(v14, v15, Address(sp, 48));
 4148 
 4149     Label sha512_loop;
 4150 
 4151     // load state
 4152     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4153 
  4154     // load the round constants for the first four double-rounds
 4155     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4156     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4157 
 4158     __ BIND(sha512_loop);
 4159     // load 128B of data into v12..v19
 4160     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4161     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4162     __ rev64(v12, __ T16B, v12);
 4163     __ rev64(v13, __ T16B, v13);
 4164     __ rev64(v14, __ T16B, v14);
 4165     __ rev64(v15, __ T16B, v15);
 4166     __ rev64(v16, __ T16B, v16);
 4167     __ rev64(v17, __ T16B, v17);
 4168     __ rev64(v18, __ T16B, v18);
 4169     __ rev64(v19, __ T16B, v19);
 4170 
 4171     __ mov(rscratch2, rscratch1);
 4172 
 4173     __ mov(v0, __ T16B, v8);
 4174     __ mov(v1, __ T16B, v9);
 4175     __ mov(v2, __ T16B, v10);
 4176     __ mov(v3, __ T16B, v11);
 4177 
 4178     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4179     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4180     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4181     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4182     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4183     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4184     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4185     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4186     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4187     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4188     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4189     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4190     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4191     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4192     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4193     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4194     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4195     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4196     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4197     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4198     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4199     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4200     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4201     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4202     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4203     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4204     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4205     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4206     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4207     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4208     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4209     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4210     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4211     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4212     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4213     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4214     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4215     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4216     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4217     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4218 
 4219     __ addv(v8, __ T2D, v8, v0);
 4220     __ addv(v9, __ T2D, v9, v1);
 4221     __ addv(v10, __ T2D, v10, v2);
 4222     __ addv(v11, __ T2D, v11, v3);
 4223 
 4224     if (multi_block) {
 4225       __ add(ofs, ofs, 128);
 4226       __ cmp(ofs, limit);
 4227       __ br(Assembler::LE, sha512_loop);
 4228       __ mov(c_rarg0, ofs); // return ofs
 4229     }
 4230 
 4231     __ st1(v8, v9, v10, v11, __ T2D, state);
 4232 
 4233     __ ldpd(v14, v15, Address(sp, 48));
 4234     __ ldpd(v12, v13, Address(sp, 32));
 4235     __ ldpd(v10, v11, Address(sp, 16));
 4236     __ ldpd(v8, v9, __ post(sp, 64));
 4237 
 4238     __ ret(lr);
 4239 
 4240     return start;
 4241   }
 4242 
 4243   // Execute one round of keccak of two computations in parallel.
 4244   // One of the states should be loaded into the lower halves of
 4245   // the vector registers v0-v24, the other should be loaded into
 4246   // the upper halves of those registers. The ld1r instruction loads
 4247   // the round constant into both halves of register v31.
 4248   // Intermediate results c0...c5 and d0...d5 are computed
 4249   // in registers v25...v30.
 4250   // All vector instructions that are used operate on both register
 4251   // halves in parallel.
  4252   // If only a single computation is needed, one can load only the lower halves.
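         // The instruction groups in the body follow the standard
         // Keccak-f[1600] round steps: the eor3/rax1 group computes the
         // theta column parities c0..c4 and the d0..d4 values, the xar
         // group applies the d-values together with the rho rotations and
         // the pi lane permutation, the bcax group is the chi step, and
         // the final eor with v31 adds the round constant (iota).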
 4253   void keccak_round(Register rscratch1) {
 4254   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  4255   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  4256   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4257   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4258   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4259   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4260   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4261   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4262   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4263   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4264 
 4265   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4266   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4267   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4268   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4269   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4270 
 4271   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4272   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  4273   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4274   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4275   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4276   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4277   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4278   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4279   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4280   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4281   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4282   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4283   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4284   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4285   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4286   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4287   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4288   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4289   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4290   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4291   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4292   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4293   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4294   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4295   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4296 
 4297   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4298   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4299   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4300   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4301   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4302 
 4303   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4304 
 4305   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4306   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4307   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4308   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4309   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4310 
 4311   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4312   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4313   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4314   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4315   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4316 
 4317   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4318   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4319   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4320   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4321   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4322 
 4323   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4324   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4325   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4326   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4327   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4328 
 4329   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4330   }
 4331 
 4332   // Arguments:
 4333   //
 4334   // Inputs:
 4335   //   c_rarg0   - byte[]  source+offset
 4336   //   c_rarg1   - byte[]  SHA.state
 4337   //   c_rarg2   - int     block_size
 4338   //   c_rarg3   - int     offset
 4339   //   c_rarg4   - int     limit
 4340   //
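         // block_size is the sponge rate r in bytes; the bit tests below
         // distinguish the supported variants: 72 (SHA3-512), 104 (SHA3-384),
         // 136 (SHA3-256/SHAKE256), 144 (SHA3-224) and 168 (SHAKE128).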
 4341   address generate_sha3_implCompress(StubId stub_id) {
 4342     bool multi_block;
 4343     switch (stub_id) {
 4344     case StubId::stubgen_sha3_implCompress_id:
 4345       multi_block = false;
 4346       break;
 4347     case StubId::stubgen_sha3_implCompressMB_id:
 4348       multi_block = true;
 4349       break;
 4350     default:
 4351       ShouldNotReachHere();
 4352     }
 4353 
 4354     static const uint64_t round_consts[24] = {
 4355       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4356       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4357       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4358       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4359       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4360       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4361       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4362       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4363     };
 4364 
 4365     __ align(CodeEntryAlignment);
 4366 
 4367     StubCodeMark mark(this, stub_id);
 4368     address start = __ pc();
 4369 
 4370     Register buf           = c_rarg0;
 4371     Register state         = c_rarg1;
 4372     Register block_size    = c_rarg2;
 4373     Register ofs           = c_rarg3;
 4374     Register limit         = c_rarg4;
 4375 
 4376     Label sha3_loop, rounds24_loop;
 4377     Label sha3_512_or_sha3_384, shake128;
 4378 
 4379     __ stpd(v8, v9, __ pre(sp, -64));
 4380     __ stpd(v10, v11, Address(sp, 16));
 4381     __ stpd(v12, v13, Address(sp, 32));
 4382     __ stpd(v14, v15, Address(sp, 48));
 4383 
 4384     // load state
 4385     __ add(rscratch1, state, 32);
 4386     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4387     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4388     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4389     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4390     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4391     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4392     __ ld1(v24, __ T1D, rscratch1);
 4393 
 4394     __ BIND(sha3_loop);
 4395 
 4396     // 24 keccak rounds
 4397     __ movw(rscratch2, 24);
 4398 
 4399     // load round_constants base
 4400     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4401 
 4402     // load input
 4403     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4404     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4405     __ eor(v0, __ T8B, v0, v25);
 4406     __ eor(v1, __ T8B, v1, v26);
 4407     __ eor(v2, __ T8B, v2, v27);
 4408     __ eor(v3, __ T8B, v3, v28);
 4409     __ eor(v4, __ T8B, v4, v29);
 4410     __ eor(v5, __ T8B, v5, v30);
 4411     __ eor(v6, __ T8B, v6, v31);
 4412 
 4413     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4414     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4415 
 4416     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4417     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4418     __ eor(v7, __ T8B, v7, v25);
 4419     __ eor(v8, __ T8B, v8, v26);
 4420     __ eor(v9, __ T8B, v9, v27);
 4421     __ eor(v10, __ T8B, v10, v28);
 4422     __ eor(v11, __ T8B, v11, v29);
 4423     __ eor(v12, __ T8B, v12, v30);
 4424     __ eor(v13, __ T8B, v13, v31);
 4425 
 4426     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4427     __ eor(v14, __ T8B, v14, v25);
 4428     __ eor(v15, __ T8B, v15, v26);
 4429     __ eor(v16, __ T8B, v16, v27);
 4430 
 4431     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4432     __ andw(c_rarg5, block_size, 48);
 4433     __ cbzw(c_rarg5, rounds24_loop);
 4434 
 4435     __ tbnz(block_size, 5, shake128);
 4436     // block_size == 144, bit5 == 0, SHA3-224
 4437     __ ldrd(v28, __ post(buf, 8));
 4438     __ eor(v17, __ T8B, v17, v28);
 4439     __ b(rounds24_loop);
 4440 
 4441     __ BIND(shake128);
 4442     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4443     __ eor(v17, __ T8B, v17, v28);
 4444     __ eor(v18, __ T8B, v18, v29);
 4445     __ eor(v19, __ T8B, v19, v30);
 4446     __ eor(v20, __ T8B, v20, v31);
 4447     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4448 
 4449     __ BIND(sha3_512_or_sha3_384);
 4450     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4451     __ eor(v7, __ T8B, v7, v25);
 4452     __ eor(v8, __ T8B, v8, v26);
 4453     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4454 
 4455     // SHA3-384
 4456     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4457     __ eor(v9,  __ T8B, v9,  v27);
 4458     __ eor(v10, __ T8B, v10, v28);
 4459     __ eor(v11, __ T8B, v11, v29);
 4460     __ eor(v12, __ T8B, v12, v30);
 4461 
 4462     __ BIND(rounds24_loop);
 4463     __ subw(rscratch2, rscratch2, 1);
 4464 
 4465     keccak_round(rscratch1);
 4466 
 4467     __ cbnzw(rscratch2, rounds24_loop);
 4468 
 4469     if (multi_block) {
 4470       __ add(ofs, ofs, block_size);
 4471       __ cmp(ofs, limit);
 4472       __ br(Assembler::LE, sha3_loop);
 4473       __ mov(c_rarg0, ofs); // return ofs
 4474     }
 4475 
 4476     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4477     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4478     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4479     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4480     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4481     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4482     __ st1(v24, __ T1D, state);
 4483 
 4484     // restore callee-saved registers
 4485     __ ldpd(v14, v15, Address(sp, 48));
 4486     __ ldpd(v12, v13, Address(sp, 32));
 4487     __ ldpd(v10, v11, Address(sp, 16));
 4488     __ ldpd(v8, v9, __ post(sp, 64));
 4489 
 4490     __ ret(lr);
 4491 
 4492     return start;
 4493   }
 4494 
 4495   // Inputs:
 4496   //   c_rarg0   - long[]  state0
 4497   //   c_rarg1   - long[]  state1
 4498   address generate_double_keccak() {
 4499     static const uint64_t round_consts[24] = {
 4500       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4501       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4502       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4503       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4504       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4505       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4506       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4507       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4508     };
 4509 
 4510     // Implements the double_keccak() method of the
  4511     // sun.security.provider.SHA3Parallel class
 4512     __ align(CodeEntryAlignment);
 4513     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4514     address start = __ pc();
 4515     __ enter();
 4516 
 4517     Register state0        = c_rarg0;
 4518     Register state1        = c_rarg1;
 4519 
 4520     Label rounds24_loop;
 4521 
 4522     // save callee-saved registers
 4523     __ stpd(v8, v9, __ pre(sp, -64));
 4524     __ stpd(v10, v11, Address(sp, 16));
 4525     __ stpd(v12, v13, Address(sp, 32));
 4526     __ stpd(v14, v15, Address(sp, 48));
 4527 
 4528     // load states
 4529     __ add(rscratch1, state0, 32);
 4530     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4531     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4532     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4533     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4534     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4535     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4536     __ ld1(v24, __ D, 0, rscratch1);
 4537     __ add(rscratch1, state1, 32);
 4538     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4539     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4540     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4541     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4542     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4543     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4544     __ ld1(v24, __ D, 1, rscratch1);
 4545 
 4546     // 24 keccak rounds
 4547     __ movw(rscratch2, 24);
 4548 
 4549     // load round_constants base
 4550     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4551 
 4552     __ BIND(rounds24_loop);
 4553     __ subw(rscratch2, rscratch2, 1);
 4554     keccak_round(rscratch1);
 4555     __ cbnzw(rscratch2, rounds24_loop);
 4556 
 4557     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4558     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4559     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4560     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4561     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4562     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4563     __ st1(v24, __ D, 0, state0);
 4564     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4565     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4566     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4567     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4568     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4569     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4570     __ st1(v24, __ D, 1, state1);
 4571 
 4572     // restore callee-saved vector registers
 4573     __ ldpd(v14, v15, Address(sp, 48));
 4574     __ ldpd(v12, v13, Address(sp, 32));
 4575     __ ldpd(v10, v11, Address(sp, 16));
 4576     __ ldpd(v8, v9, __ post(sp, 64));
 4577 
 4578     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4579     __ mov(r0, zr); // return 0
 4580     __ ret(lr);
 4581 
 4582     return start;
 4583   }
 4584 
 4585   // ChaCha20 block function.  This version parallelizes the 32-bit
 4586   // state elements on each of 16 vectors, producing 4 blocks of
 4587   // keystream at a time.
 4588   //
 4589   // state (int[16]) = c_rarg0
 4590   // keystream (byte[256]) = c_rarg1
 4591   // return - number of bytes of produced keystream (always 256)
 4592   //
 4593   // This implementation takes each 32-bit integer from the state
 4594   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4595   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4596   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4597   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4598   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4599   // operations represented by one QUARTERROUND function, we instead stack all
 4600   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4601   // and then do the same for the second set of 4 quarter rounds.  This removes
 4602   // some latency that would otherwise be incurred by waiting for an add to
 4603   // complete before performing an xor (which depends on the result of the
 4604   // add), etc. An adjustment happens between the first and second groups of 4
 4605   // quarter rounds, but this is done only in the inputs to the macro functions
 4606   // that generate the assembly instructions - these adjustments themselves are
 4607   // not part of the resulting assembly.
 4608   // The 4 registers v0-v3 are used during the quarter round operations as
 4609   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4610   // registers become the vectors involved in adding the start state back onto
 4611   // the post-QR working state.  After the adds are complete, each of the 16
 4612   // vectors write their first lane back to the keystream buffer, followed
 4613   // by the second lane from all vectors and so on.
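         //
         // For reference, one ChaCha20 quarter round QUARTERROUND(a, b, c, d)
         // as defined in RFC 7539 is:
         //   a += b; d ^= a; d <<<= 16;
         //   c += d; b ^= c; b <<<= 12;
         //   a += b; d ^= a; d <<<=  8;
         //   c += d; b ^= c; b <<<=  7;
         // The cc20_qr_* calls below perform each of these steps across four
         // quarter rounds at a time.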
 4614   address generate_chacha20Block_blockpar() {
 4615     Label L_twoRounds, L_cc20_const;
 4616     __ align(CodeEntryAlignment);
 4617     StubId stub_id = StubId::stubgen_chacha20Block_id;
 4618     StubCodeMark mark(this, stub_id);
 4619     address start = __ pc();
 4620     __ enter();
 4621 
 4622     int i, j;
 4623     const Register state = c_rarg0;
 4624     const Register keystream = c_rarg1;
 4625     const Register loopCtr = r10;
 4626     const Register tmpAddr = r11;
 4627     const FloatRegister ctrAddOverlay = v28;
 4628     const FloatRegister lrot8Tbl = v29;
 4629 
 4630     // Organize SIMD registers in an array that facilitates
 4631     // putting repetitive opcodes into loop structures.  It is
 4632     // important that each grouping of 4 registers is monotonically
 4633     // increasing to support the requirements of multi-register
 4634     // instructions (e.g. ld4r, st4, etc.)
 4635     const FloatRegister workSt[16] = {
 4636          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4637         v20, v21, v22, v23, v24, v25, v26, v27
 4638     };
 4639 
 4640     // Pull in constant data.  The first 16 bytes are the add overlay
 4641     // which is applied to the vector holding the counter (state[12]).
  4642     // The second 16 bytes are the index vector for the 8-bit left
 4643     // rotation tbl instruction.
 4644     __ adr(tmpAddr, L_cc20_const);
 4645     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4646 
 4647     // Load from memory and interlace across 16 SIMD registers,
  4648     // with each word from memory being broadcast to all lanes of
 4649     // each successive SIMD register.
 4650     //      Addr(0) -> All lanes in workSt[i]
 4651     //      Addr(4) -> All lanes workSt[i + 1], etc.
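           // After this loop workSt[12] holds the 32-bit block counter
           // replicated in all 4 lanes; adding ctrAddOverlay (+0/+1/+2/+3)
           // below gives each lane the counter value of its own keystream block.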
 4652     __ mov(tmpAddr, state);
 4653     for (i = 0; i < 16; i += 4) {
 4654       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4655           __ post(tmpAddr, 16));
 4656     }
 4657     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4658 
 4659     // Before entering the loop, create 5 4-register arrays.  These
 4660     // will hold the 4 registers that represent the a/b/c/d fields
 4661     // in the quarter round operation.  For instance the "b" field
 4662     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4663     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4664     // since it is part of a diagonal organization.  The aSet and scratch
 4665     // register sets are defined at declaration time because they do not change
 4666     // organization at any point during the 20-round processing.
 4667     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4668     FloatRegister bSet[4];
 4669     FloatRegister cSet[4];
 4670     FloatRegister dSet[4];
 4671     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4672 
 4673     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4674     __ mov(loopCtr, 10);
 4675     __ BIND(L_twoRounds);
 4676 
 4677     // Set to columnar organization and do the following 4 quarter-rounds:
 4678     // QUARTERROUND(0, 4, 8, 12)
 4679     // QUARTERROUND(1, 5, 9, 13)
 4680     // QUARTERROUND(2, 6, 10, 14)
 4681     // QUARTERROUND(3, 7, 11, 15)
 4682     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4683     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4684     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4685 
 4686     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4687     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4688     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4689 
 4690     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4691     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4692     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4693 
 4694     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4695     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4696     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4697 
 4698     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4699     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
  4700     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4701 
 4702     // Set to diagonal organization and do the next 4 quarter-rounds:
 4703     // QUARTERROUND(0, 5, 10, 15)
 4704     // QUARTERROUND(1, 6, 11, 12)
 4705     // QUARTERROUND(2, 7, 8, 13)
 4706     // QUARTERROUND(3, 4, 9, 14)
 4707     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4708     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4709     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4710 
 4711     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4712     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4713     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4714 
 4715     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4716     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4717     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4718 
 4719     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4720     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4721     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4722 
 4723     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4724     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
  4725     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4726 
 4727     // Decrement and iterate
 4728     __ sub(loopCtr, loopCtr, 1);
 4729     __ cbnz(loopCtr, L_twoRounds);
 4730 
 4731     __ mov(tmpAddr, state);
 4732 
 4733     // Add the starting state back to the post-loop keystream
 4734     // state.  We read/interlace the state array from memory into
 4735     // 4 registers similar to what we did in the beginning.  Then
 4736     // add the counter overlay onto workSt[12] at the end.
 4737     for (i = 0; i < 16; i += 4) {
 4738       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4739       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4740       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4741       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4742       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4743     }
 4744     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4745 
 4746     // Write working state into the keystream buffer.  This is accomplished
 4747     // by taking the lane "i" from each of the four vectors and writing
 4748     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4749     // repeating with the next 4 vectors until all 16 vectors have been used.
 4750     // Then move to the next lane and repeat the process until all lanes have
 4751     // been written.
 4752     for (i = 0; i < 4; i++) {
 4753       for (j = 0; j < 16; j += 4) {
 4754         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4755             __ post(keystream, 16));
 4756       }
 4757     }
 4758 
 4759     __ mov(r0, 256);             // Return length of output keystream
 4760     __ leave();
 4761     __ ret(lr);
 4762 
 4763     // bind label and generate local constant data used by this stub
 4764     // The constant data is broken into two 128-bit segments to be loaded
 4765     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4766     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
  4767     // The second 128 bits are a table constant used for 8-bit left rotations.
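           // Read as tbl indices the second segment is
           // [3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14], i.e. each byte of a
           // 32-bit little-endian lane is sourced from the byte one position
           // below it, which amounts to a left rotation by 8 bits per lane.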
 4768     __ BIND(L_cc20_const);
 4769     __ emit_int64(0x0000000100000000UL);
 4770     __ emit_int64(0x0000000300000002UL);
 4771     __ emit_int64(0x0605040702010003UL);
 4772     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4773 
 4774     return start;
 4775   }
 4776 
 4777   // Helpers to schedule parallel operation bundles across vector
 4778   // register sequences of size 2, 4 or 8.
 4779 
 4780   // Implement various primitive computations across vector sequences
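         // For example, given VSeq<4> va(0), vb(4) and vc(8) (which, with the
         // default register delta, name v0..v3, v4..v7 and v8..v11), a call
         // like vs_addv(va, __ T8H, vb, vc) just emits the four instructions
         // addv(v0, T8H, v4, v8) ... addv(v3, T8H, v7, v11).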
 4781 
 4782   template<int N>
 4783   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4784                const VSeq<N>& v1, const VSeq<N>& v2) {
 4785     // output must not be constant
 4786     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4787     // output cannot overwrite pending inputs
 4788     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4789     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4790     for (int i = 0; i < N; i++) {
 4791       __ addv(v[i], T, v1[i], v2[i]);
 4792     }
 4793   }
 4794 
 4795   template<int N>
 4796   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4797                const VSeq<N>& v1, const VSeq<N>& v2) {
 4798     // output must not be constant
 4799     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4800     // output cannot overwrite pending inputs
 4801     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4802     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4803     for (int i = 0; i < N; i++) {
 4804       __ subv(v[i], T, v1[i], v2[i]);
 4805     }
 4806   }
 4807 
 4808   template<int N>
 4809   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4810                const VSeq<N>& v1, const VSeq<N>& v2) {
 4811     // output must not be constant
 4812     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4813     // output cannot overwrite pending inputs
 4814     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4815     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4816     for (int i = 0; i < N; i++) {
 4817       __ mulv(v[i], T, v1[i], v2[i]);
 4818     }
 4819   }
 4820 
 4821   template<int N>
 4822   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4823     // output must not be constant
 4824     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4825     // output cannot overwrite pending inputs
 4826     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4827     for (int i = 0; i < N; i++) {
 4828       __ negr(v[i], T, v1[i]);
 4829     }
 4830   }
 4831 
 4832   template<int N>
 4833   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4834                const VSeq<N>& v1, int shift) {
 4835     // output must not be constant
 4836     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4837     // output cannot overwrite pending inputs
 4838     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4839     for (int i = 0; i < N; i++) {
 4840       __ sshr(v[i], T, v1[i], shift);
 4841     }
 4842   }
 4843 
 4844   template<int N>
 4845   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4846     // output must not be constant
 4847     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4848     // output cannot overwrite pending inputs
 4849     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4850     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4851     for (int i = 0; i < N; i++) {
 4852       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4853     }
 4854   }
 4855 
 4856   template<int N>
 4857   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4858     // output must not be constant
 4859     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4860     // output cannot overwrite pending inputs
 4861     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4862     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4863     for (int i = 0; i < N; i++) {
 4864       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4865     }
 4866   }
 4867 
 4868   template<int N>
 4869   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4870     // output must not be constant
 4871     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4872     // output cannot overwrite pending inputs
 4873     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4874     for (int i = 0; i < N; i++) {
 4875       __ notr(v[i], __ T16B, v1[i]);
 4876     }
 4877   }
 4878 
 4879   template<int N>
 4880   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4881     // output must not be constant
 4882     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4883     // output cannot overwrite pending inputs
 4884     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4885     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4886     for (int i = 0; i < N; i++) {
 4887       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4888     }
 4889   }
 4890 
 4891   template<int N>
 4892   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4893     // output must not be constant
 4894     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4895     // output cannot overwrite pending inputs
 4896     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4897     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4898     for (int i = 0; i < N; i++) {
 4899       __ mlsv(v[i], T, v1[i], v2[i]);
 4900     }
 4901   }
 4902 
 4903   // load N/2 successive pairs of quadword values from memory in order
 4904   // into N successive vector registers of the sequence via the
 4905   // address supplied in base.
 4906   template<int N>
 4907   void vs_ldpq(const VSeq<N>& v, Register base) {
 4908     for (int i = 0; i < N; i += 2) {
 4909       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4910     }
 4911   }
 4912 
 4913   // load N/2 successive pairs of quadword values from memory in order
 4914   // into N vector registers of the sequence via the address supplied
 4915   // in base using post-increment addressing
 4916   template<int N>
 4917   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4918     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4919     for (int i = 0; i < N; i += 2) {
 4920       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4921     }
 4922   }
 4923 
 4924   // store N successive vector registers of the sequence into N/2
 4925   // successive pairs of quadword memory locations via the address
 4926   // supplied in base using post-increment addressing
 4927   template<int N>
 4928   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4929     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4930     for (int i = 0; i < N; i += 2) {
 4931       __ stpq(v[i], v[i+1], __ post(base, 32));
 4932     }
 4933   }
 4934 
 4935   // load N/2 pairs of quadword values from memory de-interleaved into
 4936   // N vector registers 2 at a time via the address supplied in base
 4937   // using post-increment addressing.
 4938   template<int N>
 4939   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4940     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4941     for (int i = 0; i < N; i += 2) {
 4942       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4943     }
 4944   }
 4945 
 4946   // store N vector registers interleaved into N/2 pairs of quadword
 4947   // memory locations via the address supplied in base using
 4948   // post-increment addressing.
 4949   template<int N>
 4950   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4951     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4952     for (int i = 0; i < N; i += 2) {
 4953       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4954     }
 4955   }
 4956 
 4957   // load N quadword values from memory de-interleaved into N vector
 4958   // registers 3 elements at a time via the address supplied in base.
 4959   template<int N>
 4960   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4961     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4962     for (int i = 0; i < N; i += 3) {
 4963       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4964     }
 4965   }
 4966 
 4967   // load N quadword values from memory de-interleaved into N vector
 4968   // registers 3 elements at a time via the address supplied in base
 4969   // using post-increment addressing.
 4970   template<int N>
 4971   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4972     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4973     for (int i = 0; i < N; i += 3) {
 4974       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4975     }
 4976   }
 4977 
 4978   // load N/2 pairs of quadword values from memory into N vector
 4979   // registers via the address supplied in base with each pair indexed
  4980   // using the start offset plus the corresponding entry in the
 4981   // offsets array
 4982   template<int N>
 4983   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4984     for (int i = 0; i < N/2; i++) {
 4985       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4986     }
 4987   }
 4988 
 4989   // store N vector registers into N/2 pairs of quadword memory
 4990   // locations via the address supplied in base with each pair indexed
  4991   // using the start offset plus the corresponding entry in the
 4992   // offsets array
 4993   template<int N>
 4994   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 4995     for (int i = 0; i < N/2; i++) {
 4996       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4997     }
 4998   }
 4999 
 5000   // load N single quadword values from memory into N vector registers
 5001   // via the address supplied in base with each value indexed using
  5002   // the start offset plus the corresponding entry in the offsets
 5003   // array
 5004   template<int N>
 5005   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5006                       int start, int (&offsets)[N]) {
 5007     for (int i = 0; i < N; i++) {
 5008       __ ldr(v[i], T, Address(base, start + offsets[i]));
 5009     }
 5010   }
 5011 
 5012   // store N vector registers into N single quadword memory locations
 5013   // via the address supplied in base with each value indexed using
  5014   // the start offset plus the corresponding entry in the offsets
 5015   // array
 5016   template<int N>
 5017   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5018                       int start, int (&offsets)[N]) {
 5019     for (int i = 0; i < N; i++) {
 5020       __ str(v[i], T, Address(base, start + offsets[i]));
 5021     }
 5022   }
 5023 
 5024   // load N/2 pairs of quadword values from memory de-interleaved into
 5025   // N vector registers 2 at a time via the address supplied in base
  5026   // with each pair indexed using the start offset plus the
 5027   // corresponding entry in the offsets array
 5028   template<int N>
 5029   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5030                       Register tmp, int start, int (&offsets)[N/2]) {
 5031     for (int i = 0; i < N/2; i++) {
 5032       __ add(tmp, base, start + offsets[i]);
 5033       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5034     }
 5035   }
 5036 
 5037   // store N vector registers 2 at a time interleaved into N/2 pairs
 5038   // of quadword memory locations via the address supplied in base
  5039   // with each pair indexed using the start offset plus the
 5040   // corresponding entry in the offsets array
 5041   template<int N>
 5042   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5043                       Register tmp, int start, int (&offsets)[N/2]) {
 5044     for (int i = 0; i < N/2; i++) {
 5045       __ add(tmp, base, start + offsets[i]);
 5046       __ st2(v[2*i], v[2*i+1], T, tmp);
 5047     }
 5048   }
 5049 
 5050   // Helper routines for various flavours of Montgomery multiply
 5051 
 5052   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 5053   // multiplications in parallel
 5054   //
 5055 
 5056   // See the montMul() method of the sun.security.provider.ML_DSA
 5057   // class.
 5058   //
  5059   // Computes 4x4S results or 4x8H results
 5060   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5061   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5062   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5063   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5064   // Outputs: va - 4x4S or 4x8H vector register sequences
 5065   // vb, vc, vtmp and vq must all be disjoint
 5066   // va must be disjoint from all other inputs/temps or must equal vc
 5067   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5068   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
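         // n.b. sqdmulh returns the high half of the doubled product 2 * x * y,
         // so aHigh and n below each carry an extra factor of two; that factor
         // is cancelled by the final shsubv (signed halving subtract), which
         // computes (aHigh - n) / 2.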
 5069   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5070                    Assembler::SIMD_Arrangement T,
 5071                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5072     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5073     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5074     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5075     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5076 
 5077     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5078     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5079 
 5080     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5081 
 5082     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5083     assert(vs_disjoint(va, vb), "va and vb overlap");
 5084     assert(vs_disjoint(va, vq), "va and vq overlap");
 5085     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5086     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5087 
 5088     // schedule 4 streams of instructions across the vector sequences
 5089     for (int i = 0; i < 4; i++) {
 5090       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5091       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5092     }
 5093 
 5094     for (int i = 0; i < 4; i++) {
 5095       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5096     }
 5097 
 5098     for (int i = 0; i < 4; i++) {
 5099       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5100     }
 5101 
 5102     for (int i = 0; i < 4; i++) {
 5103       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5104     }
 5105   }
 5106 
  5107   // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
 5108   // multiplications in parallel
 5109   //
 5110 
 5111   // See the montMul() method of the sun.security.provider.ML_DSA
 5112   // class.
 5113   //
  5114   // Computes 2x4S results or 2x8H results
  5115   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  5116   // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  5117   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  5118   // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  5119   // Outputs: va - 2x4S or 2x8H vector register sequences
 5120   // vb, vc, vtmp and vq must all be disjoint
 5121   // va must be disjoint from all other inputs/temps or must equal vc
 5122   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5123   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5124   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5125                    Assembler::SIMD_Arrangement T,
 5126                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5127     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5128     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5129     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5130     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5131 
 5132     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5133     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5134 
 5135     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5136 
 5137     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5138     assert(vs_disjoint(va, vb), "va and vb overlap");
 5139     assert(vs_disjoint(va, vq), "va and vq overlap");
 5140     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5141     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5142 
 5143     // schedule 2 streams of instructions across the vector sequences
 5144     for (int i = 0; i < 2; i++) {
 5145       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5146       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5147     }
 5148 
 5149     for (int i = 0; i < 2; i++) {
 5150       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5151     }
 5152 
 5153     for (int i = 0; i < 2; i++) {
 5154       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5155     }
 5156 
 5157     for (int i = 0; i < 2; i++) {
 5158       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5159     }
 5160   }
 5161 
 5162   // Perform 16 16-bit Montgomery multiplications in parallel.
 5163   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5164                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5165     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5166     // It will assert that the register use is valid
 5167     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5168   }
 5169 
 5170   // Perform 32 16-bit Montgomery multiplications in parallel.
 5171   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5172                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5173     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5174     // It will assert that the register use is valid
 5175     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5176   }
 5177 
 5178   // Perform 64 16-bit Montgomery multiplications in parallel.
 5179   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5180                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5181     // Schedule two successive 4x8H multiplies via the montmul helper
 5182     // on the front and back halves of va, vb and vc. The helper will
 5183     // assert that the register use has no overlap conflicts on each
 5184     // individual call but we also need to ensure that the necessary
 5185     // disjoint/equality constraints are met across both calls.
 5186 
 5187     // vb, vc, vtmp and vq must be disjoint. va must either be
 5188     // disjoint from all other registers or equal vc
 5189 
 5190     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5191     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5192     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5193 
 5194     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5195     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5196 
 5197     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5198 
 5199     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5200     assert(vs_disjoint(va, vb), "va and vb overlap");
 5201     assert(vs_disjoint(va, vq), "va and vq overlap");
 5202     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5203 
 5204     // we multiply the front and back halves of each sequence 4 at a
 5205     // time because
 5206     //
 5207     // 1) we are currently only able to get 4-way instruction
 5208     // parallelism at best
 5209     //
 5210     // 2) we need registers for the constants in vq and temporary
 5211     // scratch registers to hold intermediate results so vtmp can only
 5212     // be a VSeq<4> which means we only have 4 scratch slots
 5213 
 5214     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5215     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5216   }
 5217 
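         // Computes a 32-way (4x8H) butterfly: a = montmul(a1, c), then
         // a1 = a0 - a and a0 = a0 + a (a Cooley-Tukey style NTT butterfly).
         // Note that vc doubles as the montmul output, so its original
         // contents are clobbered.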
 5218   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5219                                const VSeq<4>& vc,
 5220                                const VSeq<4>& vtmp,
 5221                                const VSeq<2>& vq) {
 5222     // compute a = montmul(a1, c)
 5223     kyber_montmul32(vc, va1, vc, vtmp, vq);
  5224     // output a1 = a0 - a
 5225     vs_subv(va1, __ T8H, va0, vc);
 5226     //    and a0 = a0 + a
 5227     vs_addv(va0, __ T8H, va0, vc);
 5228   }
 5229 
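         // Computes the reverse-direction (Gentleman-Sande style) butterfly:
         // c = a0 - a1 and a0 = a0 + a1 are formed first, then a1 = montmul(c, b).
         // vtmp1 receives the difference c and vtmp2 is the scratch sequence
         // for the multiply.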
 5230   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5231                                const VSeq<4>& vb,
 5232                                const VSeq<4>& vtmp1,
 5233                                const VSeq<4>& vtmp2,
 5234                                const VSeq<2>& vq) {
 5235     // compute c = a0 - a1
 5236     vs_subv(vtmp1, __ T8H, va0, va1);
 5237     // output a0 = a0 + a1
 5238     vs_addv(va0, __ T8H, va0, va1);
 5239     // output a1 = b montmul c
 5240     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5241   }
 5242 
 5243   void load64shorts(const VSeq<8>& v, Register shorts) {
 5244     vs_ldpq_post(v, shorts);
 5245   }
 5246 
 5247   void load32shorts(const VSeq<4>& v, Register shorts) {
 5248     vs_ldpq_post(v, shorts);
 5249   }
 5250 
 5251   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5252     vs_stpq_post(v, tmpAddr);
 5253   }
 5254 
 5255   // Kyber NTT function.
 5256   // Implements
 5257   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5258   //
 5259   // coeffs (short[256]) = c_rarg0
 5260   // ntt_zetas (short[256]) = c_rarg1
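         //
         // All offsets passed to the load/store helpers below are byte offsets
         // into the 512-byte coefficient array, so, for example, an offset of
         // 256 selects the upper half (coefficients 128..255).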
 5261   address generate_kyberNtt() {
 5262 
 5263     __ align(CodeEntryAlignment);
 5264     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5265     StubCodeMark mark(this, stub_id);
 5266     address start = __ pc();
 5267     __ enter();
 5268 
 5269     const Register coeffs = c_rarg0;
 5270     const Register zetas = c_rarg1;
 5271 
 5272     const Register kyberConsts = r10;
 5273     const Register tmpAddr = r11;
 5274 
 5275     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5276     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5277     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5278 
 5279     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5280     // load the montmul constants
 5281     vs_ldpq(vq, kyberConsts);
 5282 
 5283     // Each level corresponds to an iteration of the outermost loop of the
 5284     // Java method seilerNTT(int[] coeffs). There are some differences
 5285     // from what is done in the seilerNTT() method, though:
  5286     // 1. The computation uses 16-bit signed values; we do not convert them
  5287     // to ints here.
  5288     // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
  5289     // this array for each level, which makes it easier to fill up the vector
  5290     // registers.
  5291     // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
  5292     // multiplications (that way there should not be any overflow during the
  5293     // inverse NTT computation), whereas here we use R = 2^16 so that we can
  5294     // use the 16-bit arithmetic in the vector unit.
 5295     //
 5296     // On each level, we fill up the vector registers in such a way that the
 5297     // array elements that need to be multiplied by the zetas go into one
 5298     // set of vector registers while the corresponding ones that don't need to
 5299     // be multiplied, go into another set.
 5300     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5301     // registers interleaving the steps of 4 identical computations,
 5302     // each done on 8 16-bit values per register.
 5303 
 5304     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5305     // to the zetas occur in discrete blocks whose size is some multiple
 5306     // of 32.
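           // Concretely, the butterfly distance halves at each level: the
           // operands that get multiplied by zetas sit 256 bytes (128
           // coefficients) above their partners at level 0, 128 bytes above
           // at level 1, 64 bytes above at level 2 and 32 bytes above at
           // level 3.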
 5307 
 5308     // level 0
 5309     __ add(tmpAddr, coeffs, 256);
 5310     load64shorts(vs1, tmpAddr);
 5311     load64shorts(vs2, zetas);
 5312     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5313     __ add(tmpAddr, coeffs, 0);
 5314     load64shorts(vs1, tmpAddr);
 5315     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5316     vs_addv(vs1, __ T8H, vs1, vs2);
 5317     __ add(tmpAddr, coeffs, 0);
 5318     vs_stpq_post(vs1, tmpAddr);
 5319     __ add(tmpAddr, coeffs, 256);
 5320     vs_stpq_post(vs3, tmpAddr);
 5321     // restore montmul constants
 5322     vs_ldpq(vq, kyberConsts);
 5323     load64shorts(vs1, tmpAddr);
 5324     load64shorts(vs2, zetas);
 5325     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5326     __ add(tmpAddr, coeffs, 128);
 5327     load64shorts(vs1, tmpAddr);
 5328     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5329     vs_addv(vs1, __ T8H, vs1, vs2);
 5330     __ add(tmpAddr, coeffs, 128);
 5331     store64shorts(vs1, tmpAddr);
 5332     __ add(tmpAddr, coeffs, 384);
 5333     store64shorts(vs3, tmpAddr);
 5334 
 5335     // level 1
 5336     // restore montmul constants
 5337     vs_ldpq(vq, kyberConsts);
 5338     __ add(tmpAddr, coeffs, 128);
 5339     load64shorts(vs1, tmpAddr);
 5340     load64shorts(vs2, zetas);
 5341     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5342     __ add(tmpAddr, coeffs, 0);
 5343     load64shorts(vs1, tmpAddr);
 5344     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5345     vs_addv(vs1, __ T8H, vs1, vs2);
 5346     __ add(tmpAddr, coeffs, 0);
 5347     store64shorts(vs1, tmpAddr);
 5348     store64shorts(vs3, tmpAddr);
 5349     vs_ldpq(vq, kyberConsts);
 5350     __ add(tmpAddr, coeffs, 384);
 5351     load64shorts(vs1, tmpAddr);
 5352     load64shorts(vs2, zetas);
 5353     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5354     __ add(tmpAddr, coeffs, 256);
 5355     load64shorts(vs1, tmpAddr);
 5356     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5357     vs_addv(vs1, __ T8H, vs1, vs2);
 5358     __ add(tmpAddr, coeffs, 256);
 5359     store64shorts(vs1, tmpAddr);
 5360     store64shorts(vs3, tmpAddr);
 5361 
 5362     // level 2
 5363     vs_ldpq(vq, kyberConsts);
 5364     int offsets1[4] = { 0, 32, 128, 160 };
 5365     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5366     load64shorts(vs2, zetas);
 5367     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5368     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5370     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5371     vs_addv(vs1, __ T8H, vs1, vs2);
 5372     __ add(tmpAddr, coeffs, 0);
 5373     vs_stpq_post(vs_front(vs1), tmpAddr);
 5374     vs_stpq_post(vs_front(vs3), tmpAddr);
 5375     vs_stpq_post(vs_back(vs1), tmpAddr);
 5376     vs_stpq_post(vs_back(vs3), tmpAddr);
 5377     vs_ldpq(vq, kyberConsts);
 5378     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5379     load64shorts(vs2, zetas);
 5380     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5381     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5383     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5384     vs_addv(vs1, __ T8H, vs1, vs2);
 5385     __ add(tmpAddr, coeffs, 256);
 5386     vs_stpq_post(vs_front(vs1), tmpAddr);
 5387     vs_stpq_post(vs_front(vs3), tmpAddr);
 5388     vs_stpq_post(vs_back(vs1), tmpAddr);
 5389     vs_stpq_post(vs_back(vs3), tmpAddr);
 5390 
 5391     // level 3
 5392     vs_ldpq(vq, kyberConsts);
 5393     int offsets2[4] = { 0, 64, 128, 192 };
 5394     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5395     load64shorts(vs2, zetas);
 5396     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5397     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5398     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5399     vs_addv(vs1, __ T8H, vs1, vs2);
 5400     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5401     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5402 
 5403     vs_ldpq(vq, kyberConsts);
 5404     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5405     load64shorts(vs2, zetas);
 5406     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5407     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5408     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5409     vs_addv(vs1, __ T8H, vs1, vs2);
 5410     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5411     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5412 
 5413     // level 4
    // At level 4 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
 5416 
 5417     vs_ldpq(vq, kyberConsts);
 5418     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5419     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5420     load64shorts(vs2, zetas);
 5421     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5422     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5423     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5424     vs_addv(vs1, __ T8H, vs1, vs2);
 5425     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5426     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5427 
 5428     vs_ldpq(vq, kyberConsts);
 5429     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5430     load64shorts(vs2, zetas);
 5431     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5432     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5433     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5434     vs_addv(vs1, __ T8H, vs1, vs2);
 5435     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5436     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5437 
 5438     // level 5
    // At level 5 related coefficients occur in discrete blocks of size 8, so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 2D.
 5441 
 5442     vs_ldpq(vq, kyberConsts);
 5443     int offsets4[4] = { 0, 32, 64, 96 };
 5444     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5445     load32shorts(vs_front(vs2), zetas);
 5446     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5447     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5448     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5449     load32shorts(vs_front(vs2), zetas);
 5450     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5451     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5452     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5453     load32shorts(vs_front(vs2), zetas);
 5454     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5455     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5456 
 5457     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5458     load32shorts(vs_front(vs2), zetas);
 5459     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5460     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5461 
 5462     // level 6
    // At level 6 related coefficients occur in discrete blocks of size 4, so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 4S.
 5465 
 5466     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5467     load32shorts(vs_front(vs2), zetas);
 5468     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5469     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5470     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5472     load32shorts(vs_front(vs2), zetas);
 5473     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5474     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5475 
 5476     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5477     load32shorts(vs_front(vs2), zetas);
 5478     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5479     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5480 
 5481     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5482     load32shorts(vs_front(vs2), zetas);
 5483     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5484     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5485 
 5486     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5487     __ mov(r0, zr); // return 0
 5488     __ ret(lr);
 5489 
 5490     return start;
 5491   }
 5492 
 5493   // Kyber Inverse NTT function
 5494   // Implements
 5495   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5496   //
 5497   // coeffs (short[256]) = c_rarg0
 5498   // ntt_zetas (short[256]) = c_rarg1
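  //
  // Per pair of related coefficients the inverse butterflies generated below
  // compute, in terms of the old values (a sketch; montmul denotes a
  // Montgomery multiplication mod q):
  //   a0 = coeffs[j];  a1 = coeffs[j + l];
  //   coeffs[j]     = a0 + a1;
  //   coeffs[j + l] = montmul(zeta, a0 - a1);
  // with Barrett reductions inserted at the levels where the repeated
  // additions could otherwise overflow the 16-bit range, and a final
  // multiplication by toMont(2^-n mod q) at the end.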
 5499   address generate_kyberInverseNtt() {
 5500 
 5501     __ align(CodeEntryAlignment);
 5502     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 5503     StubCodeMark mark(this, stub_id);
 5504     address start = __ pc();
 5505     __ enter();
 5506 
 5507     const Register coeffs = c_rarg0;
 5508     const Register zetas = c_rarg1;
 5509 
 5510     const Register kyberConsts = r10;
 5511     const Register tmpAddr = r11;
 5512     const Register tmpAddr2 = c_rarg2;
 5513 
 5514     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5515     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5516     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5517 
 5518     __ lea(kyberConsts,
 5519              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5520 
 5521     // level 0
    // At level 0 related coefficients occur in discrete blocks of size 4, so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 4S.
 5524 
 5525     vs_ldpq(vq, kyberConsts);
 5526     int offsets4[4] = { 0, 32, 64, 96 };
 5527     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5528     load32shorts(vs_front(vs2), zetas);
 5529     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5530                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5531     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5532     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5533     load32shorts(vs_front(vs2), zetas);
 5534     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5535                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5536     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5537     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5538     load32shorts(vs_front(vs2), zetas);
 5539     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5540                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5541     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5542     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5543     load32shorts(vs_front(vs2), zetas);
 5544     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5545                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5546     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5547 
 5548     // level 1
    // At level 1 related coefficients occur in discrete blocks of size 8, so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 2D.
 5551 
 5552     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5553     load32shorts(vs_front(vs2), zetas);
 5554     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5555                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5556     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5557     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5558     load32shorts(vs_front(vs2), zetas);
 5559     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5560                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5561     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5562 
 5563     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5564     load32shorts(vs_front(vs2), zetas);
 5565     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5566                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5567     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5568     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5569     load32shorts(vs_front(vs2), zetas);
 5570     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5571                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5572     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5573 
 5574     // level 2
    // At level 2 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
 5577 
 5578     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5579     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5580     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5581     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5582     vs_subv(vs1, __ T8H, vs1, vs2);
 5583     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5584     load64shorts(vs2, zetas);
 5585     vs_ldpq(vq, kyberConsts);
 5586     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5587     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5588 
 5589     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5590     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5591     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5592     vs_subv(vs1, __ T8H, vs1, vs2);
 5593     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5594     load64shorts(vs2, zetas);
 5595     vs_ldpq(vq, kyberConsts);
 5596     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5597     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5598 
 5599     // Barrett reduction at indexes where overflow may happen
 5600 
 5601     // load q and the multiplier for the Barrett reduction
 5602     __ add(tmpAddr, kyberConsts, 16);
 5603     vs_ldpq(vq, tmpAddr);
 5604 
 5605     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5606     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5607     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
 5608     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5609     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5610     vs_sshr(vs2, __ T8H, vs2, 11);
 5611     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5612     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5613     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5614     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5615     vs_sshr(vs2, __ T8H, vs2, 11);
 5616     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5617     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5618 
 5619     // level 3
    // From level 3 upwards coefficients occur in discrete blocks whose size
    // is some multiple of 32, so they can be loaded using ldpq and suitable
    // indexes.
 5622 
 5623     int offsets2[4] = { 0, 64, 128, 192 };
 5624     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5625     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5626     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5627     vs_subv(vs1, __ T8H, vs1, vs2);
 5628     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5629     load64shorts(vs2, zetas);
 5630     vs_ldpq(vq, kyberConsts);
 5631     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5632     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5633 
 5634     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5635     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5636     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5637     vs_subv(vs1, __ T8H, vs1, vs2);
 5638     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5639     load64shorts(vs2, zetas);
 5640     vs_ldpq(vq, kyberConsts);
 5641     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5642     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5643 
 5644     // level 4
 5645 
 5646     int offsets1[4] = { 0, 32, 128, 160 };
 5647     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5648     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5649     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5650     vs_subv(vs1, __ T8H, vs1, vs2);
 5651     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5652     load64shorts(vs2, zetas);
 5653     vs_ldpq(vq, kyberConsts);
 5654     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5655     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5656 
 5657     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5658     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5659     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5660     vs_subv(vs1, __ T8H, vs1, vs2);
 5661     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5662     load64shorts(vs2, zetas);
 5663     vs_ldpq(vq, kyberConsts);
 5664     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5665     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5666 
 5667     // level 5
 5668 
 5669     __ add(tmpAddr, coeffs, 0);
 5670     load64shorts(vs1, tmpAddr);
 5671     __ add(tmpAddr, coeffs, 128);
 5672     load64shorts(vs2, tmpAddr);
 5673     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5674     vs_subv(vs1, __ T8H, vs1, vs2);
 5675     __ add(tmpAddr, coeffs, 0);
 5676     store64shorts(vs3, tmpAddr);
 5677     load64shorts(vs2, zetas);
 5678     vs_ldpq(vq, kyberConsts);
 5679     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5680     __ add(tmpAddr, coeffs, 128);
 5681     store64shorts(vs2, tmpAddr);
 5682 
 5683     load64shorts(vs1, tmpAddr);
 5684     __ add(tmpAddr, coeffs, 384);
 5685     load64shorts(vs2, tmpAddr);
 5686     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5687     vs_subv(vs1, __ T8H, vs1, vs2);
 5688     __ add(tmpAddr, coeffs, 256);
 5689     store64shorts(vs3, tmpAddr);
 5690     load64shorts(vs2, zetas);
 5691     vs_ldpq(vq, kyberConsts);
 5692     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5693     __ add(tmpAddr, coeffs, 384);
 5694     store64shorts(vs2, tmpAddr);
 5695 
 5696     // Barrett reduction at indexes where overflow may happen
 5697 
 5698     // load q and the multiplier for the Barrett reduction
 5699     __ add(tmpAddr, kyberConsts, 16);
 5700     vs_ldpq(vq, tmpAddr);
 5701 
 5702     int offsets0[2] = { 0, 256 };
 5703     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5704     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5705     vs_sshr(vs2, __ T8H, vs2, 11);
 5706     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5707     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5708 
 5709     // level 6
 5710 
 5711     __ add(tmpAddr, coeffs, 0);
 5712     load64shorts(vs1, tmpAddr);
 5713     __ add(tmpAddr, coeffs, 256);
 5714     load64shorts(vs2, tmpAddr);
 5715     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5716     vs_subv(vs1, __ T8H, vs1, vs2);
 5717     __ add(tmpAddr, coeffs, 0);
 5718     store64shorts(vs3, tmpAddr);
 5719     load64shorts(vs2, zetas);
 5720     vs_ldpq(vq, kyberConsts);
 5721     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5722     __ add(tmpAddr, coeffs, 256);
 5723     store64shorts(vs2, tmpAddr);
 5724 
 5725     __ add(tmpAddr, coeffs, 128);
 5726     load64shorts(vs1, tmpAddr);
 5727     __ add(tmpAddr, coeffs, 384);
 5728     load64shorts(vs2, tmpAddr);
 5729     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5730     vs_subv(vs1, __ T8H, vs1, vs2);
 5731     __ add(tmpAddr, coeffs, 128);
 5732     store64shorts(vs3, tmpAddr);
 5733     load64shorts(vs2, zetas);
 5734     vs_ldpq(vq, kyberConsts);
 5735     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5736     __ add(tmpAddr, coeffs, 384);
 5737     store64shorts(vs2, tmpAddr);
 5738 
 5739     // multiply by 2^-n
 5740 
 5741     // load toMont(2^-n mod q)
 5742     __ add(tmpAddr, kyberConsts, 48);
 5743     __ ldr(v29, __ Q, tmpAddr);
 5744 
 5745     vs_ldpq(vq, kyberConsts);
 5746     __ add(tmpAddr, coeffs, 0);
 5747     load64shorts(vs1, tmpAddr);
 5748     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5749     __ add(tmpAddr, coeffs, 0);
 5750     store64shorts(vs2, tmpAddr);
 5751 
    // now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
 5753     load64shorts(vs1, tmpAddr);
 5754     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5755     __ add(tmpAddr, coeffs, 128);
 5756     store64shorts(vs2, tmpAddr);
 5757 
 5758     // now tmpAddr contains coeffs + 256
 5759     load64shorts(vs1, tmpAddr);
 5760     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5761     __ add(tmpAddr, coeffs, 256);
 5762     store64shorts(vs2, tmpAddr);
 5763 
 5764     // now tmpAddr contains coeffs + 384
 5765     load64shorts(vs1, tmpAddr);
 5766     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5767     __ add(tmpAddr, coeffs, 384);
 5768     store64shorts(vs2, tmpAddr);
 5769 
 5770     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5771     __ mov(r0, zr); // return 0
 5772     __ ret(lr);
 5773 
 5774     return start;
 5775   }
 5776 
 5777   // Kyber multiply polynomials in the NTT domain.
 5778   // Implements
 5779   // static int implKyberNttMult(
 5780   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5781   //
 5782   // result (short[256]) = c_rarg0
 5783   // ntta (short[256]) = c_rarg1
 5784   // nttb (short[256]) = c_rarg2
 5785   // zetas (short[128]) = c_rarg3
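  //
  // For each pair of NTT-domain coefficients (a0, a1), (b0, b1) and the
  // corresponding zeta, the loop below computes the usual ML-KEM base
  // multiplication (a sketch; montmul denotes a Montgomery multiplication
  // mod q and R2 = R^2 mod q is used to convert back from Montgomery
  // representation):
  //   r0 = montmul(montmul(a0, b0) + montmul(montmul(a1, b1), zeta), R2);
  //   r1 = montmul(montmul(a0, b1) + montmul(a1, b0), R2);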
 5786   address generate_kyberNttMult() {
 5787 
 5788     __ align(CodeEntryAlignment);
 5789     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 5790     StubCodeMark mark(this, stub_id);
 5791     address start = __ pc();
 5792     __ enter();
 5793 
 5794     const Register result = c_rarg0;
 5795     const Register ntta = c_rarg1;
 5796     const Register nttb = c_rarg2;
 5797     const Register zetas = c_rarg3;
 5798 
 5799     const Register kyberConsts = r10;
 5800     const Register limit = r11;
 5801 
 5802     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 5803     VSeq<4> vs3(16), vs4(20);
 5804     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5805     VSeq<2> vz(28);          // pair of zetas
 5806     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5807 
 5808     __ lea(kyberConsts,
 5809              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5810 
 5811     Label kyberNttMult_loop;
 5812 
 5813     __ add(limit, result, 512);
 5814 
 5815     // load q and qinv
 5816     vs_ldpq(vq, kyberConsts);
 5817 
 5818     // load R^2 mod q (to convert back from Montgomery representation)
 5819     __ add(kyberConsts, kyberConsts, 64);
 5820     __ ldr(v27, __ Q, kyberConsts);
 5821 
 5822     __ BIND(kyberNttMult_loop);
 5823 
 5824     // load 16 zetas
 5825     vs_ldpq_post(vz, zetas);
 5826 
 5827     // load 2 sets of 32 coefficients from the two input arrays
    // interleaved as shorts, i.e. pairs of shorts adjacent in memory
 5829     // are striped across pairs of vector registers
 5830     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5831     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5832     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5833     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 5834 
 5835     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5836     // i.e. montmul the first and second halves of vs1 in order and
 5837     // then with one sequence reversed storing the two results in vs3
 5838     //
 5839     // vs3[0] <- montmul(a0, b0)
 5840     // vs3[1] <- montmul(a1, b1)
 5841     // vs3[2] <- montmul(a0, b1)
 5842     // vs3[3] <- montmul(a1, b0)
 5843     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5844     kyber_montmul16(vs_back(vs3),
 5845                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5846 
 5847     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5848     // i.e. montmul the first and second halves of vs4 in order and
 5849     // then with one sequence reversed storing the two results in vs1
 5850     //
 5851     // vs1[0] <- montmul(a2, b2)
 5852     // vs1[1] <- montmul(a3, b3)
 5853     // vs1[2] <- montmul(a2, b3)
 5854     // vs1[3] <- montmul(a3, b2)
 5855     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5856     kyber_montmul16(vs_back(vs1),
 5857                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5858 
 5859     // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 5860     // We can schedule two montmuls at a time if we use a suitable vector
 5861     // sequence <vs3[1], vs1[1]>.
 5862     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5863     VSeq<2> vs5(vs3[1], delta);
 5864 
 5865     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5866     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5867     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5868 
 5869     // add results in pairs storing in vs3
 5870     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5871     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5872     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5873 
 5874     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5875     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5876     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 5877 
 5878     // vs1 <- montmul(vs3, montRSquareModQ)
 5879     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5880 
 5881     // store back the two pairs of result vectors de-interleaved as 8H elements
    // i.e. storing each pair of shorts striped across a register pair adjacent
 5883     // in memory
 5884     vs_st2_post(vs1, __ T8H, result);
 5885 
 5886     __ cmp(result, limit);
 5887     __ br(Assembler::NE, kyberNttMult_loop);
 5888 
 5889     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5890     __ mov(r0, zr); // return 0
 5891     __ ret(lr);
 5892 
 5893     return start;
 5894   }
 5895 
 5896   // Kyber add 2 polynomials.
 5897   // Implements
 5898   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5899   //
 5900   // result (short[256]) = c_rarg0
 5901   // a (short[256]) = c_rarg1
 5902   // b (short[256]) = c_rarg2
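  //
  // Per 16-bit lane the loop below computes
  //   result[i] = a[i] + b[i] + q;
  // with q broadcast from the constants table (just a sketch of the
  // lane-wise arithmetic; the 8/2/1 register grouping below is purely a
  // register-allocation concern).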
 5903   address generate_kyberAddPoly_2() {
 5904 
 5905     __ align(CodeEntryAlignment);
 5906     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 5907     StubCodeMark mark(this, stub_id);
 5908     address start = __ pc();
 5909     __ enter();
 5910 
 5911     const Register result = c_rarg0;
 5912     const Register a = c_rarg1;
 5913     const Register b = c_rarg2;
 5914 
 5915     const Register kyberConsts = r11;
 5916 
 5917     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5918     // So, we can load, add and store the data in 3 groups of 11,
 5919     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5920     // registers. A further constraint is that the mapping needs
 5921     // to skip callee saves. So, we allocate the register
 5922     // sequences using two 8 sequences, two 2 sequences and two
 5923     // single registers.
 5924     VSeq<8> vs1_1(0);
 5925     VSeq<2> vs1_2(16);
 5926     FloatRegister vs1_3 = v28;
 5927     VSeq<8> vs2_1(18);
 5928     VSeq<2> vs2_2(26);
 5929     FloatRegister vs2_3 = v29;
 5930 
 5931     // two constant vector sequences
 5932     VSeq<8> vc_1(31, 0);
 5933     VSeq<2> vc_2(31, 0);
 5934 
 5935     FloatRegister vc_3 = v31;
 5936     __ lea(kyberConsts,
 5937              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5938 
 5939     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5940     for (int i = 0; i < 3; i++) {
 5941       // load 80 or 88 values from a into vs1_1/2/3
 5942       vs_ldpq_post(vs1_1, a);
 5943       vs_ldpq_post(vs1_2, a);
 5944       if (i < 2) {
 5945         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5946       }
 5947       // load 80 or 88 values from b into vs2_1/2/3
 5948       vs_ldpq_post(vs2_1, b);
 5949       vs_ldpq_post(vs2_2, b);
 5950       if (i < 2) {
 5951         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5952       }
 5953       // sum 80 or 88 values across vs1 and vs2 into vs1
 5954       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5955       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5956       if (i < 2) {
 5957         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5958       }
 5959       // add constant to all 80 or 88 results
 5960       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5961       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5962       if (i < 2) {
 5963         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5964       }
 5965       // store 80 or 88 values
 5966       vs_stpq_post(vs1_1, result);
 5967       vs_stpq_post(vs1_2, result);
 5968       if (i < 2) {
 5969         __ str(vs1_3, __ Q, __ post(result, 16));
 5970       }
 5971     }
 5972 
 5973     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5974     __ mov(r0, zr); // return 0
 5975     __ ret(lr);
 5976 
 5977     return start;
 5978   }
 5979 
 5980   // Kyber add 3 polynomials.
 5981   // Implements
 5982   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 5983   //
 5984   // result (short[256]) = c_rarg0
 5985   // a (short[256]) = c_rarg1
 5986   // b (short[256]) = c_rarg2
 5987   // c (short[256]) = c_rarg3
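  //
  // Per 16-bit lane the loop below computes
  //   result[i] = a[i] + b[i] + c[i] + q;
  // analogous to generate_kyberAddPoly_2 above (again just a sketch of
  // the lane-wise arithmetic).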
 5988   address generate_kyberAddPoly_3() {
 5989 
 5990     __ align(CodeEntryAlignment);
 5991     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 5992     StubCodeMark mark(this, stub_id);
 5993     address start = __ pc();
 5994     __ enter();
 5995 
 5996     const Register result = c_rarg0;
 5997     const Register a = c_rarg1;
 5998     const Register b = c_rarg2;
 5999     const Register c = c_rarg3;
 6000 
 6001     const Register kyberConsts = r11;
 6002 
 6003     // As above we sum 256 sets of values in total i.e. 32 x 8H
 6004     // quadwords.  So, we can load, add and store the data in 3
 6005     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6006     // of 10 or 11 registers. A further constraint is that the
 6007     // mapping needs to skip callee saves. So, we allocate the
 6008     // register sequences using two 8 sequences, two 2 sequences
 6009     // and two single registers.
 6010     VSeq<8> vs1_1(0);
 6011     VSeq<2> vs1_2(16);
 6012     FloatRegister vs1_3 = v28;
 6013     VSeq<8> vs2_1(18);
 6014     VSeq<2> vs2_2(26);
 6015     FloatRegister vs2_3 = v29;
 6016 
 6017     // two constant vector sequences
 6018     VSeq<8> vc_1(31, 0);
 6019     VSeq<2> vc_2(31, 0);
 6020 
 6021     FloatRegister vc_3 = v31;
 6022 
 6023     __ lea(kyberConsts,
 6024              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6025 
 6026     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6027     for (int i = 0; i < 3; i++) {
 6028       // load 80 or 88 values from a into vs1_1/2/3
 6029       vs_ldpq_post(vs1_1, a);
 6030       vs_ldpq_post(vs1_2, a);
 6031       if (i < 2) {
 6032         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6033       }
 6034       // load 80 or 88 values from b into vs2_1/2/3
 6035       vs_ldpq_post(vs2_1, b);
 6036       vs_ldpq_post(vs2_2, b);
 6037       if (i < 2) {
 6038         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6039       }
 6040       // sum 80 or 88 values across vs1 and vs2 into vs1
 6041       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6042       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6043       if (i < 2) {
 6044         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6045       }
 6046       // load 80 or 88 values from c into vs2_1/2/3
 6047       vs_ldpq_post(vs2_1, c);
 6048       vs_ldpq_post(vs2_2, c);
 6049       if (i < 2) {
 6050         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6051       }
 6052       // sum 80 or 88 values across vs1 and vs2 into vs1
 6053       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6054       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6055       if (i < 2) {
 6056         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6057       }
 6058       // add constant to all 80 or 88 results
 6059       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6060       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6061       if (i < 2) {
 6062         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6063       }
 6064       // store 80 or 88 values
 6065       vs_stpq_post(vs1_1, result);
 6066       vs_stpq_post(vs1_2, result);
 6067       if (i < 2) {
 6068         __ str(vs1_3, __ Q, __ post(result, 16));
 6069       }
 6070     }
 6071 
 6072     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6073     __ mov(r0, zr); // return 0
 6074     __ ret(lr);
 6075 
 6076     return start;
 6077   }
 6078 
 6079   // Kyber parse XOF output to polynomial coefficient candidates
 6080   // or decodePoly(12, ...).
 6081   // Implements
 6082   // static int implKyber12To16(
 6083   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6084   //
 6085   // we assume that parsed and condensed are allocated such that for
 6086   // n = (parsedLength + 63) / 64
 6087   // n blocks of 96 bytes of input can be processed, i.e.
 6088   // index + n * 96 <= condensed.length and
 6089   // n * 64 <= parsed.length
 6090   //
 6091   // condensed (byte[]) = c_rarg0
 6092   // condensedIndex = c_rarg1
 6093   // parsed (short[]) = c_rarg2
 6094   // parsedLength = c_rarg3
 6095   address generate_kyber12To16() {
 6096     Label L_F00, L_loop;
 6097 
 6098     __ align(CodeEntryAlignment);
 6099     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6100     StubCodeMark mark(this, stub_id);
 6101     address start = __ pc();
 6102     __ enter();
 6103 
 6104     const Register condensed = c_rarg0;
 6105     const Register condensedOffs = c_rarg1;
 6106     const Register parsed = c_rarg2;
 6107     const Register parsedLength = c_rarg3;
 6108 
 6109     const Register tmpAddr = r11;
 6110 
 6111     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6112     // quadwords so we need a 6 vector sequence for the inputs.
 6113     // Parsing produces 64 shorts, employing two 8 vector
 6114     // sequences to store and combine the intermediate data.
 6115     VSeq<6> vin(24);
 6116     VSeq<8> va(0), vb(16);
 6117 
 6118     __ adr(tmpAddr, L_F00);
 6119     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6120     __ add(condensed, condensed, condensedOffs);
 6121 
 6122     __ BIND(L_loop);
 6123     // load 96 (6 x 16B) byte values
 6124     vs_ld3_post(vin, __ T16B, condensed);
 6125 
 6126     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6127     // holds 48 (16x3) contiguous bytes from memory striped
 6128     // horizontally across each of the 16 byte lanes. Equivalently,
 6129     // that is 16 pairs of 12-bit integers. Likewise the back half
 6130     // holds the next 48 bytes in the same arrangement.
 6131 
 6132     // Each vector in the front half can also be viewed as a vertical
 6133     // strip across the 16 pairs of 12 bit integers. Each byte in
 6134     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6135     // byte in vin[1] stores the high 4 bits of the first int and the
 6136     // low 4 bits of the second int. Each byte in vin[2] stores the
    // high 8 bits of the second int. Likewise for the vectors in the
    // second half.
 6139 
 6140     // Converting the data to 16-bit shorts requires first of all
 6141     // expanding each of the 6 x 16B vectors into 6 corresponding
 6142     // pairs of 8H vectors. Mask, shift and add operations on the
 6143     // resulting vector pairs can be used to combine 4 and 8 bit
 6144     // parts of related 8H vector elements.
 6145     //
    // The middle vectors (vin[1] and vin[4]) are actually expanded
    // twice, one copy manipulated to provide the high 4 bits
    // belonging to the first short in a pair and another copy
    // manipulated to provide the low 4 bits belonging to the
    // second short in a pair. This is why the vector sequences va
    // and vb used to hold the expanded 8H elements are of length 8.
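
    // As a scalar sketch, each input byte triple (b0, b1, b2) therefore
    // decodes to two 12-bit values
    //   s0 = b0 | ((b1 & 0xf) << 8);
    //   s1 = (b1 >> 4) | (b2 << 4);
    // which is what the mask, shift and add steps below combine,
    // 16 pairs of shorts at a time.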
 6152 
 6153     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6154     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6155     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6156     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6157     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6158     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6159     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6160     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6161 
 6162     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6163     // and vb[4:5]
 6164     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6165     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6166     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6167     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6168     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6169     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6170 
 6171     // shift lo byte of copy 1 of the middle stripe into the high byte
 6172     __ shl(va[2], __ T8H, va[2], 8);
 6173     __ shl(va[3], __ T8H, va[3], 8);
 6174     __ shl(vb[2], __ T8H, vb[2], 8);
 6175     __ shl(vb[3], __ T8H, vb[3], 8);
 6176 
 6177     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6178     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6179     // are in bit positions [4..11].
 6180     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6181     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6182     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6183     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6184 
 6185     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6186     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6187     // copy2
 6188     __ andr(va[2], __ T16B, va[2], v31);
 6189     __ andr(va[3], __ T16B, va[3], v31);
 6190     __ ushr(va[4], __ T8H, va[4], 4);
 6191     __ ushr(va[5], __ T8H, va[5], 4);
 6192     __ andr(vb[2], __ T16B, vb[2], v31);
 6193     __ andr(vb[3], __ T16B, vb[3], v31);
 6194     __ ushr(vb[4], __ T8H, vb[4], 4);
 6195     __ ushr(vb[5], __ T8H, vb[5], 4);
 6196 
 6197     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6198     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6199     // n.b. the ordering ensures: i) inputs are consumed before they
 6200     // are overwritten ii) the order of 16-bit results across successive
 6201     // pairs of vectors in va and then vb reflects the order of the
 6202     // corresponding 12-bit inputs
 6203     __ addv(va[0], __ T8H, va[0], va[2]);
 6204     __ addv(va[2], __ T8H, va[1], va[3]);
 6205     __ addv(va[1], __ T8H, va[4], va[6]);
 6206     __ addv(va[3], __ T8H, va[5], va[7]);
 6207     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6208     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6209     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6210     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6211 
 6212     // store 64 results interleaved as shorts
 6213     vs_st2_post(vs_front(va), __ T8H, parsed);
 6214     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6215 
 6216     __ sub(parsedLength, parsedLength, 64);
 6217     __ cmp(parsedLength, (u1)0);
 6218     __ br(Assembler::GT, L_loop);
 6219 
 6220     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6221     __ mov(r0, zr); // return 0
 6222     __ ret(lr);
 6223 
 6224     // bind label and generate constant data used by this stub
 6225     __ BIND(L_F00);
 6226     __ emit_int64(0x0f000f000f000f00);
 6227     __ emit_int64(0x0f000f000f000f00);
 6228 
 6229     return start;
 6230   }
 6231 
 6232   // Kyber Barrett reduce function.
 6233   // Implements
 6234   // static int implKyberBarrettReduce(short[] coeffs) {}
 6235   //
 6236   // coeffs (short[256]) = c_rarg0
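  //
  // Per 16-bit lane the loop below computes the usual Barrett step
  // (a sketch, assuming the constants table holds q = 3329 and
  // barrettMultiplier = round(2^26 / q)):
  //   t         = ((int32_t)coeffs[i] * barrettMultiplier) >> 26; // ~ coeffs[i] / q
  //   coeffs[i] = coeffs[i] - t * q;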
 6237   address generate_kyberBarrettReduce() {
 6238 
 6239     __ align(CodeEntryAlignment);
 6240     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6241     StubCodeMark mark(this, stub_id);
 6242     address start = __ pc();
 6243     __ enter();
 6244 
 6245     const Register coeffs = c_rarg0;
 6246 
 6247     const Register kyberConsts = r10;
 6248     const Register result = r11;
 6249 
 6250     // As above we process 256 sets of values in total i.e. 32 x
 6251     // 8H quadwords. So, we can load, add and store the data in 3
 6252     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6253     // of 10 or 11 registers. A further constraint is that the
 6254     // mapping needs to skip callee saves. So, we allocate the
 6255     // register sequences using two 8 sequences, two 2 sequences
 6256     // and two single registers.
 6257     VSeq<8> vs1_1(0);
 6258     VSeq<2> vs1_2(16);
 6259     FloatRegister vs1_3 = v28;
 6260     VSeq<8> vs2_1(18);
 6261     VSeq<2> vs2_2(26);
 6262     FloatRegister vs2_3 = v29;
 6263 
 6264     // we also need a pair of corresponding constant sequences
 6265 
 6266     VSeq<8> vc1_1(30, 0);
 6267     VSeq<2> vc1_2(30, 0);
 6268     FloatRegister vc1_3 = v30; // for kyber_q
 6269 
 6270     VSeq<8> vc2_1(31, 0);
 6271     VSeq<2> vc2_2(31, 0);
 6272     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6273 
 6274     __ add(result, coeffs, 0);
 6275     __ lea(kyberConsts,
 6276              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6277 
 6278     // load q and the multiplier for the Barrett reduction
 6279     __ add(kyberConsts, kyberConsts, 16);
 6280     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6281 
 6282     for (int i = 0; i < 3; i++) {
 6283       // load 80 or 88 coefficients
 6284       vs_ldpq_post(vs1_1, coeffs);
 6285       vs_ldpq_post(vs1_2, coeffs);
 6286       if (i < 2) {
 6287         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6288       }
 6289 
 6290       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6291       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6292       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6293       if (i < 2) {
 6294         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6295       }
 6296 
 6297       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6298       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6299       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6300       if (i < 2) {
 6301         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6302       }
 6303 
 6304       // vs1 <- vs1 - vs2 * kyber_q
 6305       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6306       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6307       if (i < 2) {
 6308         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6309       }
 6310 
 6311       vs_stpq_post(vs1_1, result);
 6312       vs_stpq_post(vs1_2, result);
 6313       if (i < 2) {
 6314         __ str(vs1_3, __ Q, __ post(result, 16));
 6315       }
 6316     }
 6317 
 6318     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6319     __ mov(r0, zr); // return 0
 6320     __ ret(lr);
 6321 
 6322     return start;
 6323   }
 6324 
 6325 
 6326   // Dilithium-specific montmul helper routines that generate parallel
 6327   // code for, respectively, a single 4x4s vector sequence montmul or
 6328   // two such multiplies in a row.
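  //
  // As a scalar sketch of a single such multiplication (assuming the
  // usual ML-DSA modulus q = 8380417, R = 2^32 and qinv = q^-1 mod R,
  // as in the reference Montgomery reduction):
  //   int64_t prod = (int64_t)a * b;
  //   int32_t t    = (int32_t)(prod * qinv);                 // low 32 bits
  //   int32_t res  = (int32_t)((prod - (int64_t)t * q) >> 32);
  // so that res is congruent to a * b * R^-1 mod q and stays within a
  // signed 32-bit range; the helpers below schedule the equivalent
  // computation four lanes at a time.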
 6329 
 6330   // Perform 16 32-bit Montgomery multiplications in parallel
 6331   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6332                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6333     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6334     // It will assert that the register use is valid
 6335     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6336   }
 6337 
 6338   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6339   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6340                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6341     // Schedule two successive 4x4S multiplies via the montmul helper
 6342     // on the front and back halves of va, vb and vc. The helper will
 6343     // assert that the register use has no overlap conflicts on each
 6344     // individual call but we also need to ensure that the necessary
 6345     // disjoint/equality constraints are met across both calls.
 6346 
 6347     // vb, vc, vtmp and vq must be disjoint. va must either be
 6348     // disjoint from all other registers or equal vc
 6349 
 6350     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6351     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6352     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6353 
 6354     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6355     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6356 
 6357     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6358 
 6359     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6360     assert(vs_disjoint(va, vb), "va and vb overlap");
 6361     assert(vs_disjoint(va, vq), "va and vq overlap");
 6362     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6363 
 6364     // We multiply the front and back halves of each sequence 4 at a
 6365     // time because
 6366     //
 6367     // 1) we are currently only able to get 4-way instruction
 6368     // parallelism at best
 6369     //
 6370     // 2) we need registers for the constants in vq and temporary
 6371     // scratch registers to hold intermediate results so vtmp can only
 6372     // be a VSeq<4> which means we only have 4 scratch slots.
 6373 
 6374     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6375     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6376   }
 6377 
 6378   // Perform combined montmul then add/sub on 4x4S vectors.
 6379   void dilithium_montmul16_sub_add(
 6380           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6381           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6382     // compute a = montmul(a1, c)
 6383     dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
 6385     vs_subv(va1, __ T4S, va0, vc);
 6386     //    and a0 = a0 + a
 6387     vs_addv(va0, __ T4S, va0, vc);
 6388   }
 6389 
  // Perform combined add/sub then montmul on 4x4S vectors.
 6391   void dilithium_sub_add_montmul16(
 6392           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6393           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6394     // compute c = a0 - a1
 6395     vs_subv(vtmp1, __ T4S, va0, va1);
 6396     // output a0 = a0 + a1
 6397     vs_addv(va0, __ T4S, va0, va1);
 6398     // output a1 = b montmul c
 6399     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6400   }
 6401 
 6402   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6403   // in the Java implementation come in sequences of at least 8, so we
 6404   // can use ldpq to collect the corresponding data into pairs of vector
 6405   // registers.
  // We collect the coefficients corresponding to the 'j+l' indexes into
  // the vector registers v0-v7 and the zetas into the vector registers
  // v16-v23. We then do the (Montgomery) multiplications by the zetas in
  // parallel into v16-v23, load the coeffs corresponding to the 'j' indexes
  // into v0-v7, do the additions into v24-v31 and the subtractions into
  // v0-v7, and finally save the results back to the coeffs array.
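  //
  // In terms of the Java arrays, each butterfly generated here computes
  // (a sketch; montmul denotes a Montgomery multiplication mod q):
  //   a             = montmul(zeta, coeffs[j + l]);
  //   coeffs[j]     = coeffs[j] + a;
  //   coeffs[j + l] = coeffs[j] - a;   // using the old value of coeffs[j]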
 6412   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6413     const Register coeffs, const Register zetas) {
 6414     int c1 = 0;
 6415     int c2 = 512;
 6416     int startIncr;
 6417     // don't use callee save registers v8 - v15
 6418     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6419     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6420     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6421     int offsets[4] = { 0, 32, 64, 96 };
 6422 
 6423     for (int level = 0; level < 5; level++) {
 6424       int c1Start = c1;
 6425       int c2Start = c2;
 6426       if (level == 3) {
 6427         offsets[1] = 32;
 6428         offsets[2] = 128;
 6429         offsets[3] = 160;
 6430       } else if (level == 4) {
 6431         offsets[1] = 64;
 6432         offsets[2] = 128;
 6433         offsets[3] = 192;
 6434       }
 6435 
 6436       // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
 6437       // time at 4 different offsets and multiply them in order by the
 6438       // next set of input values. So we employ indexed load and store
 6439       // pair instructions with arrangement 4S.
 6440       for (int i = 0; i < 4; i++) {
 6441         // reload q and qinv
 6442         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6443         // load 8x4S coefficients via second start pos == c2
 6444         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6445         // load next 8x4S inputs == b
 6446         vs_ldpq_post(vs2, zetas);
 6447         // compute a == c2 * b mod MONT_Q
 6448         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6449         // load 8x4s coefficients via first start pos == c1
 6450         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6451         // compute a1 =  c1 + a
 6452         vs_addv(vs3, __ T4S, vs1, vs2);
 6453         // compute a2 =  c1 - a
 6454         vs_subv(vs1, __ T4S, vs1, vs2);
 6455         // output a1 and a2
 6456         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6457         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6458 
 6459         int k = 4 * level + i;
 6460 
 6461         if (k > 7) {
 6462           startIncr = 256;
 6463         } else if (k == 5) {
 6464           startIncr = 384;
 6465         } else {
 6466           startIncr = 128;
 6467         }
 6468 
 6469         c1Start += startIncr;
 6470         c2Start += startIncr;
 6471       }
 6472 
 6473       c2 /= 2;
 6474     }
 6475   }
 6476 
 6477   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6478   // Implements the method
 6479   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
  // of the Java class sun.security.provider.ML_DSA.
 6481   //
 6482   // coeffs (int[256]) = c_rarg0
 6483   // zetas (int[256]) = c_rarg1
 6484   address generate_dilithiumAlmostNtt() {
 6485 
 6486     __ align(CodeEntryAlignment);
 6487     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 6488     StubCodeMark mark(this, stub_id);
 6489     address start = __ pc();
 6490     __ enter();
 6491 
 6492     const Register coeffs = c_rarg0;
 6493     const Register zetas = c_rarg1;
 6494 
 6495     const Register tmpAddr = r9;
 6496     const Register dilithiumConsts = r10;
 6497     const Register result = r11;
 6498     // don't use callee save registers v8 - v15
 6499     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6500     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6501     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6502     int offsets[4] = { 0, 32, 64, 96};
 6503     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6504     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6505     __ add(result, coeffs, 0);
 6506     __ lea(dilithiumConsts,
 6507              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6508 
 6509     // Each level represents one iteration of the outer for loop of the Java version.
 6510 
 6511     // level 0-4
 6512     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6513 
 6514     // level 5
 6515 
 6516     // At level 5 the coefficients we need to combine with the zetas
 6517     // are grouped in memory in blocks of size 4. So, for both sets of
 6518     // coefficients we load 4 adjacent values at 8 different offsets
 6519     // using an indexed ldr with register variant Q and multiply them
 6520     // in sequence order by the next set of inputs. Likewise we store
    // the results using an indexed str with register variant Q.
 6522     for (int i = 0; i < 1024; i += 256) {
 6523       // reload constants q, qinv each iteration as they get clobbered later
 6524       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6525       // load 32 (8x4S) coefficients via first offsets = c1
 6526       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6527       // load next 32 (8x4S) inputs = b
 6528       vs_ldpq_post(vs2, zetas);
      // a = b montmul c1
 6530       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6531       // load 32 (8x4S) coefficients via second offsets = c2
 6532       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6533       // add/sub with result of multiply
      vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
      vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 6536       // write back new coefficients using same offsets
 6537       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6538       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6539     }
 6540 
 6541     // level 6
 6542     // At level 6 the coefficients we need to combine with the zetas
 6543     // are grouped in memory in pairs, the first two being montmul
 6544     // inputs and the second add/sub inputs. We can still implement
 6545     // the montmul+sub+add using 4-way parallelism but only if we
 6546     // combine the coefficients with the zetas 16 at a time. We load 8
 6547     // adjacent values at 4 different offsets using an ld2 load with
 6548     // arrangement 2D. That interleaves the lower and upper halves of
 6549     // each pair of quadwords into successive vector registers. We
 6550     // then need to montmul the 4 even elements of the coefficients
 6551     // register sequence by the zetas in order and then add/sub the 4
 6552     // odd elements of the coefficients register sequence. We use an
 6553     // equivalent st2 operation to store the results back into memory
 6554     // de-interleaved.
 6555     for (int i = 0; i < 1024; i += 128) {
 6556       // reload constants q, qinv each iteration as they get clobbered later
 6557       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6558       // load interleaved 16 (4x2D) coefficients via offsets
 6559       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6560       // load next 16 (4x4S) inputs
 6561       vs_ldpq_post(vs_front(vs2), zetas);
 6562       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6563       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6564                                   vs_front(vs2), vtmp, vq);
 6565       // store interleaved 16 (4x2D) coefficients via offsets
 6566       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6567     }
 6568 
 6569     // level 7
 6570     // At level 7 the coefficients we need to combine with the zetas
    // occur singly, with montmul inputs alternating with add/sub
 6572     // inputs. Once again we can use 4-way parallelism to combine 16
 6573     // zetas at a time. However, we have to load 8 adjacent values at
 6574     // 4 different offsets using an ld2 load with arrangement 4S. That
    // interleaves the odd words of each pair into one
 6576     // coefficients vector register and the even words of the pair
 6577     // into the next register. We then need to montmul the 4 even
 6578     // elements of the coefficients register sequence by the zetas in
 6579     // order and then add/sub the 4 odd elements of the coefficients
 6580     // register sequence. We use an equivalent st2 operation to store
 6581     // the results back into memory de-interleaved.
 6582 
 6583     for (int i = 0; i < 1024; i += 128) {
 6584       // reload constants q, qinv each iteration as they get clobbered later
 6585       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6586       // load interleaved 16 (4x4S) coefficients via offsets
 6587       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6588       // load next 16 (4x4S) inputs
 6589       vs_ldpq_post(vs_front(vs2), zetas);
 6590       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6591       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6592                                   vs_front(vs2), vtmp, vq);
 6593       // store interleaved 16 (4x4S) coefficients via offsets
 6594       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6595     }
 6596     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6597     __ mov(r0, zr); // return 0
 6598     __ ret(lr);
 6599 
 6600     return start;
 6601   }
 6602 
 6603   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6604   // in the Java implementation come in sequences of at least 8, so we
 6605   // can use ldpq to collect the corresponding data into pairs of vector
 6606   // registers.
 6607   // We collect the coefficients that correspond to the 'j's into vs1
 6608   // and the coefficients that correspond to the 'j+l's into vs2, then
 6609   // do the additions into vs3 and the subtractions into vs1. We then
 6610   // save the result of the additions, load the zetas into vs2,
 6611   // do the (Montgomery) multiplications by zeta in parallel into vs2
 6612   // and finally save the results back to the coeffs array.
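        //
        // In effect, each butterfly pair (j, j+l) at these levels computes
        //   t           = coeffs[j]
        //   coeffs[j]   = t + coeffs[j+l]
        //   coeffs[j+l] = montmul(t - coeffs[j+l], zeta)
        // with 32 such butterflies handled per pass through the inner loop body.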
 6613   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6614     const Register coeffs, const Register zetas) {
 6615     int c1 = 0;
 6616     int c2 = 32;
 6617     int startIncr;
 6618     int offsets[4];
 6619     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6620     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6621     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6622 
 6623     offsets[0] = 0;
 6624 
 6625     for (int level = 3; level < 8; level++) {
 6626       int c1Start = c1;
 6627       int c2Start = c2;
 6628       if (level == 3) {
 6629         offsets[1] = 64;
 6630         offsets[2] = 128;
 6631         offsets[3] = 192;
 6632       } else if (level == 4) {
 6633         offsets[1] = 32;
 6634         offsets[2] = 128;
 6635         offsets[3] = 160;
 6636       } else {
 6637         offsets[1] = 32;
 6638         offsets[2] = 64;
 6639         offsets[3] = 96;
 6640       }
 6641 
 6642       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6643       // time at 4 different offsets and multiply them in order by the
 6644       // next set of input values. So we employ indexed load and store
 6645       // pair instructions with arrangement 4S.
 6646       for (int i = 0; i < 4; i++) {
 6647         // load v1 32 (8x4S) coefficients relative to first start index
 6648         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6649         // load v2 32 (8x4S) coefficients relative to second start index
 6650         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 6651         // a0 = v1 + v2 -- n.b. clobbers vq
 6652         vs_addv(vs3, __ T4S, vs1, vs2);
 6653         // a1 = v1 - v2
 6654         vs_subv(vs1, __ T4S, vs1, vs2);
 6655         // save a0 relative to first start index
 6656         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6657         // load constants q, qinv each iteration as they get clobbered above
 6658         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6659         // load b next 32 (8x4S) inputs
 6660         vs_ldpq_post(vs2, zetas);
 6661         // a = a1 montmul b
 6662         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6663         // save a relative to second start index
 6664         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6665 
 6666         int k = 4 * level + i;
 6667 
 6668         if (k < 24) {
 6669           startIncr = 256;
 6670         } else if (k == 25) {
 6671           startIncr = 384;
 6672         } else {
 6673           startIncr = 128;
 6674         }
 6675 
 6676         c1Start += startIncr;
 6677         c2Start += startIncr;
 6678       }
 6679 
 6680       c2 *= 2;
 6681     }
 6682   }
 6683 
 6684   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6685   // Implements the method
 6686   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6687   // the sun.security.provider.ML_DSA class.
 6688   //
 6689   // coeffs (int[256]) = c_rarg0
 6690   // zetas (int[256]) = c_rarg1
 6691   address generate_dilithiumAlmostInverseNtt() {
 6692 
 6693     __ align(CodeEntryAlignment);
 6694     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 6695     StubCodeMark mark(this, stub_id);
 6696     address start = __ pc();
 6697     __ enter();
 6698 
 6699     const Register coeffs = c_rarg0;
 6700     const Register zetas = c_rarg1;
 6701 
 6702     const Register tmpAddr = r9;
 6703     const Register dilithiumConsts = r10;
 6704     const Register result = r11;
 6705     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6706     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6707     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6708     int offsets[4] = { 0, 32, 64, 96 };
 6709     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6710     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6711 
 6712     __ add(result, coeffs, 0);
 6713     __ lea(dilithiumConsts,
 6714              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6715 
 6716     // Each level represents one iteration of the outer for loop of the Java version
 6717 
 6718     // level 0
 6719     // At level 0 we need to interleave adjacent quartets of
 6720     // coefficients before we multiply and add/sub by the next 16
 6721     // zetas just as we did for level 7 in the multiply code. So we
 6722     // load and store the values using an ld2/st2 with arrangement 4S.
 6723     for (int i = 0; i < 1024; i += 128) {
 6724       // load constants q, qinv
 6725       // n.b. this can be moved out of the loop as they do not get
 6726       // clobbered by first two loops
 6727       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6728       // a0/a1 load interleaved 32 (8x4S) coefficients
 6729       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6730       // b load next 32 (8x4S) inputs
 6731       vs_ldpq_post(vs_front(vs2), zetas);
 6732       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6733       // n.b. second half of vs2 provides temporary register storage
 6734       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6735                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6736       // a0/a1 store interleaved 32 (8x4S) coefficients
 6737       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6738     }
 6739 
 6740     // level 1
 6741     // At level 1 we need to interleave pairs of adjacent pairs of
 6742     // coefficients before we multiply by the next 16 zetas just as we
 6743     // did for level 6 in the multiply code. So we load and store the
 6744     // values using an ld2/st2 with arrangement 2D.
 6745     for (int i = 0; i < 1024; i += 128) {
 6746       // a0/a1 load interleaved 32 (8x2D) coefficients
 6747       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6748       // b load next 16 (4x4S) inputs
 6749       vs_ldpq_post(vs_front(vs2), zetas);
 6750       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6751       // n.b. second half of vs2 provides temporary register storage
 6752       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6753                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6754       // a0/a1 store interleaved 32 (8x2D) coefficients
 6755       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6756     }
 6757 
 6758     // level 2
 6759     // At level 2 coefficients come in blocks of 4. So, we load 4
 6760     // adjacent coefficients at 8 distinct offsets for both the first
 6761     // and second coefficient sequences, using an ldr with register
 6762     // variant Q, then combine them with the next set of 32 zetas. Likewise
 6763     // we store the results using an str with register variant Q.
 6764     for (int i = 0; i < 1024; i += 256) {
 6765       // c0 load 32 (8x4S) coefficients via first offsets
 6766       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6767       // c1 load 32 (8x4S) coefficients via second offsets
 6768       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6769       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6770       vs_addv(vs3, __ T4S, vs1, vs2);
 6771       // c = c0 - c1
 6772       vs_subv(vs1, __ T4S, vs1, vs2);
 6773       // store a0 32 (8x4S) coefficients via first offsets
 6774       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6775       // b load 32 (8x4S) next inputs
 6776       vs_ldpq_post(vs2, zetas);
 6777       // reload constants q, qinv -- they were clobbered earlier
 6778       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6779       // compute a1 = b montmul c
 6780       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6781       // store a1 32 (8x4S) coefficients via second offsets
 6782       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6783     }
 6784 
 6785     // level 3-7
 6786     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6787 
 6788     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6789     __ mov(r0, zr); // return 0
 6790     __ ret(lr);
 6791 
 6792     return start;
 6793   }
 6794 
 6795   // Dilithium multiply polynomials in the NTT domain.
 6796   // Straightforward implementation of the method
 6797   // static int implDilithiumNttMult(
 6798   //              int[] result, int[] ntta, int[] nttb) {} of
 6799   // the sun.security.provider.ML_DSA class.
 6800   //
 6801   // result (int[256]) = c_rarg0
 6802   // poly1 (int[256]) = c_rarg1
 6803   // poly2 (int[256]) = c_rarg2
 6804   address generate_dilithiumNttMult() {
 6805 
 6806     __ align(CodeEntryAlignment);
 6807     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 6808     StubCodeMark mark(this, stub_id);
 6809     address start = __ pc();
 6810     __ enter();
 6811 
 6812     Label L_loop;
 6813 
 6814     const Register result = c_rarg0;
 6815     const Register poly1 = c_rarg1;
 6816     const Register poly2 = c_rarg2;
 6817 
 6818     const Register dilithiumConsts = r10;
 6819     const Register len = r11;
 6820 
 6821     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6822     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6823     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6824     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6825 
 6826     __ lea(dilithiumConsts,
 6827              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6828 
 6829     // load constants q, qinv
 6830     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6831     // load constant rSquare into v29
 6832     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 6833 
 6834     __ mov(len, zr);
 6835     __ add(len, len, 1024);
 6836 
 6837     __ BIND(L_loop);
 6838 
 6839     // b load 32 (8x4S) next inputs from poly1
 6840     vs_ldpq_post(vs1, poly1);
 6841     // c load 32 (8x4S) next inputs from poly2
 6842     vs_ldpq_post(vs2, poly2);
 6843     // compute a = b montmul c
 6844     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6845     // compute a = rsquare montmul a
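          // n.b. assuming rSquare is R^2 mod q, this second montmul multiplies in
          // an extra factor of R, cancelling the R^-1 introduced by the montmul above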
 6846     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6847     // save a 32 (8x4S) results
 6848     vs_stpq_post(vs2, result);
 6849 
 6850     __ sub(len, len, 128);
 6851     __ cmp(len, (u1)128);
 6852     __ br(Assembler::GE, L_loop);
 6853 
 6854     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6855     __ mov(r0, zr); // return 0
 6856     __ ret(lr);
 6857 
 6858     return start;
 6859   }
 6860 
 6861   // Dilithium Montgomery multiply an array by a constant.
 6862   // A straightforward implementation of the method
 6863   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
 6864   // of the sun.security.provider.ML_DSA class
 6865   //
 6866   // coeffs (int[256]) = c_rarg0
 6867   // constant (int) = c_rarg1
 6868   address generate_dilithiumMontMulByConstant() {
 6869 
 6870     __ align(CodeEntryAlignment);
 6871     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 6872     StubCodeMark mark(this, stub_id);
 6873     address start = __ pc();
 6874     __ enter();
 6875 
 6876     Label L_loop;
 6877 
 6878     const Register coeffs = c_rarg0;
 6879     const Register constant = c_rarg1;
 6880 
 6881     const Register dilithiumConsts = r10;
 6882     const Register result = r11;
 6883     const Register len = r12;
 6884 
 6885     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6886     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6887     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6888     VSeq<8> vconst(29, 0);             // for montmul by constant
 6889 
 6890     // results track inputs
 6891     __ add(result, coeffs, 0);
 6892     __ lea(dilithiumConsts,
 6893              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6894 
 6895     // load constants q, qinv -- they do not get clobbered by first two loops
 6896     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6897     // copy caller supplied constant across vconst
 6898     __ dup(vconst[0], __ T4S, constant);
 6899     __ mov(len, zr);
 6900     __ add(len, len, 1024);
 6901 
 6902     __ BIND(L_loop);
 6903 
 6904     // load next 32 inputs
 6905     vs_ldpq_post(vs2, coeffs);
 6906     // mont mul by constant
 6907     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6908     // write next 32 results
 6909     vs_stpq_post(vs2, result);
 6910 
 6911     __ sub(len, len, 128);
 6912     __ cmp(len, (u1)128);
 6913     __ br(Assembler::GE, L_loop);
 6914 
 6915     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6916     __ mov(r0, zr); // return 0
 6917     __ ret(lr);
 6918 
 6919     return start;
 6920   }
 6921 
 6922   // Dilithium decompose poly.
 6923   // Implements the method
 6924   // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {}
 6925   // of the sun.security.provider.ML_DSA class
 6926   //
 6927   // input (int[256]) = c_rarg0
 6928   // lowPart (int[256]) = c_rarg1
 6929   // highPart (int[256]) = c_rarg2
 6930   // twoGamma2  (int) = c_rarg3
 6931   // multiplier (int) = c_rarg4
 6932   address generate_dilithiumDecomposePoly() {
 6933 
 6934     __ align(CodeEntryAlignment);
 6935     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 6936     StubCodeMark mark(this, stub_id);
 6937     address start = __ pc();
 6938     Label L_loop;
 6939 
 6940     const Register input = c_rarg0;
 6941     const Register lowPart = c_rarg1;
 6942     const Register highPart = c_rarg2;
 6943     const Register twoGamma2 = c_rarg3;
 6944     const Register multiplier = c_rarg4;
 6945 
 6946     const Register len = r9;
 6947     const Register dilithiumConsts = r10;
 6948     const Register tmp = r11;
 6949 
 6950     // 6 independent sets of 4x4s values
 6951     VSeq<4> vs1(0), vs2(4), vs3(8);
 6952     VSeq<4> vs4(12), vs5(16), vtmp(20);
 6953 
 6954     // 7 constants for cross-multiplying
 6955     VSeq<4> one(25, 0);
 6956     VSeq<4> qminus1(26, 0);
 6957     VSeq<4> g2(27, 0);
 6958     VSeq<4> twog2(28, 0);
 6959     VSeq<4> mult(29, 0);
 6960     VSeq<4> q(30, 0);
 6961     VSeq<4> qadd(31, 0);
 6962 
 6963     __ enter();
 6964 
 6965     __ lea(dilithiumConsts,
 6966              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6967 
 6968     // save callee-saved registers
 6969     __ stpd(v8, v9, __ pre(sp, -64));
 6970     __ stpd(v10, v11, Address(sp, 16));
 6971     __ stpd(v12, v13, Address(sp, 32));
 6972     __ stpd(v14, v15, Address(sp, 48));
 6973 
 6974     // populate constant registers
 6975     __ mov(tmp, zr);
 6976     __ add(tmp, tmp, 1);
 6977     __ dup(one[0], __ T4S, tmp); // 1
 6978     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 6979     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 6980     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 6981     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 6982     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 6983     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 6984 
 6985     __ mov(len, zr);
 6986     __ add(len, len, 1024);
 6987 
 6988     __ BIND(L_loop);
 6989 
 6990     // load next 4x4S inputs interleaved: rplus --> vs1
 6991     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 6992 
 6993     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 6994     vs_addv(vtmp, __ T4S, vs1, qadd);
 6995     vs_sshr(vtmp, __ T4S, vtmp, 23);
 6996     vs_mulv(vtmp, __ T4S, vtmp, q);
 6997     vs_subv(vs1, __ T4S, vs1, vtmp);
 6998 
 6999     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7000     vs_sshr(vtmp, __ T4S, vs1, 31);
 7001     vs_andr(vtmp, vtmp, q);
 7002     vs_addv(vs1, __ T4S, vs1, vtmp);
 7003 
 7004     // quotient --> vs2
 7005     // int quotient = (rplus * multiplier) >> 22;
 7006     vs_mulv(vtmp, __ T4S, vs1, mult);
 7007     vs_sshr(vs2, __ T4S, vtmp, 22);
 7008 
 7009     // r0 --> vs3
 7010     // int r0 = rplus - quotient * twoGamma2;
 7011     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7012     vs_subv(vs3, __ T4S, vs1, vtmp);
 7013 
 7014     // mask --> vs4
 7015     // int mask = (twoGamma2 - r0) >> 22;
 7016     vs_subv(vtmp, __ T4S, twog2, vs3);
 7017     vs_sshr(vs4, __ T4S, vtmp, 22);
 7018 
 7019     // r0 -= (mask & twoGamma2);
 7020     vs_andr(vtmp, vs4, twog2);
 7021     vs_subv(vs3, __ T4S, vs3, vtmp);
 7022 
 7023     //  quotient += (mask & 1);
 7024     vs_andr(vtmp, vs4, one);
 7025     vs_addv(vs2, __ T4S, vs2, vtmp);
 7026 
 7027     // mask = (twoGamma2 / 2 - r0) >> 31;
 7028     vs_subv(vtmp, __ T4S, g2, vs3);
 7029     vs_sshr(vs4, __ T4S, vtmp, 31);
 7030 
 7031     // r0 -= (mask & twoGamma2);
 7032     vs_andr(vtmp, vs4, twog2);
 7033     vs_subv(vs3, __ T4S, vs3, vtmp);
 7034 
 7035     // quotient += (mask & 1);
 7036     vs_andr(vtmp, vs4, one);
 7037     vs_addv(vs2, __ T4S, vs2, vtmp);
 7038 
 7039     // r1 --> vs5
 7040     // int r1 = rplus - r0 - (dilithium_q - 1);
 7041     vs_subv(vtmp, __ T4S, vs1, vs3);
 7042     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7043 
 7044     // r1 --> vs1 (overwriting rplus)
 7045     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7046     vs_negr(vtmp, __ T4S, vs5);
 7047     vs_orr(vtmp, vs5, vtmp);
 7048     vs_sshr(vs1, __ T4S, vtmp, 31);
 7049 
 7050     // r0 += ~r1;
 7051     vs_notr(vtmp, vs1);
 7052     vs_addv(vs3, __ T4S, vs3, vtmp);
 7053 
 7054     // r1 = r1 & quotient;
 7055     vs_andr(vs1, vs2, vs1);
 7056 
 7057     // store results interleaved
 7058     // lowPart[m] = r0;
 7059     // highPart[m] = r1;
 7060     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7061     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7062 
 7063     __ sub(len, len, 64);
 7064     __ cmp(len, (u1)64);
 7065     __ br(Assembler::GE, L_loop);
 7066 
 7067     // restore callee-saved vector registers
 7068     __ ldpd(v14, v15, Address(sp, 48));
 7069     __ ldpd(v12, v13, Address(sp, 32));
 7070     __ ldpd(v10, v11, Address(sp, 16));
 7071     __ ldpd(v8, v9, __ post(sp, 64));
 7072 
 7073     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7074     __ mov(r0, zr); // return 0
 7075     __ ret(lr);
 7076 
 7077     return start;
 7078   }
 7079 
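        // Keccak chi step applied to one row of five lanes: each a_i becomes
        // a_i ^ (~a_{i+1} & a_{i+2}) (indices mod 5), with bic forming the
        // and-not terms from the original lane values before any lane is updated.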
 7080   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7081              Register tmp0, Register tmp1, Register tmp2) {
 7082     __ bic(tmp0, a2, a1); // for a0
 7083     __ bic(tmp1, a3, a2); // for a1
 7084     __ bic(tmp2, a4, a3); // for a2
 7085     __ eor(a2, a2, tmp2);
 7086     __ bic(tmp2, a0, a4); // for a3
 7087     __ eor(a3, a3, tmp2);
 7088     __ bic(tmp2, a1, a0); // for a4
 7089     __ eor(a0, a0, tmp0);
 7090     __ eor(a1, a1, tmp1);
 7091     __ eor(a4, a4, tmp2);
 7092   }
 7093 
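        // One round of the Keccak-f[1600] permutation computed entirely in general
        // purpose registers: theta (column parities via eor3, D words via rax1),
        // the combined rho/pi rotation chain, chi (bcax5 applied to each row) and
        // iota (xor of the next round constant, loaded from rc, into lane a0).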
 7094   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7095                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7096                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7097                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7098                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7099                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7100                         Register tmp0, Register tmp1, Register tmp2) {
 7101     __ eor3(tmp1, a4, a9, a14);
 7102     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7103     __ eor3(tmp2, a1, a6, a11);
 7104     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7105     __ rax1(tmp2, tmp0, tmp1); // d0
 7106     {
 7107 
 7108       Register tmp3, tmp4;
 7109       if (can_use_fp && can_use_r18) {
 7110         tmp3 = rfp;
 7111         tmp4 = r18_tls;
 7112       } else {
 7113         tmp3 = a4;
 7114         tmp4 = a9;
 7115         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7116       }
 7117 
 7118       __ eor3(tmp3, a0, a5, a10);
 7119       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7120       __ eor(a0, a0, tmp2);
 7121       __ eor(a5, a5, tmp2);
 7122       __ eor(a10, a10, tmp2);
 7123       __ eor(a15, a15, tmp2);
 7124       __ eor(a20, a20, tmp2); // d0(tmp2)
 7125       __ eor3(tmp3, a2, a7, a12);
 7126       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7127       __ rax1(tmp3, tmp4, tmp2); // d1
 7128       __ eor(a1, a1, tmp3);
 7129       __ eor(a6, a6, tmp3);
 7130       __ eor(a11, a11, tmp3);
 7131       __ eor(a16, a16, tmp3);
 7132       __ eor(a21, a21, tmp3); // d1(tmp3)
 7133       __ rax1(tmp3, tmp2, tmp0); // d3
 7134       __ eor3(tmp2, a3, a8, a13);
 7135       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7136       __ eor(a3, a3, tmp3);
 7137       __ eor(a8, a8, tmp3);
 7138       __ eor(a13, a13, tmp3);
 7139       __ eor(a18, a18, tmp3);
 7140       __ eor(a23, a23, tmp3);
 7141       __ rax1(tmp2, tmp1, tmp0); // d2
 7142       __ eor(a2, a2, tmp2);
 7143       __ eor(a7, a7, tmp2);
 7144       __ eor(a12, a12, tmp2);
 7145       __ rax1(tmp0, tmp0, tmp4); // d4
 7146       if (!can_use_fp || !can_use_r18) {
 7147         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7148       }
 7149       __ eor(a17, a17, tmp2);
 7150       __ eor(a22, a22, tmp2);
 7151       __ eor(a4, a4, tmp0);
 7152       __ eor(a9, a9, tmp0);
 7153       __ eor(a14, a14, tmp0);
 7154       __ eor(a19, a19, tmp0);
 7155       __ eor(a24, a24, tmp0);
 7156     }
 7157 
 7158     __ rol(tmp0, a10, 3);
 7159     __ rol(a10, a1, 1);
 7160     __ rol(a1, a6, 44);
 7161     __ rol(a6, a9, 20);
 7162     __ rol(a9, a22, 61);
 7163     __ rol(a22, a14, 39);
 7164     __ rol(a14, a20, 18);
 7165     __ rol(a20, a2, 62);
 7166     __ rol(a2, a12, 43);
 7167     __ rol(a12, a13, 25);
 7168     __ rol(a13, a19, 8);
 7169     __ rol(a19, a23, 56);
 7170     __ rol(a23, a15, 41);
 7171     __ rol(a15, a4, 27);
 7172     __ rol(a4, a24, 14);
 7173     __ rol(a24, a21, 2);
 7174     __ rol(a21, a8, 55);
 7175     __ rol(a8, a16, 45);
 7176     __ rol(a16, a5, 36);
 7177     __ rol(a5, a3, 28);
 7178     __ rol(a3, a18, 21);
 7179     __ rol(a18, a17, 15);
 7180     __ rol(a17, a11, 10);
 7181     __ rol(a11, a7, 6);
 7182     __ mov(a7, tmp0);
 7183 
 7184     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7185     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7186     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7187     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7188     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7189 
 7190     __ ldr(tmp1, __ post(rc, 8));
 7191     __ eor(a0, a0, tmp1);
 7192 
 7193   }
 7194 
 7195   // Arguments:
 7196   //
 7197   // Inputs:
 7198   //   c_rarg0   - byte[]  source+offset
 7199   //   c_rarg1   - byte[]  SHA.state
 7200   //   c_rarg2   - int     block_size
 7201   //   c_rarg3   - int     offset
 7202   //   c_rarg4   - int     limit
 7203   //
 7204   address generate_sha3_implCompress_gpr(StubId stub_id) {
 7205     bool multi_block;
 7206     switch (stub_id) {
 7207     case StubId::stubgen_sha3_implCompress_id:
 7208       multi_block = false;
 7209       break;
 7210     case StubId::stubgen_sha3_implCompressMB_id:
 7211       multi_block = true;
 7212       break;
 7213     default:
 7214       ShouldNotReachHere();
 7215     }
 7216 
 7217     static const uint64_t round_consts[24] = {
 7218       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 7219       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 7220       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 7221       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 7222       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 7223       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 7224       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 7225       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 7226     };
 7227 
 7228     __ align(CodeEntryAlignment);
 7229     StubCodeMark mark(this, stub_id);
 7230     address start = __ pc();
 7231 
 7232     Register buf           = c_rarg0;
 7233     Register state         = c_rarg1;
 7234     Register block_size    = c_rarg2;
 7235     Register ofs           = c_rarg3;
 7236     Register limit         = c_rarg4;
 7237 
 7238     // use r3..r17, r19..r28 to keep a0..a24.
 7239     // a0..a24 are respective locals from SHA3.java
 7240     Register a0 = r25,
 7241              a1 = r26,
 7242              a2 = r27,
 7243              a3 = r3,
 7244              a4 = r4,
 7245              a5 = r5,
 7246              a6 = r6,
 7247              a7 = r7,
 7248              a8 = rscratch1, // r8
 7249              a9 = rscratch2, // r9
 7250              a10 = r10,
 7251              a11 = r11,
 7252              a12 = r12,
 7253              a13 = r13,
 7254              a14 = r14,
 7255              a15 = r15,
 7256              a16 = r16,
 7257              a17 = r17,
 7258              a18 = r28,
 7259              a19 = r19,
 7260              a20 = r20,
 7261              a21 = r21,
 7262              a22 = r22,
 7263              a23 = r23,
 7264              a24 = r24;
 7265 
 7266     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7267 
 7268     Label sha3_loop, rounds24_preloop, loop_body;
 7269     Label sha3_512_or_sha3_384, shake128;
 7270 
 7271     bool can_use_r18 = false;
 7272 #ifndef R18_RESERVED
 7273     can_use_r18 = true;
 7274 #endif
 7275     bool can_use_fp = !PreserveFramePointer;
 7276 
 7277     __ enter();
 7278 
 7279     // save almost all yet unsaved gpr registers on stack
 7280     __ str(block_size, __ pre(sp, -128));
 7281     if (multi_block) {
 7282       __ stpw(ofs, limit, Address(sp, 8));
 7283     }
 7284     // 8 bytes at sp+16 will be used to keep buf
 7285     __ stp(r19, r20, Address(sp, 32));
 7286     __ stp(r21, r22, Address(sp, 48));
 7287     __ stp(r23, r24, Address(sp, 64));
 7288     __ stp(r25, r26, Address(sp, 80));
 7289     __ stp(r27, r28, Address(sp, 96));
 7290     if (can_use_r18 && can_use_fp) {
 7291       __ stp(r18_tls, state, Address(sp, 112));
 7292     } else {
 7293       __ str(state, Address(sp, 112));
 7294     }
 7295 
 7296     // begin sha3 calculations: loading a0..a24 from state array
 7297     __ ldp(a0, a1, state);
 7298     __ ldp(a2, a3, Address(state, 16));
 7299     __ ldp(a4, a5, Address(state, 32));
 7300     __ ldp(a6, a7, Address(state, 48));
 7301     __ ldp(a8, a9, Address(state, 64));
 7302     __ ldp(a10, a11, Address(state, 80));
 7303     __ ldp(a12, a13, Address(state, 96));
 7304     __ ldp(a14, a15, Address(state, 112));
 7305     __ ldp(a16, a17, Address(state, 128));
 7306     __ ldp(a18, a19, Address(state, 144));
 7307     __ ldp(a20, a21, Address(state, 160));
 7308     __ ldp(a22, a23, Address(state, 176));
 7309     __ ldr(a24, Address(state, 192));
 7310 
 7311     __ BIND(sha3_loop);
 7312 
 7313     // load input
 7314     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7315     __ eor(a0, a0, tmp3);
 7316     __ eor(a1, a1, tmp2);
 7317     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7318     __ eor(a2, a2, tmp3);
 7319     __ eor(a3, a3, tmp2);
 7320     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7321     __ eor(a4, a4, tmp3);
 7322     __ eor(a5, a5, tmp2);
 7323     __ ldr(tmp3, __ post(buf, 8));
 7324     __ eor(a6, a6, tmp3);
 7325 
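          // Absorb the remaining input words according to the block size in bytes:
          //   72 -> SHA3-512, 104 -> SHA3-384, 136 -> SHA3-256/SHAKE256,
          //   144 -> SHA3-224, 168 -> SHAKE128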
 7326     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7327     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7328 
 7329     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7330     __ eor(a7, a7, tmp3);
 7331     __ eor(a8, a8, tmp2);
 7332     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7333     __ eor(a9, a9, tmp3);
 7334     __ eor(a10, a10, tmp2);
 7335     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7336     __ eor(a11, a11, tmp3);
 7337     __ eor(a12, a12, tmp2);
 7338     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7339     __ eor(a13, a13, tmp3);
 7340     __ eor(a14, a14, tmp2);
 7341     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7342     __ eor(a15, a15, tmp3);
 7343     __ eor(a16, a16, tmp2);
 7344 
 7345     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7346     __ andw(tmp2, block_size, 48);
 7347     __ cbzw(tmp2, rounds24_preloop);
 7348     __ tbnz(block_size, 5, shake128);
 7349     // block_size == 144, bit5 == 0, SHA3-224
 7350     __ ldr(tmp3, __ post(buf, 8));
 7351     __ eor(a17, a17, tmp3);
 7352     __ b(rounds24_preloop);
 7353 
 7354     __ BIND(shake128);
 7355     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7356     __ eor(a17, a17, tmp3);
 7357     __ eor(a18, a18, tmp2);
 7358     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7359     __ eor(a19, a19, tmp3);
 7360     __ eor(a20, a20, tmp2);
 7361     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7362 
 7363     __ BIND(sha3_512_or_sha3_384);
 7364     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7365     __ eor(a7, a7, tmp3);
 7366     __ eor(a8, a8, tmp2);
 7367     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7368 
 7369     // SHA3-384
 7370     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7371     __ eor(a9, a9, tmp3);
 7372     __ eor(a10, a10, tmp2);
 7373     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7374     __ eor(a11, a11, tmp3);
 7375     __ eor(a12, a12, tmp2);
 7376 
 7377     __ BIND(rounds24_preloop);
 7378     __ fmovs(v0, 24.0); // float loop counter,
 7379     __ fmovs(v1, 1.0);  // exact representation
 7380 
 7381     __ str(buf, Address(sp, 16));
 7382     __ lea(tmp3, ExternalAddress((address) round_consts));
 7383 
 7384     __ BIND(loop_body);
 7385     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7386                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7387                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7388                      tmp0, tmp1, tmp2);
 7389     __ fsubs(v0, v0, v1);
 7390     __ fcmps(v0, 0.0);
 7391     __ br(__ NE, loop_body);
 7392 
 7393     if (multi_block) {
 7394       __ ldrw(block_size, sp); // block_size
 7395       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7396       __ addw(tmp2, tmp2, block_size);
 7397       __ cmpw(tmp2, tmp1);
 7398       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7399       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7400       __ br(Assembler::LE, sha3_loop);
 7401       __ movw(c_rarg0, tmp2); // return offset
 7402     }
 7403     if (can_use_fp && can_use_r18) {
 7404       __ ldp(r18_tls, state, Address(sp, 112));
 7405     } else {
 7406       __ ldr(state, Address(sp, 112));
 7407     }
 7408     // save calculated sha3 state
 7409     __ stp(a0, a1, Address(state));
 7410     __ stp(a2, a3, Address(state, 16));
 7411     __ stp(a4, a5, Address(state, 32));
 7412     __ stp(a6, a7, Address(state, 48));
 7413     __ stp(a8, a9, Address(state, 64));
 7414     __ stp(a10, a11, Address(state, 80));
 7415     __ stp(a12, a13, Address(state, 96));
 7416     __ stp(a14, a15, Address(state, 112));
 7417     __ stp(a16, a17, Address(state, 128));
 7418     __ stp(a18, a19, Address(state, 144));
 7419     __ stp(a20, a21, Address(state, 160));
 7420     __ stp(a22, a23, Address(state, 176));
 7421     __ str(a24, Address(state, 192));
 7422 
 7423     // restore required registers from stack
 7424     __ ldp(r19, r20, Address(sp, 32));
 7425     __ ldp(r21, r22, Address(sp, 48));
 7426     __ ldp(r23, r24, Address(sp, 64));
 7427     __ ldp(r25, r26, Address(sp, 80));
 7428     __ ldp(r27, r28, Address(sp, 96));
 7429     if (can_use_fp && can_use_r18) {
 7430       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 7431     } // else no need to recalculate rfp, since it wasn't changed
 7432 
 7433     __ leave();
 7434 
 7435     __ ret(lr);
 7436 
 7437     return start;
 7438   }
 7439 
 7440   /**
 7441    *  Arguments:
 7442    *
 7443    * Inputs:
 7444    *   c_rarg0   - int crc
 7445    *   c_rarg1   - byte* buf
 7446    *   c_rarg2   - int length
 7447    *
 7448    * Output:
 7449    *       r0   - int crc result
 7450    */
 7451   address generate_updateBytesCRC32() {
 7452     assert(UseCRC32Intrinsics, "what are we doing here?");
 7453 
 7454     __ align(CodeEntryAlignment);
 7455     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 7456     StubCodeMark mark(this, stub_id);
 7457 
 7458     address start = __ pc();
 7459 
 7460     const Register crc   = c_rarg0;  // crc
 7461     const Register buf   = c_rarg1;  // source java byte array address
 7462     const Register len   = c_rarg2;  // length
 7463     const Register table0 = c_rarg3; // crc_table address
 7464     const Register table1 = c_rarg4;
 7465     const Register table2 = c_rarg5;
 7466     const Register table3 = c_rarg6;
 7467     const Register tmp3 = c_rarg7;
 7468 
 7469     BLOCK_COMMENT("Entry:");
 7470     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7471 
 7472     __ kernel_crc32(crc, buf, len,
 7473               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7474 
 7475     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7476     __ ret(lr);
 7477 
 7478     return start;
 7479   }
 7480 
 7481   /**
 7482    *  Arguments:
 7483    *
 7484    * Inputs:
 7485    *   c_rarg0   - int crc
 7486    *   c_rarg1   - byte* buf
 7487    *   c_rarg2   - int length
 7488    *   c_rarg3   - int* table
 7489    *
 7490    * Output:
 7491    *       r0   - int crc result
 7492    */
 7493   address generate_updateBytesCRC32C() {
 7494     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7495 
 7496     __ align(CodeEntryAlignment);
 7497     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 7498     StubCodeMark mark(this, stub_id);
 7499 
 7500     address start = __ pc();
 7501 
 7502     const Register crc   = c_rarg0;  // crc
 7503     const Register buf   = c_rarg1;  // source java byte array address
 7504     const Register len   = c_rarg2;  // length
 7505     const Register table0 = c_rarg3; // crc_table address
 7506     const Register table1 = c_rarg4;
 7507     const Register table2 = c_rarg5;
 7508     const Register table3 = c_rarg6;
 7509     const Register tmp3 = c_rarg7;
 7510 
 7511     BLOCK_COMMENT("Entry:");
 7512     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7513 
 7514     __ kernel_crc32c(crc, buf, len,
 7515               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7516 
 7517     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7518     __ ret(lr);
 7519 
 7520     return start;
 7521   }
 7522 
 7523   /**
 7524    *  Arguments:
 7525    *
 7526    *  Inputs:
 7527    *   c_rarg0   - int   adler
 7528    *   c_rarg1   - byte* buff
 7529    *   c_rarg2   - int   len
 7530    *
 7531    * Output:
 7532    *   c_rarg0   - int adler result
 7533    */
 7534   address generate_updateBytesAdler32() {
 7535     __ align(CodeEntryAlignment);
 7536     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 7537     StubCodeMark mark(this, stub_id);
 7538     address start = __ pc();
 7539 
 7540     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7541 
 7542     // Aliases
 7543     Register adler  = c_rarg0;
 7544     Register s1     = c_rarg0;
 7545     Register s2     = c_rarg3;
 7546     Register buff   = c_rarg1;
 7547     Register len    = c_rarg2;
 7548     Register nmax  = r4;
 7549     Register base  = r5;
 7550     Register count = r6;
 7551     Register temp0 = rscratch1;
 7552     Register temp1 = rscratch2;
 7553     FloatRegister vbytes = v0;
 7554     FloatRegister vs1acc = v1;
 7555     FloatRegister vs2acc = v2;
 7556     FloatRegister vtable = v3;
 7557 
 7558     // Max number of bytes we can process before having to take the mod
 7559     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7560     uint64_t BASE = 0xfff1;
 7561     uint64_t NMAX = 0x15B0;
 7562 
 7563     __ mov(base, BASE);
 7564     __ mov(nmax, NMAX);
 7565 
 7566     // Load accumulation coefficients for the upper 16 bits
 7567     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7568     __ ld1(vtable, __ T16B, Address(temp0));
 7569 
 7570     // s1 is initialized to the lower 16 bits of adler
 7571     // s2 is initialized to the upper 16 bits of adler
 7572     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7573     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7574 
 7575     // The pipelined loop needs at least 16 elements for 1 iteration
 7576     // It does check this, but it is more effective to skip to the cleanup loop
 7577     __ cmp(len, (u1)16);
 7578     __ br(Assembler::HS, L_nmax);
 7579     __ cbz(len, L_combine);
 7580 
 7581     __ bind(L_simple_by1_loop);
 7582     __ ldrb(temp0, Address(__ post(buff, 1)));
 7583     __ add(s1, s1, temp0);
 7584     __ add(s2, s2, s1);
 7585     __ subs(len, len, 1);
 7586     __ br(Assembler::HI, L_simple_by1_loop);
 7587 
 7588     // s1 = s1 % BASE
 7589     __ subs(temp0, s1, base);
 7590     __ csel(s1, temp0, s1, Assembler::HS);
 7591 
 7592     // s2 = s2 % BASE
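          // fold the high half using 2^16 mod BASE == 15:
          //   s2 = 15 * (s2 >> 16) + (s2 & 0xffff), then conditionally subtract BASE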
 7593     __ lsr(temp0, s2, 16);
 7594     __ lsl(temp1, temp0, 4);
 7595     __ sub(temp1, temp1, temp0);
 7596     __ add(s2, temp1, s2, ext::uxth);
 7597 
 7598     __ subs(temp0, s2, base);
 7599     __ csel(s2, temp0, s2, Assembler::HS);
 7600 
 7601     __ b(L_combine);
 7602 
 7603     __ bind(L_nmax);
 7604     __ subs(len, len, nmax);
 7605     __ sub(count, nmax, 16);
 7606     __ br(Assembler::LO, L_by16);
 7607 
 7608     __ bind(L_nmax_loop);
 7609 
 7610     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7611                                       vbytes, vs1acc, vs2acc, vtable);
 7612 
 7613     __ subs(count, count, 16);
 7614     __ br(Assembler::HS, L_nmax_loop);
 7615 
 7616     // s1 = s1 % BASE
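          // after an NMAX-byte block the sums can far exceed 16 bits, so fold the
          // high half twice using 2^16 mod BASE == 15 before the conditional subtract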
 7617     __ lsr(temp0, s1, 16);
 7618     __ lsl(temp1, temp0, 4);
 7619     __ sub(temp1, temp1, temp0);
 7620     __ add(temp1, temp1, s1, ext::uxth);
 7621 
 7622     __ lsr(temp0, temp1, 16);
 7623     __ lsl(s1, temp0, 4);
 7624     __ sub(s1, s1, temp0);
 7625     __ add(s1, s1, temp1, ext::uxth);
 7626 
 7627     __ subs(temp0, s1, base);
 7628     __ csel(s1, temp0, s1, Assembler::HS);
 7629 
 7630     // s2 = s2 % BASE
 7631     __ lsr(temp0, s2, 16);
 7632     __ lsl(temp1, temp0, 4);
 7633     __ sub(temp1, temp1, temp0);
 7634     __ add(temp1, temp1, s2, ext::uxth);
 7635 
 7636     __ lsr(temp0, temp1, 16);
 7637     __ lsl(s2, temp0, 4);
 7638     __ sub(s2, s2, temp0);
 7639     __ add(s2, s2, temp1, ext::uxth);
 7640 
 7641     __ subs(temp0, s2, base);
 7642     __ csel(s2, temp0, s2, Assembler::HS);
 7643 
 7644     __ subs(len, len, nmax);
 7645     __ sub(count, nmax, 16);
 7646     __ br(Assembler::HS, L_nmax_loop);
 7647 
 7648     __ bind(L_by16);
 7649     __ adds(len, len, count);
 7650     __ br(Assembler::LO, L_by1);
 7651 
 7652     __ bind(L_by16_loop);
 7653 
 7654     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7655                                       vbytes, vs1acc, vs2acc, vtable);
 7656 
 7657     __ subs(len, len, 16);
 7658     __ br(Assembler::HS, L_by16_loop);
 7659 
 7660     __ bind(L_by1);
 7661     __ adds(len, len, 15);
 7662     __ br(Assembler::LO, L_do_mod);
 7663 
 7664     __ bind(L_by1_loop);
 7665     __ ldrb(temp0, Address(__ post(buff, 1)));
 7666     __ add(s1, temp0, s1);
 7667     __ add(s2, s2, s1);
 7668     __ subs(len, len, 1);
 7669     __ br(Assembler::HS, L_by1_loop);
 7670 
 7671     __ bind(L_do_mod);
 7672     // s1 = s1 % BASE
 7673     __ lsr(temp0, s1, 16);
 7674     __ lsl(temp1, temp0, 4);
 7675     __ sub(temp1, temp1, temp0);
 7676     __ add(temp1, temp1, s1, ext::uxth);
 7677 
 7678     __ lsr(temp0, temp1, 16);
 7679     __ lsl(s1, temp0, 4);
 7680     __ sub(s1, s1, temp0);
 7681     __ add(s1, s1, temp1, ext::uxth);
 7682 
 7683     __ subs(temp0, s1, base);
 7684     __ csel(s1, temp0, s1, Assembler::HS);
 7685 
 7686     // s2 = s2 % BASE
 7687     __ lsr(temp0, s2, 16);
 7688     __ lsl(temp1, temp0, 4);
 7689     __ sub(temp1, temp1, temp0);
 7690     __ add(temp1, temp1, s2, ext::uxth);
 7691 
 7692     __ lsr(temp0, temp1, 16);
 7693     __ lsl(s2, temp0, 4);
 7694     __ sub(s2, s2, temp0);
 7695     __ add(s2, s2, temp1, ext::uxth);
 7696 
 7697     __ subs(temp0, s2, base);
 7698     __ csel(s2, temp0, s2, Assembler::HS);
 7699 
 7700     // Combine lower bits and higher bits
 7701     __ bind(L_combine);
 7702     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7703 
 7704     __ ret(lr);
 7705 
 7706     return start;
 7707   }
 7708 
 7709   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7710           Register temp0, Register temp1, FloatRegister vbytes,
 7711           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7712     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7713     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7714     // In non-vectorized code, we update s1 and s2 as:
 7715     //   s1 <- s1 + b1
 7716     //   s2 <- s2 + s1
 7717     //   s1 <- s1 + b2
 7718     //   s2 <- s2 + s1
 7719     //   ...
 7720     //   s1 <- s1 + b16
 7721     //   s2 <- s2 + s1
 7722     // Putting the above assignments together, we have:
 7723     //   s1_new = s1 + b1 + b2 + ... + b16
 7724     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7725     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7726     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
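          // vtable (loaded from _adler_table above) holds the per-byte weights
          // 16, 15, ..., 1 used in the dot product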
 7727     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7728 
 7729     // s2 = s2 + s1 * 16
 7730     __ add(s2, s2, s1, Assembler::LSL, 4);
 7731 
 7732     // vs1acc = b1 + b2 + b3 + ... + b16
 7733     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7734     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7735     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7736     __ uaddlv(vs1acc, __ T16B, vbytes);
 7737     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7738 
 7739     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7740     __ fmovd(temp0, vs1acc);
 7741     __ fmovd(temp1, vs2acc);
 7742     __ add(s1, s1, temp0);
 7743     __ add(s2, s2, temp1);
 7744   }
 7745 
 7746   /**
 7747    *  Arguments:
 7748    *
 7749    *  Input:
 7750    *    c_rarg0   - x address
 7751    *    c_rarg1   - x length
 7752    *    c_rarg2   - y address
 7753    *    c_rarg3   - y length
 7754    *    c_rarg4   - z address
 7755    */
 7756   address generate_multiplyToLen() {
 7757     __ align(CodeEntryAlignment);
 7758     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 7759     StubCodeMark mark(this, stub_id);
 7760 
 7761     address start = __ pc();
 7762
 7763     if (AOTCodeCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) {
 7764       return start;
 7765     }
 7766     const Register x     = r0;
 7767     const Register xlen  = r1;
 7768     const Register y     = r2;
 7769     const Register ylen  = r3;
 7770     const Register z     = r4;
 7771 
 7772     const Register tmp0  = r5;
 7773     const Register tmp1  = r10;
 7774     const Register tmp2  = r11;
 7775     const Register tmp3  = r12;
 7776     const Register tmp4  = r13;
 7777     const Register tmp5  = r14;
 7778     const Register tmp6  = r15;
 7779     const Register tmp7  = r16;
 7780 
 7781     BLOCK_COMMENT("Entry:");
 7782     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7783     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7784     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7785     __ ret(lr);
 7786 
 7787     AOTCodeCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start);
 7788     return start;
 7789   }
 7790 
 7791   address generate_squareToLen() {
 7792     // The squareToLen algorithm for sizes 1..127 described in the Java code works
 7793     // faster than multiply_to_len on some CPUs and slower on others, but
 7794     // multiply_to_len shows slightly better overall results.
 7795     __ align(CodeEntryAlignment);
 7796     StubId stub_id = StubId::stubgen_squareToLen_id;
 7797     StubCodeMark mark(this, stub_id);
 7798     address start = __ pc();
 7799 
 7800     if (AOTCodeCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) {
 7801       return start;
 7802     }
 7803     const Register x     = r0;
 7804     const Register xlen  = r1;
 7805     const Register z     = r2;
 7806     const Register y     = r4; // == x
 7807     const Register ylen  = r5; // == xlen
 7808 
 7809     const Register tmp0  = r3;
 7810     const Register tmp1  = r10;
 7811     const Register tmp2  = r11;
 7812     const Register tmp3  = r12;
 7813     const Register tmp4  = r13;
 7814     const Register tmp5  = r14;
 7815     const Register tmp6  = r15;
 7816     const Register tmp7  = r16;
 7817 
 7818     RegSet spilled_regs = RegSet::of(y, ylen);
 7819     BLOCK_COMMENT("Entry:");
 7820     __ enter();
 7821     __ push(spilled_regs, sp);
 7822     __ mov(y, x);
 7823     __ mov(ylen, xlen);
 7824     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7825     __ pop(spilled_regs, sp);
 7826     __ leave();
 7827     __ ret(lr);
 7828 
 7829     AOTCodeCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start);
 7830     return start;
 7831   }
 7832 
 7833   address generate_mulAdd() {
 7834     __ align(CodeEntryAlignment);
 7835     StubId stub_id = StubId::stubgen_mulAdd_id;
 7836     StubCodeMark mark(this, stub_id);
 7837 
 7838     address start = __ pc();
 7839 
 7840     if (AOTCodeCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) {
 7841       return start;
 7842     }
 7843     const Register out     = r0;
 7844     const Register in      = r1;
 7845     const Register offset  = r2;
 7846     const Register len     = r3;
 7847     const Register k       = r4;
 7848 
 7849     BLOCK_COMMENT("Entry:");
 7850     __ enter();
 7851     __ mul_add(out, in, offset, len, k);
 7852     __ leave();
 7853     __ ret(lr);
 7854 
 7855     AOTCodeCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start);
 7856     return start;
 7857   }
 7858 
 7859   // Arguments:
 7860   //
 7861   // Input:
 7862   //   c_rarg0   - newArr address
 7863   //   c_rarg1   - oldArr address
 7864   //   c_rarg2   - newIdx
 7865   //   c_rarg3   - shiftCount
 7866   //   c_rarg4   - numIter
 7867   //
 7868   address generate_bigIntegerRightShift() {
 7869     __ align(CodeEntryAlignment);
 7870     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 7871     StubCodeMark mark(this, stub_id);
 7872     address start = __ pc();
 7873 
 7874     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7875 
 7876     Register newArr        = c_rarg0;
 7877     Register oldArr        = c_rarg1;
 7878     Register newIdx        = c_rarg2;
 7879     Register shiftCount    = c_rarg3;
 7880     Register numIter       = c_rarg4;
 7881     Register idx           = numIter;
 7882 
 7883     Register newArrCur     = rscratch1;
 7884     Register shiftRevCount = rscratch2;
 7885     Register oldArrCur     = r13;
 7886     Register oldArrNext    = r14;
 7887 
 7888     FloatRegister oldElem0        = v0;
 7889     FloatRegister oldElem1        = v1;
 7890     FloatRegister newElem         = v2;
 7891     FloatRegister shiftVCount     = v3;
 7892     FloatRegister shiftVRevCount  = v4;
 7893 
 7894     __ cbz(idx, Exit);
 7895 
 7896     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7897 
 7898     // left shift count
 7899     __ movw(shiftRevCount, 32);
 7900     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7901 
 7902     // numIter too small to allow a 4-word SIMD loop; fall back to scalar code
 7903     __ cmp(numIter, (u1)4);
 7904     __ br(Assembler::LT, ShiftThree);
 7905 
 7906     __ dup(shiftVCount,    __ T4S, shiftCount);
 7907     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7908     __ negr(shiftVCount,   __ T4S, shiftVCount);
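          // n.b. ushl shifts each lane left by a signed per-lane amount, so the
          // negated shiftVCount makes the ushl below perform a logical right shift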
 7909 
 7910     __ BIND(ShiftSIMDLoop);
 7911 
 7912     // Calculate the load addresses
 7913     __ sub(idx, idx, 4);
 7914     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7915     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7916     __ add(oldArrCur,  oldArrNext, 4);
 7917 
 7918     // Load 4 words and process
 7919     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7920     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7921     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7922     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7923     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7924     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7925 
 7926     __ cmp(idx, (u1)4);
 7927     __ br(Assembler::LT, ShiftTwoLoop);
 7928     __ b(ShiftSIMDLoop);
 7929 
 7930     __ BIND(ShiftTwoLoop);
 7931     __ cbz(idx, Exit);
 7932     __ cmp(idx, (u1)1);
 7933     __ br(Assembler::EQ, ShiftOne);
 7934 
 7935     // Calculate the load addresses
 7936     __ sub(idx, idx, 2);
 7937     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7938     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7939     __ add(oldArrCur,  oldArrNext, 4);
 7940 
 7941     // Load 2 words and process
 7942     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 7943     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 7944     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 7945     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 7946     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 7947     __ st1(newElem,   __ T2S, Address(newArrCur));
 7948     __ b(ShiftTwoLoop);
 7949 
 7950     __ BIND(ShiftThree);
 7951     __ tbz(idx, 1, ShiftOne);
 7952     __ tbz(idx, 0, ShiftTwo);
 7953     __ ldrw(r10,  Address(oldArr, 12));
 7954     __ ldrw(r11,  Address(oldArr, 8));
 7955     __ lsrvw(r10, r10, shiftCount);
 7956     __ lslvw(r11, r11, shiftRevCount);
 7957     __ orrw(r12,  r10, r11);
 7958     __ strw(r12,  Address(newArr, 8));
 7959 
 7960     __ BIND(ShiftTwo);
 7961     __ ldrw(r10,  Address(oldArr, 8));
 7962     __ ldrw(r11,  Address(oldArr, 4));
 7963     __ lsrvw(r10, r10, shiftCount);
 7964     __ lslvw(r11, r11, shiftRevCount);
 7965     __ orrw(r12,  r10, r11);
 7966     __ strw(r12,  Address(newArr, 4));
 7967 
 7968     __ BIND(ShiftOne);
 7969     __ ldrw(r10,  Address(oldArr, 4));
 7970     __ ldrw(r11,  Address(oldArr));
 7971     __ lsrvw(r10, r10, shiftCount);
 7972     __ lslvw(r11, r11, shiftRevCount);
 7973     __ orrw(r12,  r10, r11);
 7974     __ strw(r12,  Address(newArr));
 7975 
 7976     __ BIND(Exit);
 7977     __ ret(lr);
 7978 
 7979     return start;
 7980   }
 7981 
 7982   // Arguments:
 7983   //
 7984   // Input:
 7985   //   c_rarg0   - newArr address
 7986   //   c_rarg1   - oldArr address
 7987   //   c_rarg2   - newIdx
 7988   //   c_rarg3   - shiftCount
 7989   //   c_rarg4   - numIter
 7990   //
 7991   address generate_bigIntegerLeftShift() {
 7992     __ align(CodeEntryAlignment);
 7993     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 7994     StubCodeMark mark(this, stub_id);
 7995     address start = __ pc();
 7996 
 7997     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7998 
 7999     Register newArr        = c_rarg0;
 8000     Register oldArr        = c_rarg1;
 8001     Register newIdx        = c_rarg2;
 8002     Register shiftCount    = c_rarg3;
 8003     Register numIter       = c_rarg4;
 8004 
 8005     Register shiftRevCount = rscratch1;
 8006     Register oldArrNext    = rscratch2;
 8007 
 8008     FloatRegister oldElem0        = v0;
 8009     FloatRegister oldElem1        = v1;
 8010     FloatRegister newElem         = v2;
 8011     FloatRegister shiftVCount     = v3;
 8012     FloatRegister shiftVRevCount  = v4;
 8013 
 8014     __ cbz(numIter, Exit);
 8015 
 8016     __ add(oldArrNext, oldArr, 4);
 8017     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8018 
 8019     // right shift count
 8020     __ movw(shiftRevCount, 32);
 8021     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8022 
 8023     // numIter too small to allow a 4-word SIMD loop; fall back to scalar code
 8024     __ cmp(numIter, (u1)4);
 8025     __ br(Assembler::LT, ShiftThree);
 8026 
 8027     __ dup(shiftVCount,     __ T4S, shiftCount);
 8028     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8029     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
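          // n.b. ushl shifts each lane left by a signed per-lane amount; negating
          // shiftVRevCount makes the ushl of oldElem1 below a logical right shift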
 8030 
 8031     __ BIND(ShiftSIMDLoop);
 8032 
 8033     // load 4 words and process
 8034     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8035     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8036     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8037     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8038     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8039     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8040     __ sub(numIter,   numIter, 4);
 8041 
 8042     __ cmp(numIter, (u1)4);
 8043     __ br(Assembler::LT, ShiftTwoLoop);
 8044     __ b(ShiftSIMDLoop);
 8045 
 8046     __ BIND(ShiftTwoLoop);
 8047     __ cbz(numIter, Exit);
 8048     __ cmp(numIter, (u1)1);
 8049     __ br(Assembler::EQ, ShiftOne);
 8050 
 8051     // load 2 words and process
 8052     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8053     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8054     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8055     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8056     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8057     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8058     __ sub(numIter,   numIter, 2);
 8059     __ b(ShiftTwoLoop);
 8060 
 8061     __ BIND(ShiftThree);
 8062     __ ldrw(r10,  __ post(oldArr, 4));
 8063     __ ldrw(r11,  __ post(oldArrNext, 4));
 8064     __ lslvw(r10, r10, shiftCount);
 8065     __ lsrvw(r11, r11, shiftRevCount);
 8066     __ orrw(r12,  r10, r11);
 8067     __ strw(r12,  __ post(newArr, 4));
 8068     __ tbz(numIter, 1, Exit);
 8069     __ tbz(numIter, 0, ShiftOne);
 8070 
 8071     __ BIND(ShiftTwo);
 8072     __ ldrw(r10,  __ post(oldArr, 4));
 8073     __ ldrw(r11,  __ post(oldArrNext, 4));
 8074     __ lslvw(r10, r10, shiftCount);
 8075     __ lsrvw(r11, r11, shiftRevCount);
 8076     __ orrw(r12,  r10, r11);
 8077     __ strw(r12,  __ post(newArr, 4));
 8078 
 8079     __ BIND(ShiftOne);
 8080     __ ldrw(r10,  Address(oldArr));
 8081     __ ldrw(r11,  Address(oldArrNext));
 8082     __ lslvw(r10, r10, shiftCount);
 8083     __ lsrvw(r11, r11, shiftRevCount);
 8084     __ orrw(r12,  r10, r11);
 8085     __ strw(r12,  Address(newArr));
 8086 
 8087     __ BIND(Exit);
 8088     __ ret(lr);
 8089 
 8090     return start;
 8091   }
 8092 
 8093   address generate_count_positives(address &count_positives_long) {
 8094     const u1 large_loop_size = 64;
 8095     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
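          // a byte is negative iff its top bit is set, so testing a 64-bit word
          // against this mask detects any negative byte among the 8 packed bytes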
 8096     int dcache_line = VM_Version::dcache_line_size();
 8097 
 8098     Register ary1 = r1, len = r2, result = r0;
 8099 
 8100     __ align(CodeEntryAlignment);
 8101 
 8102     StubId stub_id = StubId::stubgen_count_positives_id;
 8103     StubCodeMark mark(this, stub_id);
 8104 
 8105     address entry = __ pc();
 8106 
 8107     __ enter();
 8108     // precondition: a copy of len is already in result
 8109     // __ mov(result, len);
 8110 
 8111   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8112         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8113 
 8114   __ cmp(len, (u1)15);
 8115   __ br(Assembler::GT, LEN_OVER_15);
 8116   // The only case when execution falls into this code is when the pointer is near
 8117   // the end of a memory page and we have to avoid reading the next page
 8118   __ add(ary1, ary1, len);
 8119   __ subs(len, len, 8);
 8120   __ br(Assembler::GT, LEN_OVER_8);
 8121   __ ldr(rscratch2, Address(ary1, -8));
 8122   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8123   __ lsrv(rscratch2, rscratch2, rscratch1);
 8124   __ tst(rscratch2, UPPER_BIT_MASK);
 8125   __ csel(result, zr, result, Assembler::NE);
 8126   __ leave();
 8127   __ ret(lr);
 8128   __ bind(LEN_OVER_8);
 8129   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  8130   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
 8131   __ tst(rscratch2, UPPER_BIT_MASK);
 8132   __ br(Assembler::NE, RET_NO_POP);
 8133   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8134   __ lsrv(rscratch1, rscratch1, rscratch2);
 8135   __ tst(rscratch1, UPPER_BIT_MASK);
 8136   __ bind(RET_NO_POP);
 8137   __ csel(result, zr, result, Assembler::NE);
 8138   __ leave();
 8139   __ ret(lr);
 8140 
 8141   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8142   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8143 
 8144   count_positives_long = __ pc(); // 2nd entry point
 8145 
 8146   __ enter();
 8147 
 8148   __ bind(LEN_OVER_15);
 8149     __ push(spilled_regs, sp);
 8150     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8151     __ cbz(rscratch2, ALIGNED);
 8152     __ ldp(tmp6, tmp1, Address(ary1));
 8153     __ mov(tmp5, 16);
 8154     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 8155     __ add(ary1, ary1, rscratch1);
 8156     __ orr(tmp6, tmp6, tmp1);
 8157     __ tst(tmp6, UPPER_BIT_MASK);
 8158     __ br(Assembler::NE, RET_ADJUST);
 8159     __ sub(len, len, rscratch1);
 8160 
 8161   __ bind(ALIGNED);
 8162     __ cmp(len, large_loop_size);
 8163     __ br(Assembler::LT, CHECK_16);
  8164     // Perform a 16-byte load in the pre-loop as an early-return check for the
  8165     // case where an initially aligned large array has negative values in its
  8166     // first bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
  8167     // worst case, which is slower. Cases with negative bytes further ahead are
  8168     // barely affected; in fact they get faster due to the early loads, fewer
  8169     // instructions and fewer branches in LARGE_LOOP.
 8170     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8171     __ sub(len, len, 16);
 8172     __ orr(tmp6, tmp6, tmp1);
 8173     __ tst(tmp6, UPPER_BIT_MASK);
 8174     __ br(Assembler::NE, RET_ADJUST_16);
 8175     __ cmp(len, large_loop_size);
 8176     __ br(Assembler::LT, CHECK_16);
 8177 
 8178     if (SoftwarePrefetchHintDistance >= 0
 8179         && SoftwarePrefetchHintDistance >= dcache_line) {
 8180       // initial prefetch
 8181       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8182     }
 8183   __ bind(LARGE_LOOP);
 8184     if (SoftwarePrefetchHintDistance >= 0) {
 8185       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8186     }
  8187     // Issue the load instructions first, since that can save a few CPU/memory
  8188     // cycles. Also, instead of 4 "orr(...); tst(...); br(...)" triples (one per
  8189     // ldp), generate 7 * orr(...) + 1 tst(...) + 1 br(...), which needs fewer
  8190     // instructions and branches per iteration. The trade-off is that the early
  8191     // return is disabled, so all 64 bytes are loaded and checked every time.
 8192     __ ldp(tmp2, tmp3, Address(ary1));
 8193     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8194     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8195     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8196     __ add(ary1, ary1, large_loop_size);
 8197     __ sub(len, len, large_loop_size);
 8198     __ orr(tmp2, tmp2, tmp3);
 8199     __ orr(tmp4, tmp4, tmp5);
 8200     __ orr(rscratch1, rscratch1, rscratch2);
 8201     __ orr(tmp6, tmp6, tmp1);
 8202     __ orr(tmp2, tmp2, tmp4);
 8203     __ orr(rscratch1, rscratch1, tmp6);
 8204     __ orr(tmp2, tmp2, rscratch1);
 8205     __ tst(tmp2, UPPER_BIT_MASK);
 8206     __ br(Assembler::NE, RET_ADJUST_LONG);
 8207     __ cmp(len, large_loop_size);
 8208     __ br(Assembler::GE, LARGE_LOOP);
 8209 
 8210   __ bind(CHECK_16); // small 16-byte load pre-loop
 8211     __ cmp(len, (u1)16);
 8212     __ br(Assembler::LT, POST_LOOP16);
 8213 
 8214   __ bind(LOOP16); // small 16-byte load loop
 8215     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8216     __ sub(len, len, 16);
 8217     __ orr(tmp2, tmp2, tmp3);
 8218     __ tst(tmp2, UPPER_BIT_MASK);
 8219     __ br(Assembler::NE, RET_ADJUST_16);
 8220     __ cmp(len, (u1)16);
 8221     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8222 
 8223   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8224     __ cmp(len, (u1)8);
 8225     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8226     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8227     __ tst(tmp3, UPPER_BIT_MASK);
 8228     __ br(Assembler::NE, RET_ADJUST);
 8229     __ sub(len, len, 8);
 8230 
 8231   __ bind(POST_LOOP16_LOAD_TAIL);
 8232     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8233     __ ldr(tmp1, Address(ary1));
 8234     __ mov(tmp2, 64);
 8235     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8236     __ lslv(tmp1, tmp1, tmp4);
 8237     __ tst(tmp1, UPPER_BIT_MASK);
 8238     __ br(Assembler::NE, RET_ADJUST);
 8239     // Fallthrough
 8240 
 8241   __ bind(RET_LEN);
 8242     __ pop(spilled_regs, sp);
 8243     __ leave();
 8244     __ ret(lr);
 8245 
  8246     // The difference (result - len) is the count of bytes that are
  8247     // guaranteed to be positive.
 8248 
 8249   __ bind(RET_ADJUST_LONG);
 8250     __ add(len, len, (u1)(large_loop_size - 16));
 8251   __ bind(RET_ADJUST_16);
 8252     __ add(len, len, 16);
 8253   __ bind(RET_ADJUST);
 8254     __ pop(spilled_regs, sp);
 8255     __ leave();
 8256     __ sub(result, result, len);
 8257     __ ret(lr);
 8258 
 8259     return entry;
 8260   }
 8261 
 8262   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8263         bool usePrefetch, Label &NOT_EQUAL) {
 8264     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8265         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8266         tmp7 = r12, tmp8 = r13;
 8267     Label LOOP;
 8268 
 8269     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8270     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8271     __ bind(LOOP);
 8272     if (usePrefetch) {
 8273       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8274       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8275     }
 8276     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8277     __ eor(tmp1, tmp1, tmp2);
 8278     __ eor(tmp3, tmp3, tmp4);
 8279     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8280     __ orr(tmp1, tmp1, tmp3);
 8281     __ cbnz(tmp1, NOT_EQUAL);
 8282     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8283     __ eor(tmp5, tmp5, tmp6);
 8284     __ eor(tmp7, tmp7, tmp8);
 8285     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8286     __ orr(tmp5, tmp5, tmp7);
 8287     __ cbnz(tmp5, NOT_EQUAL);
 8288     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8289     __ eor(tmp1, tmp1, tmp2);
 8290     __ eor(tmp3, tmp3, tmp4);
 8291     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8292     __ orr(tmp1, tmp1, tmp3);
 8293     __ cbnz(tmp1, NOT_EQUAL);
 8294     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8295     __ eor(tmp5, tmp5, tmp6);
 8296     __ sub(cnt1, cnt1, 8 * wordSize);
 8297     __ eor(tmp7, tmp7, tmp8);
 8298     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
  8299     // tmp6 is not used. MacroAssembler::subs is used here (rather than
  8300     // cmp) because subs allows an unrestricted range of immediate operands.
 8301     __ subs(tmp6, cnt1, loopThreshold);
 8302     __ orr(tmp5, tmp5, tmp7);
 8303     __ cbnz(tmp5, NOT_EQUAL);
 8304     __ br(__ GE, LOOP);
 8305     // post-loop
 8306     __ eor(tmp1, tmp1, tmp2);
 8307     __ eor(tmp3, tmp3, tmp4);
 8308     __ orr(tmp1, tmp1, tmp3);
 8309     __ sub(cnt1, cnt1, 2 * wordSize);
 8310     __ cbnz(tmp1, NOT_EQUAL);
 8311   }
 8312 
 8313   void generate_large_array_equals_loop_simd(int loopThreshold,
 8314         bool usePrefetch, Label &NOT_EQUAL) {
 8315     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8316         tmp2 = rscratch2;
 8317     Label LOOP;
 8318 
 8319     __ bind(LOOP);
 8320     if (usePrefetch) {
 8321       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8322       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8323     }
 8324     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8325     __ sub(cnt1, cnt1, 8 * wordSize);
 8326     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 8327     __ subs(tmp1, cnt1, loopThreshold);
 8328     __ eor(v0, __ T16B, v0, v4);
 8329     __ eor(v1, __ T16B, v1, v5);
 8330     __ eor(v2, __ T16B, v2, v6);
 8331     __ eor(v3, __ T16B, v3, v7);
 8332     __ orr(v0, __ T16B, v0, v1);
 8333     __ orr(v1, __ T16B, v2, v3);
 8334     __ orr(v0, __ T16B, v0, v1);
 8335     __ umov(tmp1, v0, __ D, 0);
 8336     __ umov(tmp2, v0, __ D, 1);
 8337     __ orr(tmp1, tmp1, tmp2);
 8338     __ cbnz(tmp1, NOT_EQUAL);
 8339     __ br(__ GE, LOOP);
 8340   }
 8341 
 8342   // a1 = r1 - array1 address
 8343   // a2 = r2 - array2 address
 8344   // result = r0 - return value. Already contains "false"
 8345   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 8346   // r3-r5 are reserved temporary registers
 8347   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
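  //
  // Semantically the stub is a word-wise equality check. A minimal C sketch
  // (illustrative only; it ignores the alignment, prefetch and overlapping-tail
  // handling below and assumes the remaining length is a multiple of 8 bytes):
  //
  //   static bool large_array_equals_ref(const uint64_t* a1, const uint64_t* a2,
  //                                      size_t len_bytes) {
  //     for (size_t i = 0; i < len_bytes / 8; i++) {
  //       if (a1[i] != a2[i]) return false;
  //     }
  //     return true;
  //   }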
 8348   address generate_large_array_equals() {
 8349     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8350         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8351         tmp7 = r12, tmp8 = r13;
 8352     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 8353         SMALL_LOOP, POST_LOOP;
 8354     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
  8355     // threshold that guarantees at least 32 of the prefetched bytes are actually used
 8356     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 8357     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 8358     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 8359     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 8360         tmp5, tmp6, tmp7, tmp8);
 8361 
 8362     __ align(CodeEntryAlignment);
 8363 
 8364     StubId stub_id = StubId::stubgen_large_array_equals_id;
 8365     StubCodeMark mark(this, stub_id);
 8366 
 8367     address entry = __ pc();
 8368     __ enter();
 8369     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 8370     // also advance pointers to use post-increment instead of pre-increment
 8371     __ add(a1, a1, wordSize);
 8372     __ add(a2, a2, wordSize);
 8373     if (AvoidUnalignedAccesses) {
  8374       // Both implementations (SIMD/non-SIMD) use relatively wide load
  8375       // instructions (ld1/ldp), which carry a large penalty (up to 2x execution
  8376       // time) on some CPUs when the address is not at least 16-byte aligned.
  8377       // Arrays are currently 8-byte aligned, so if needed we do one extra 8-byte
  8378       // load for the first array to make its address 16-byte aligned.
 8379       Label ALIGNED16;
 8380       __ tbz(a1, 3, ALIGNED16);
 8381       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8382       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8383       __ sub(cnt1, cnt1, wordSize);
 8384       __ eor(tmp1, tmp1, tmp2);
 8385       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 8386       __ bind(ALIGNED16);
 8387     }
 8388     if (UseSIMDForArrayEquals) {
 8389       if (SoftwarePrefetchHintDistance >= 0) {
 8390         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8391         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8392         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 8393             /* prfm = */ true, NOT_EQUAL);
 8394         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8395         __ br(__ LT, TAIL);
 8396       }
 8397       __ bind(NO_PREFETCH_LARGE_LOOP);
 8398       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 8399           /* prfm = */ false, NOT_EQUAL);
 8400     } else {
 8401       __ push(spilled_regs, sp);
 8402       if (SoftwarePrefetchHintDistance >= 0) {
 8403         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8404         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8405         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 8406             /* prfm = */ true, NOT_EQUAL);
 8407         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8408         __ br(__ LT, TAIL);
 8409       }
 8410       __ bind(NO_PREFETCH_LARGE_LOOP);
 8411       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 8412           /* prfm = */ false, NOT_EQUAL);
 8413     }
 8414     __ bind(TAIL);
 8415       __ cbz(cnt1, EQUAL);
 8416       __ subs(cnt1, cnt1, wordSize);
 8417       __ br(__ LE, POST_LOOP);
 8418     __ bind(SMALL_LOOP);
 8419       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8420       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8421       __ subs(cnt1, cnt1, wordSize);
 8422       __ eor(tmp1, tmp1, tmp2);
 8423       __ cbnz(tmp1, NOT_EQUAL);
 8424       __ br(__ GT, SMALL_LOOP);
 8425     __ bind(POST_LOOP);
 8426       __ ldr(tmp1, Address(a1, cnt1));
 8427       __ ldr(tmp2, Address(a2, cnt1));
 8428       __ eor(tmp1, tmp1, tmp2);
 8429       __ cbnz(tmp1, NOT_EQUAL);
 8430     __ bind(EQUAL);
 8431       __ mov(result, true);
 8432     __ bind(NOT_EQUAL);
 8433       if (!UseSIMDForArrayEquals) {
 8434         __ pop(spilled_regs, sp);
 8435       }
 8436     __ bind(NOT_EQUAL_NO_POP);
 8437     __ leave();
 8438     __ ret(lr);
 8439     return entry;
 8440   }
 8441 
 8442   // result = r0 - return value. Contains initial hashcode value on entry.
 8443   // ary = r1 - array address
 8444   // cnt = r2 - elements count
 8445   // Clobbers: v0-v13, rscratch1, rscratch2
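  //
  // The stub computes the usual polynomial hash, vectorized over four lanes with
  // precomputed powers of 31. Scalar reference as a C sketch (illustrative only;
  // subword elements are widened first -- sign-extended for byte/short,
  // zero-extended for boolean/char):
  //
  //   static int32_t large_arrays_hashcode_ref(int32_t result,
  //                                            const int32_t* elems, int cnt) {
  //     for (int i = 0; i < cnt; i++) {
  //       result = 31 * result + elems[i];
  //     }
  //     return result;
  //   }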
 8446   address generate_large_arrays_hashcode(BasicType eltype) {
 8447     const Register result = r0, ary = r1, cnt = r2;
 8448     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 8449     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 8450     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 8451     const FloatRegister vpowm = v13;
 8452 
 8453     ARRAYS_HASHCODE_REGISTERS;
 8454 
 8455     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 8456 
 8457     unsigned int vf; // vectorization factor
 8458     bool multiply_by_halves;
 8459     Assembler::SIMD_Arrangement load_arrangement;
 8460     switch (eltype) {
 8461     case T_BOOLEAN:
 8462     case T_BYTE:
 8463       load_arrangement = Assembler::T8B;
 8464       multiply_by_halves = true;
 8465       vf = 8;
 8466       break;
 8467     case T_CHAR:
 8468     case T_SHORT:
 8469       load_arrangement = Assembler::T8H;
 8470       multiply_by_halves = true;
 8471       vf = 8;
 8472       break;
 8473     case T_INT:
 8474       load_arrangement = Assembler::T4S;
 8475       multiply_by_halves = false;
 8476       vf = 4;
 8477       break;
 8478     default:
 8479       ShouldNotReachHere();
 8480     }
 8481 
 8482     // Unroll factor
 8483     const unsigned uf = 4;
 8484 
 8485     // Effective vectorization factor
 8486     const unsigned evf = vf * uf;
 8487 
 8488     __ align(CodeEntryAlignment);
 8489 
 8490     StubId stub_id;
 8491     switch (eltype) {
 8492     case T_BOOLEAN:
 8493       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 8494       break;
 8495     case T_BYTE:
 8496       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 8497       break;
 8498     case T_CHAR:
 8499       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 8500       break;
 8501     case T_SHORT:
 8502       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 8503       break;
 8504     case T_INT:
 8505       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 8506       break;
 8507     default:
 8508       stub_id = StubId::NO_STUBID;
 8509       ShouldNotReachHere();
 8510     };
 8511 
 8512     StubCodeMark mark(this, stub_id);
 8513 
 8514     address entry = __ pc();
 8515     __ enter();
 8516 
  8517     // Put the 0th to 3rd powers of 31 together into a single SIMD register. The register is
  8518     // used in the SMALL and LARGE loops' epilogues. The initialization is hoisted here and the
  8519     // register's value must not change throughout either loop.
 8520     __ movw(rscratch1, intpow(31U, 3));
 8521     __ mov(vpow, Assembler::S, 0, rscratch1);
 8522     __ movw(rscratch1, intpow(31U, 2));
 8523     __ mov(vpow, Assembler::S, 1, rscratch1);
 8524     __ movw(rscratch1, intpow(31U, 1));
 8525     __ mov(vpow, Assembler::S, 2, rscratch1);
 8526     __ movw(rscratch1, intpow(31U, 0));
 8527     __ mov(vpow, Assembler::S, 3, rscratch1);
 8528 
 8529     __ mov(vmul0, Assembler::T16B, 0);
 8530     __ mov(vmul0, Assembler::S, 3, result);
 8531 
 8532     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8533     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8534 
 8535     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8536     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8537 
 8538     // SMALL LOOP
 8539     __ bind(SMALL_LOOP);
 8540 
 8541     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8542     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8543     __ subsw(rscratch2, rscratch2, vf);
 8544 
 8545     if (load_arrangement == Assembler::T8B) {
 8546       // Extend 8B to 8H to be able to use vector multiply
 8547       // instructions
 8548       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8549       if (is_signed_subword_type(eltype)) {
 8550         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8551       } else {
 8552         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8553       }
 8554     }
 8555 
 8556     switch (load_arrangement) {
 8557     case Assembler::T4S:
 8558       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8559       break;
 8560     case Assembler::T8B:
 8561     case Assembler::T8H:
 8562       assert(is_subword_type(eltype), "subword type expected");
 8563       if (is_signed_subword_type(eltype)) {
 8564         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8565       } else {
 8566         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8567       }
 8568       break;
 8569     default:
 8570       __ should_not_reach_here();
 8571     }
 8572 
 8573     // Process the upper half of a vector
 8574     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8575       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8576       if (is_signed_subword_type(eltype)) {
 8577         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8578       } else {
 8579         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8580       }
 8581     }
 8582 
 8583     __ br(Assembler::HI, SMALL_LOOP);
 8584 
  8585     // SMALL LOOP'S EPILOGUE
 8586     __ lsr(rscratch2, cnt, exact_log2(evf));
 8587     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8588 
 8589     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8590     __ addv(vmul0, Assembler::T4S, vmul0);
 8591     __ umov(result, vmul0, Assembler::S, 0);
 8592 
 8593     // TAIL
 8594     __ bind(TAIL);
 8595 
  8596     // The andr computes cnt % vf. The subtract, shifted by the size of one load + madd pair,
  8597     // skips the first vf - 1 - (cnt % vf) pairs so that only cnt % vf load + madd pairs execute.
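    // In effect (illustrative pseudocode): branch to
    //   BR_BASE - (cnt % vf) * bytes_per_pair
    // where bytes_per_pair is 16 on Cortex-A53 (shift 4) and 8 otherwise (shift 3).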
 8598     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 8599     __ andr(rscratch2, cnt, vf - 1);
 8600     __ bind(TAIL_SHORTCUT);
 8601     __ adr(rscratch1, BR_BASE);
  8602     // For Cortex-A53 the shift is 4 because the 2 nops make each pair 4 instructions (16 bytes).
 8603     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 8604     __ movw(rscratch2, 0x1f);
 8605     __ br(rscratch1);
 8606 
 8607     for (size_t i = 0; i < vf - 1; ++i) {
 8608       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8609                                    eltype);
 8610       __ maddw(result, result, rscratch2, rscratch1);
 8611       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 8612       // Generate 2nd nop to have 4 instructions per iteration.
 8613       if (VM_Version::supports_a53mac()) {
 8614         __ nop();
 8615       }
 8616     }
 8617     __ bind(BR_BASE);
 8618 
 8619     __ leave();
 8620     __ ret(lr);
 8621 
 8622     // LARGE LOOP
 8623     __ bind(LARGE_LOOP_PREHEADER);
 8624 
 8625     __ lsr(rscratch2, cnt, exact_log2(evf));
 8626 
 8627     if (multiply_by_halves) {
 8628       // 31^4 - multiplier between lower and upper parts of a register
 8629       __ movw(rscratch1, intpow(31U, vf / 2));
 8630       __ mov(vpowm, Assembler::S, 1, rscratch1);
  8631       // 31^28 - the remaining part of the iteration multiplier, 28 = 32 - 4
 8632       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8633       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8634     } else {
 8635       // 31^16
 8636       __ movw(rscratch1, intpow(31U, evf));
 8637       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8638     }
 8639 
 8640     __ mov(vmul3, Assembler::T16B, 0);
 8641     __ mov(vmul2, Assembler::T16B, 0);
 8642     __ mov(vmul1, Assembler::T16B, 0);
 8643 
 8644     __ bind(LARGE_LOOP);
 8645 
 8646     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8647     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8648     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8649     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8650 
 8651     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8652            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8653 
 8654     if (load_arrangement == Assembler::T8B) {
 8655       // Extend 8B to 8H to be able to use vector multiply
 8656       // instructions
 8657       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8658       if (is_signed_subword_type(eltype)) {
 8659         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8660         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8661         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8662         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8663       } else {
 8664         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8665         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8666         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8667         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8668       }
 8669     }
 8670 
 8671     switch (load_arrangement) {
 8672     case Assembler::T4S:
 8673       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8674       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8675       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8676       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8677       break;
 8678     case Assembler::T8B:
 8679     case Assembler::T8H:
 8680       assert(is_subword_type(eltype), "subword type expected");
 8681       if (is_signed_subword_type(eltype)) {
 8682         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8683         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8684         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8685         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8686       } else {
 8687         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8688         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8689         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8690         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8691       }
 8692       break;
 8693     default:
 8694       __ should_not_reach_here();
 8695     }
 8696 
 8697     // Process the upper half of a vector
 8698     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8699       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8700       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8701       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8702       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8703       if (is_signed_subword_type(eltype)) {
 8704         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8705         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8706         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8707         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8708       } else {
 8709         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8710         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8711         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8712         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8713       }
 8714     }
 8715 
 8716     __ subsw(rscratch2, rscratch2, 1);
 8717     __ br(Assembler::HI, LARGE_LOOP);
 8718 
 8719     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8720     __ addv(vmul3, Assembler::T4S, vmul3);
 8721     __ umov(result, vmul3, Assembler::S, 0);
 8722 
 8723     __ mov(rscratch2, intpow(31U, vf));
 8724 
 8725     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8726     __ addv(vmul2, Assembler::T4S, vmul2);
 8727     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8728     __ maddw(result, result, rscratch2, rscratch1);
 8729 
 8730     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8731     __ addv(vmul1, Assembler::T4S, vmul1);
 8732     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8733     __ maddw(result, result, rscratch2, rscratch1);
 8734 
 8735     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8736     __ addv(vmul0, Assembler::T4S, vmul0);
 8737     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8738     __ maddw(result, result, rscratch2, rscratch1);
 8739 
 8740     __ andr(rscratch2, cnt, vf - 1);
 8741     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8742 
 8743     __ leave();
 8744     __ ret(lr);
 8745 
 8746     return entry;
 8747   }
 8748 
 8749   address generate_dsin_dcos(bool isCos) {
 8750     __ align(CodeEntryAlignment);
 8751     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 8752     StubCodeMark mark(this, stub_id);
 8753     address start = __ pc();
 8754     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8755         (address)StubRoutines::aarch64::_two_over_pi,
 8756         (address)StubRoutines::aarch64::_pio2,
 8757         (address)StubRoutines::aarch64::_dsin_coef,
 8758         (address)StubRoutines::aarch64::_dcos_coef);
 8759     return start;
 8760   }
 8761 
  8762   // Code for comparing 16 characters of two strings, one Latin1-encoded and one UTF-16-encoded
 8763   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8764       Label &DIFF2) {
 8765     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8766     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8767 
 8768     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8769     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8770     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8771     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 8772 
 8773     __ fmovd(tmpL, vtmp3);
 8774     __ eor(rscratch2, tmp3, tmpL);
 8775     __ cbnz(rscratch2, DIFF2);
 8776 
 8777     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8778     __ umov(tmpL, vtmp3, __ D, 1);
 8779     __ eor(rscratch2, tmpU, tmpL);
 8780     __ cbnz(rscratch2, DIFF1);
 8781 
 8782     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8783     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8784     __ fmovd(tmpL, vtmp);
 8785     __ eor(rscratch2, tmp3, tmpL);
 8786     __ cbnz(rscratch2, DIFF2);
 8787 
 8788     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8789     __ umov(tmpL, vtmp, __ D, 1);
 8790     __ eor(rscratch2, tmpU, tmpL);
 8791     __ cbnz(rscratch2, DIFF1);
 8792   }
 8793 
 8794   // r0  = result
 8795   // r1  = str1
 8796   // r2  = cnt1
 8797   // r3  = str2
 8798   // r4  = cnt2
 8799   // r10 = tmp1
 8800   // r11 = tmp2
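  //
  // Scalar sketch of the LU comparison (illustrative only): each Latin1 byte is
  // zero-extended to a UTF-16 char and compared; the result is the difference of
  // the first differing pair of characters. For the UL variant the roles of str1
  // and str2 are swapped. Handling of the remaining length is outside this sketch.
  //
  //   static int compare_LU_ref(const uint8_t* latin1, const uint16_t* utf16,
  //                             size_t cnt) {
  //     for (size_t i = 0; i < cnt; i++) {
  //       uint16_t l = latin1[i];
  //       uint16_t u = utf16[i];
  //       if (l != u) return (int)l - (int)u;
  //     }
  //     return 0;
  //   }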
 8801   address generate_compare_long_string_different_encoding(bool isLU) {
 8802     __ align(CodeEntryAlignment);
 8803     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 8804     StubCodeMark mark(this, stub_id);
 8805     address entry = __ pc();
 8806     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8807         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8808         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8809     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8810         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8811     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8812     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8813 
 8814     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8815 
 8816     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
 8817     // cnt2 == amount of characters left to compare
  8818     // Check the first 4 characters, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
 8819     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8820     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8821     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8822     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 8823     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
 8824     __ eor(rscratch2, tmp1, tmp2);
 8825     __ mov(rscratch1, tmp2);
 8826     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8827     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8828              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8829     __ push(spilled_regs, sp);
 8830     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8831     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8832 
 8833     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8834 
 8835     if (SoftwarePrefetchHintDistance >= 0) {
 8836       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8837       __ br(__ LT, NO_PREFETCH);
 8838       __ bind(LARGE_LOOP_PREFETCH);
 8839         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8840         __ mov(tmp4, 2);
 8841         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8842         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8843           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8844           __ subs(tmp4, tmp4, 1);
 8845           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8846           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8847           __ mov(tmp4, 2);
 8848         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8849           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8850           __ subs(tmp4, tmp4, 1);
 8851           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8852           __ sub(cnt2, cnt2, 64);
 8853           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8854           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8855     }
 8856     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8857     __ bind(NO_PREFETCH);
 8858     __ subs(cnt2, cnt2, 16);
 8859     __ br(__ LT, TAIL);
 8860     __ align(OptoLoopAlignment);
 8861     __ bind(SMALL_LOOP); // smaller loop
 8862       __ subs(cnt2, cnt2, 16);
 8863       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8864       __ br(__ GE, SMALL_LOOP);
 8865       __ cmn(cnt2, (u1)16);
 8866       __ br(__ EQ, LOAD_LAST);
 8867     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8868       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8869       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8870       __ ldr(tmp3, Address(cnt1, -8));
 8871       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8872       __ b(LOAD_LAST);
 8873     __ bind(DIFF2);
 8874       __ mov(tmpU, tmp3);
 8875     __ bind(DIFF1);
 8876       __ pop(spilled_regs, sp);
 8877       __ b(CALCULATE_DIFFERENCE);
 8878     __ bind(LOAD_LAST);
  8879       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU,
  8880       // so there is no need to load them again.
 8881       __ mov(tmpU, tmp3);
 8882       __ pop(spilled_regs, sp);
 8883 
  8884       // tmp2 now points to the last 4 Latin1 characters
 8885       __ ldrs(vtmp, Address(tmp2));
 8886       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8887       __ fmovd(tmpL, vtmp);
 8888 
 8889       __ eor(rscratch2, tmpU, tmpL);
 8890       __ cbz(rscratch2, DONE);
 8891 
 8892     // Find the first different characters in the longwords and
 8893     // compute their difference.
 8894     __ bind(CALCULATE_DIFFERENCE);
 8895       __ rev(rscratch2, rscratch2);
 8896       __ clz(rscratch2, rscratch2);
 8897       __ andr(rscratch2, rscratch2, -16);
 8898       __ lsrv(tmp1, tmp1, rscratch2);
 8899       __ uxthw(tmp1, tmp1);
 8900       __ lsrv(rscratch1, rscratch1, rscratch2);
 8901       __ uxthw(rscratch1, rscratch1);
 8902       __ subw(result, tmp1, rscratch1);
 8903     __ bind(DONE);
 8904       __ ret(lr);
 8905     return entry;
 8906   }
 8907 
 8908   // r0 = input (float16)
 8909   // v0 = result (float)
 8910   // v1 = temporary float register
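  //
  // The binary16 -> binary32 widening is exact. A C sketch of the semantics
  // (illustrative only; the stub itself delegates to the MacroAssembler helper,
  // and NaN payloads are not modelled here):
  //
  //   static float hf2f_ref(uint16_t h) {          // needs <math.h>
  //     int sign = (h >> 15) & 0x1;                // 1 sign bit
  //     int exp  = (h >> 10) & 0x1f;               // 5 exponent bits, bias 15
  //     int mant = h & 0x3ff;                      // 10 mantissa bits
  //     float mag;
  //     if (exp == 0)       mag = ldexpf((float)mant, -24);          // zero/subnormal
  //     else if (exp == 31) mag = (mant == 0) ? INFINITY : NAN;      // inf/NaN
  //     else                mag = ldexpf((float)(mant | 0x400), exp - 25);
  //     return sign ? -mag : mag;
  //   }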
 8911   address generate_float16ToFloat() {
 8912     __ align(CodeEntryAlignment);
 8913     StubId stub_id = StubId::stubgen_hf2f_id;
 8914     StubCodeMark mark(this, stub_id);
 8915     address entry = __ pc();
 8916     BLOCK_COMMENT("Entry:");
 8917     __ flt16_to_flt(v0, r0, v1);
 8918     __ ret(lr);
 8919     return entry;
 8920   }
 8921 
 8922   // v0 = input (float)
 8923   // r0 = result (float16)
 8924   // v1 = temporary float register
 8925   address generate_floatToFloat16() {
 8926     __ align(CodeEntryAlignment);
 8927     StubId stub_id = StubId::stubgen_f2hf_id;
 8928     StubCodeMark mark(this, stub_id);
 8929     address entry = __ pc();
 8930     BLOCK_COMMENT("Entry:");
 8931     __ flt_to_flt16(r0, v0, v1);
 8932     __ ret(lr);
 8933     return entry;
 8934   }
 8935 
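  // Rough pseudocode for the nmethod entry barrier slow path emitted below
  // (illustrative only; the exact runtime interface is in BarrierSetNMethod):
  //
  //   // pass the location of the saved return address to the runtime check
  //   if (BarrierSetNMethod::nmethod_stub_entry_barrier(&saved_lr) != 0) {
  //     // deoptimize: restore the {sp, fp, lr, pc} frame from the four words
  //     // reserved below the entry frame (filled in by the runtime) and jump to pc
  //   }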
 8936   address generate_method_entry_barrier() {
 8937     __ align(CodeEntryAlignment);
 8938     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 8939     StubCodeMark mark(this, stub_id);
 8940 
 8941     Label deoptimize_label;
 8942 
 8943     address start = __ pc();
 8944 
 8945     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 8946 
 8947     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 8948       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 8949       // We can get here despite the nmethod being good, if we have not
 8950       // yet applied our cross modification fence (or data fence).
 8951       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 8952       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 8953       __ ldrw(rscratch2, rscratch2);
 8954       __ strw(rscratch2, thread_epoch_addr);
 8955       __ isb();
 8956       __ membar(__ LoadLoad);
 8957     }
 8958 
 8959     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 8960 
 8961     __ enter();
 8962     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 8963 
 8964     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 8965 
 8966     __ push_call_clobbered_registers();
 8967 
 8968     __ mov(c_rarg0, rscratch2);
 8969     __ call_VM_leaf
 8970          (CAST_FROM_FN_PTR
 8971           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 8972 
 8973     __ reset_last_Java_frame(true);
 8974 
 8975     __ mov(rscratch1, r0);
 8976 
 8977     __ pop_call_clobbered_registers();
 8978 
 8979     __ cbnz(rscratch1, deoptimize_label);
 8980 
 8981     __ leave();
 8982     __ ret(lr);
 8983 
 8984     __ BIND(deoptimize_label);
 8985 
 8986     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 8987     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 8988 
 8989     __ mov(sp, rscratch1);
 8990     __ br(rscratch2);
 8991 
 8992     return start;
 8993   }
 8994 
 8995   // r0  = result
 8996   // r1  = str1
 8997   // r2  = cnt1
 8998   // r3  = str2
 8999   // r4  = cnt2
 9000   // r10 = tmp1
 9001   // r11 = tmp2
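  //
  // Scalar sketch of the same-encoding comparison (illustrative only; T is
  // uint8_t for LL and uint16_t for UU, the caller has already compared the
  // first 8 bytes, and length handling is outside this sketch):
  //
  //   template <typename T>
  //   static int compare_same_encoding_ref(const T* s1, const T* s2, size_t cnt) {
  //     for (size_t i = 0; i < cnt; i++) {
  //       if (s1[i] != s2[i]) return (int)s1[i] - (int)s2[i];
  //     }
  //     return 0;
  //   }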
 9002   address generate_compare_long_string_same_encoding(bool isLL) {
 9003     __ align(CodeEntryAlignment);
 9004     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 9005     StubCodeMark mark(this, stub_id);
 9006     address entry = __ pc();
 9007     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9008         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9009 
 9010     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9011 
  9012     // Exit the large loop when fewer than 64 bytes are left to read or we are
  9013     // about to prefetch memory beyond the array boundary.
 9014     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 9015 
  9016     // The caller pre-loads 8 bytes before jumping to the stub, so compare them directly.
 9017     __ eor(rscratch2, tmp1, tmp2);
 9018     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9019 
 9020     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9021     // update pointers, because of previous read
 9022     __ add(str1, str1, wordSize);
 9023     __ add(str2, str2, wordSize);
 9024     if (SoftwarePrefetchHintDistance >= 0) {
 9025       __ align(OptoLoopAlignment);
 9026       __ bind(LARGE_LOOP_PREFETCH);
 9027         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9028         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9029 
 9030         for (int i = 0; i < 4; i++) {
 9031           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9032           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9033           __ cmp(tmp1, tmp2);
 9034           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9035           __ br(Assembler::NE, DIFF);
 9036         }
 9037         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9038         __ add(str1, str1, 64);
 9039         __ add(str2, str2, 64);
 9040         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9041         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9042         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9043     }
 9044 
 9045     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9046     __ br(Assembler::LE, LESS16);
 9047     __ align(OptoLoopAlignment);
 9048     __ bind(LOOP_COMPARE16);
 9049       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9050       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9051       __ cmp(tmp1, tmp2);
 9052       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9053       __ br(Assembler::NE, DIFF);
 9054       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9055       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9056       __ br(Assembler::LT, LESS16);
 9057 
 9058       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9059       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9060       __ cmp(tmp1, tmp2);
 9061       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9062       __ br(Assembler::NE, DIFF);
 9063       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9064       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9065       __ br(Assembler::GE, LOOP_COMPARE16);
 9066       __ cbz(cnt2, LENGTH_DIFF);
 9067 
 9068     __ bind(LESS16);
  9069       // compare one 8-byte chunk at a time
 9070       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9071       __ br(Assembler::LE, LESS8);
 9072       __ ldr(tmp1, Address(__ post(str1, 8)));
 9073       __ ldr(tmp2, Address(__ post(str2, 8)));
 9074       __ eor(rscratch2, tmp1, tmp2);
 9075       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9076       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9077 
 9078     __ bind(LESS8); // directly load last 8 bytes
 9079       if (!isLL) {
 9080         __ add(cnt2, cnt2, cnt2);
 9081       }
 9082       __ ldr(tmp1, Address(str1, cnt2));
 9083       __ ldr(tmp2, Address(str2, cnt2));
 9084       __ eor(rscratch2, tmp1, tmp2);
 9085       __ cbz(rscratch2, LENGTH_DIFF);
 9086       __ b(CAL_DIFFERENCE);
 9087 
 9088     __ bind(DIFF);
 9089       __ cmp(tmp1, tmp2);
 9090       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9091       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9092       // reuse rscratch2 register for the result of eor instruction
 9093       __ eor(rscratch2, tmp1, tmp2);
 9094 
 9095     __ bind(CAL_DIFFERENCE);
 9096       __ rev(rscratch2, rscratch2);
 9097       __ clz(rscratch2, rscratch2);
 9098       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9099       __ lsrv(tmp1, tmp1, rscratch2);
 9100       __ lsrv(tmp2, tmp2, rscratch2);
 9101       if (isLL) {
 9102         __ uxtbw(tmp1, tmp1);
 9103         __ uxtbw(tmp2, tmp2);
 9104       } else {
 9105         __ uxthw(tmp1, tmp1);
 9106         __ uxthw(tmp2, tmp2);
 9107       }
 9108       __ subw(result, tmp1, tmp2);
 9109 
 9110     __ bind(LENGTH_DIFF);
 9111       __ ret(lr);
 9112     return entry;
 9113   }
 9114 
 9115   enum string_compare_mode {
 9116     LL,
 9117     LU,
 9118     UL,
 9119     UU,
 9120   };
 9121 
 9122   // The following registers are declared in aarch64.ad
 9123   // r0  = result
 9124   // r1  = str1
 9125   // r2  = cnt1
 9126   // r3  = str2
 9127   // r4  = cnt2
 9128   // r10 = tmp1
 9129   // r11 = tmp2
 9130   // z0  = ztmp1
 9131   // z1  = ztmp2
 9132   // p0  = pgtmp1
 9133   // p1  = pgtmp2
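  //
  // Shape of the SVE loop emitted below, in pseudocode (illustrative only):
  //
  //   idx = 0; pg = whilelt(idx, cnt);          // all-true until the tail
  //   do {
  //     z1 = ld1(pg, str1 + idx);  z2 = ld1(pg, str2 + idx);
  //     if (cmpne(pg, z1, z2)) goto MISMATCH;   // any active lane differs?
  //     idx += vec_len;
  //   } while (idx < cnt - vec_len);
  //   pg = whilelt(idx, cnt);                   // partial predicate for the tail
  //   final compare, then brkb + lasta extract the first differing lanes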
 9134   address generate_compare_long_string_sve(string_compare_mode mode) {
 9135     StubId stub_id;
 9136     switch (mode) {
 9137       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9138       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9139       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9140       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9141       default: ShouldNotReachHere();
 9142     }
 9143 
 9144     __ align(CodeEntryAlignment);
 9145     address entry = __ pc();
 9146     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9147              tmp1 = r10, tmp2 = r11;
 9148 
 9149     Label LOOP, DONE, MISMATCH;
 9150     Register vec_len = tmp1;
 9151     Register idx = tmp2;
 9152     // The minimum of the string lengths has been stored in cnt2.
 9153     Register cnt = cnt2;
 9154     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9155     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9156 
 9157 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9158     switch (mode) {                                                            \
 9159       case LL:                                                                 \
 9160         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9161         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9162         break;                                                                 \
 9163       case LU:                                                                 \
 9164         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9165         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9166         break;                                                                 \
 9167       case UL:                                                                 \
 9168         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9169         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9170         break;                                                                 \
 9171       case UU:                                                                 \
 9172         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9173         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9174         break;                                                                 \
 9175       default:                                                                 \
 9176         ShouldNotReachHere();                                                  \
 9177     }
 9178 
 9179     StubCodeMark mark(this, stub_id);
 9180 
 9181     __ mov(idx, 0);
 9182     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9183 
 9184     if (mode == LL) {
 9185       __ sve_cntb(vec_len);
 9186     } else {
 9187       __ sve_cnth(vec_len);
 9188     }
 9189 
 9190     __ sub(rscratch1, cnt, vec_len);
 9191 
 9192     __ bind(LOOP);
 9193 
 9194       // main loop
 9195       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9196       __ add(idx, idx, vec_len);
 9197       // Compare strings.
 9198       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9199       __ br(__ NE, MISMATCH);
 9200       __ cmp(idx, rscratch1);
 9201       __ br(__ LT, LOOP);
 9202 
 9203     // post loop, last iteration
 9204     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9205 
 9206     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9207     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9208     __ br(__ EQ, DONE);
 9209 
 9210     __ bind(MISMATCH);
 9211 
  9212     // Crop the predicate to the elements before the first mismatch to locate it.
 9213     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9214     // Extract the first different characters of each string.
 9215     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9216     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9217 
 9218     // Compute the difference of the first different characters.
 9219     __ sub(result, rscratch1, rscratch2);
 9220 
 9221     __ bind(DONE);
 9222     __ ret(lr);
 9223 #undef LOAD_PAIR
 9224     return entry;
 9225   }
 9226 
 9227   void generate_compare_long_strings() {
 9228     if (UseSVE == 0) {
 9229       StubRoutines::aarch64::_compare_long_string_LL
 9230           = generate_compare_long_string_same_encoding(true);
 9231       StubRoutines::aarch64::_compare_long_string_UU
 9232           = generate_compare_long_string_same_encoding(false);
 9233       StubRoutines::aarch64::_compare_long_string_LU
 9234           = generate_compare_long_string_different_encoding(true);
 9235       StubRoutines::aarch64::_compare_long_string_UL
 9236           = generate_compare_long_string_different_encoding(false);
 9237     } else {
 9238       StubRoutines::aarch64::_compare_long_string_LL
 9239           = generate_compare_long_string_sve(LL);
 9240       StubRoutines::aarch64::_compare_long_string_UU
 9241           = generate_compare_long_string_sve(UU);
 9242       StubRoutines::aarch64::_compare_long_string_LU
 9243           = generate_compare_long_string_sve(LU);
 9244       StubRoutines::aarch64::_compare_long_string_UL
 9245           = generate_compare_long_string_sve(UL);
 9246     }
 9247   }
 9248 
 9249   // R0 = result
 9250   // R1 = str2
 9251   // R2 = cnt1
 9252   // R3 = str1
 9253   // R4 = cnt2
 9254   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 9255   //
  9256   // This generic linear code uses a few additional ideas that make it faster:
  9257   // 1) since the pattern length is >= 8, we can safely keep its first register
  9258   // resident and skip reloading it (helps on systems with a single load pipeline)
  9259   // 2) we use a "fast" SWAR-style algorithm to find the first pattern character,
  9260   // with fewer branches (one branch per loaded register instead of one per
  9261   // character); this is where constants like 0x0101...01, 0x00010001...0001,
  9262   // 0x7f7f...7f and 0x7fff7fff...7fff come from (see the sketch below)
  9263   // 3) after the 1st register of the source string is loaded and analyzed, it can
  9264   // be used to search for every occurrence of the first character, saving a few
  9265   // loads compared with a simpler-but-slower implementation
  9266   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  9267   // re-initializes and compresses register values, which makes the code larger
  9268   // and a bit less readable; however, most of the extra operations are issued
  9269   // during loads or branches, so the penalty is minimal
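  //
  // SWAR trick referenced in 2), as a C sketch (illustrative only, byte case; the
  // 16-bit case uses 0x0001...0001 / 0x7fff...7fff). Candidates reported here may
  // include false positives, which the compare loops below verify and reject:
  //
  //   // nonzero iff some byte of 'w' may equal the pattern byte 'c'
  //   static uint64_t has_byte(uint64_t w, uint8_t c) {
  //     uint64_t x = w ^ (0x0101010101010101ULL * c);   // matching bytes become 0
  //     return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
  //   }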
 9270   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 9271     StubId stub_id;
 9272     if (str1_isL) {
 9273       if (str2_isL) {
 9274         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
 9275       } else {
 9276         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
 9277       }
 9278     } else {
 9279       if (str2_isL) {
 9280         ShouldNotReachHere();
 9281       } else {
 9282         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
 9283       }
 9284     }
 9285     __ align(CodeEntryAlignment);
 9286     StubCodeMark mark(this, stub_id);
 9287     address entry = __ pc();
 9288 
 9289     int str1_chr_size = str1_isL ? 1 : 2;
 9290     int str2_chr_size = str2_isL ? 1 : 2;
 9291     int str1_chr_shift = str1_isL ? 0 : 1;
 9292     int str2_chr_shift = str2_isL ? 0 : 1;
 9293     bool isL = str1_isL && str2_isL;
  9294     // parameters
 9295     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 9296     // temporary registers
 9297     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 9298     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 9299     // redefinitions
 9300     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 9301 
 9302     __ push(spilled_regs, sp);
 9303     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 9304         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 9305         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 9306         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 9307         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 9308         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
  9309     // Read a whole register from str1. This is safe because length >= 8 here.
 9310     __ ldr(ch1, Address(str1));
  9311     // Read a whole register from str2. This is safe because length >= 8 here.
 9312     __ ldr(ch2, Address(str2));
 9313     __ sub(cnt2, cnt2, cnt1);
 9314     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 9315     if (str1_isL != str2_isL) {
 9316       __ eor(v0, __ T16B, v0, v0);
 9317     }
 9318     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 9319     __ mul(first, first, tmp1);
 9320     // check if we have less than 1 register to check
 9321     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 9322     if (str1_isL != str2_isL) {
 9323       __ fmovd(v1, ch1);
 9324     }
 9325     __ br(__ LE, L_SMALL);
 9326     __ eor(ch2, first, ch2);
 9327     if (str1_isL != str2_isL) {
 9328       __ zip1(v1, __ T16B, v1, v0);
 9329     }
 9330     __ sub(tmp2, ch2, tmp1);
 9331     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9332     __ bics(tmp2, tmp2, ch2);
 9333     if (str1_isL != str2_isL) {
 9334       __ fmovd(ch1, v1);
 9335     }
 9336     __ br(__ NE, L_HAS_ZERO);
 9337     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9338     __ add(result, result, wordSize/str2_chr_size);
 9339     __ add(str2, str2, wordSize);
 9340     __ br(__ LT, L_POST_LOOP);
 9341     __ BIND(L_LOOP);
 9342       __ ldr(ch2, Address(str2));
 9343       __ eor(ch2, first, ch2);
 9344       __ sub(tmp2, ch2, tmp1);
 9345       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9346       __ bics(tmp2, tmp2, ch2);
 9347       __ br(__ NE, L_HAS_ZERO);
 9348     __ BIND(L_LOOP_PROCEED);
 9349       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9350       __ add(str2, str2, wordSize);
 9351       __ add(result, result, wordSize/str2_chr_size);
 9352       __ br(__ GE, L_LOOP);
 9353     __ BIND(L_POST_LOOP);
 9354       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 9355       __ br(__ LE, NOMATCH);
 9356       __ ldr(ch2, Address(str2));
 9357       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9358       __ eor(ch2, first, ch2);
 9359       __ sub(tmp2, ch2, tmp1);
 9360       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9361       __ mov(tmp4, -1); // all bits set
 9362       __ b(L_SMALL_PROCEED);
 9363     __ align(OptoLoopAlignment);
 9364     __ BIND(L_SMALL);
 9365       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9366       __ eor(ch2, first, ch2);
 9367       if (str1_isL != str2_isL) {
 9368         __ zip1(v1, __ T16B, v1, v0);
 9369       }
 9370       __ sub(tmp2, ch2, tmp1);
 9371       __ mov(tmp4, -1); // all bits set
 9372       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9373       if (str1_isL != str2_isL) {
 9374         __ fmovd(ch1, v1); // move converted 4 symbols
 9375       }
 9376     __ BIND(L_SMALL_PROCEED);
 9377       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 9378       __ bic(tmp2, tmp2, ch2);
 9379       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9380       __ rbit(tmp2, tmp2);
 9381       __ br(__ EQ, NOMATCH);
 9382     __ BIND(L_SMALL_HAS_ZERO_LOOP);
  9383       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 9384       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9385       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9386       if (str2_isL) { // LL
 9387         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9388         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9389         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9390         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9391         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9392       } else {
 9393         __ mov(ch2, 0xE); // all bits in byte set except last one
 9394         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9395         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9396         __ lslv(tmp2, tmp2, tmp4);
 9397         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9398         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9399         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9400         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9401       }
 9402       __ cmp(ch1, ch2);
 9403       __ mov(tmp4, wordSize/str2_chr_size);
 9404       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9405     __ BIND(L_SMALL_CMP_LOOP);
 9406       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9407                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9408       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9409                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9410       __ add(tmp4, tmp4, 1);
 9411       __ cmp(tmp4, cnt1);
 9412       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9413       __ cmp(first, ch2);
 9414       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9415     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9416       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9417       __ clz(tmp4, tmp2);
 9418       __ add(result, result, 1); // advance index
 9419       __ add(str2, str2, str2_chr_size); // advance pointer
 9420       __ b(L_SMALL_HAS_ZERO_LOOP);
 9421     __ align(OptoLoopAlignment);
 9422     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9423       __ cmp(first, ch2);
 9424       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9425       __ b(DONE);
 9426     __ align(OptoLoopAlignment);
 9427     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9428       if (str2_isL) { // LL
 9429         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9430         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9431         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9432         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9433         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9434       } else {
 9435         __ mov(ch2, 0xE); // 0b1110: mask that clears the low bit of the byte offset (char alignment)
 9436         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9437         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9438         __ lslv(tmp2, tmp2, tmp4);
 9439         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9440         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9441         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9442         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9443       }
 9444       __ cmp(ch1, ch2);
 9445       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9446       __ b(DONE);
 9447     __ align(OptoLoopAlignment);
 9448     __ BIND(L_HAS_ZERO);
 9449       __ rbit(tmp2, tmp2);
 9450       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 9451       // Now compress the counters (cnt2 and cnt1) into one register. This is
 9452       // fine because both counters are 32-bit and are not changed in this loop;
 9453       // they are restored on exit, so cnt1 can be reused within the loop.
 9454       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 9455       __ sub(result, result, 1);
 9456     __ BIND(L_HAS_ZERO_LOOP);
 9457       __ mov(cnt1, wordSize/str2_chr_size);
 9458       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9459       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9460       if (str2_isL) {
 9461         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9462         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9463         __ lslv(tmp2, tmp2, tmp4);
 9464         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9465         __ add(tmp4, tmp4, 1);
 9466         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9467         __ lsl(tmp2, tmp2, 1);
 9468         __ mov(tmp4, wordSize/str2_chr_size);
 9469       } else {
 9470         __ mov(ch2, 0xE);
 9471         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9472         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9473         __ lslv(tmp2, tmp2, tmp4);
 9474         __ add(tmp4, tmp4, 1);
 9475         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9476         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9477         __ lsl(tmp2, tmp2, 1);
 9478         __ mov(tmp4, wordSize/str2_chr_size);
 9479         __ sub(str2, str2, str2_chr_size);
 9480       }
 9481       __ cmp(ch1, ch2);
 9482       __ mov(tmp4, wordSize/str2_chr_size);
 9483       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9484     __ BIND(L_CMP_LOOP);
 9485       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9486                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9487       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9488                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9489       __ add(tmp4, tmp4, 1);
 9490       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9491       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9492       __ cmp(cnt1, ch2);
 9493       __ br(__ EQ, L_CMP_LOOP);
 9494     __ BIND(L_CMP_LOOP_NOMATCH);
 9495       // no match at the current position
 9496       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9497       __ clz(tmp4, tmp2);
 9498       __ add(str2, str2, str2_chr_size); // advance pointer
 9499       __ b(L_HAS_ZERO_LOOP);
 9500     __ align(OptoLoopAlignment);
 9501     __ BIND(L_CMP_LOOP_LAST_CMP);
 9502       __ cmp(cnt1, ch2);
 9503       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9504       __ b(DONE);
 9505     __ align(OptoLoopAlignment);
 9506     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9507       if (str2_isL) {
 9508         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9509         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9510         __ lslv(tmp2, tmp2, tmp4);
 9511         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9512         __ add(tmp4, tmp4, 1);
 9513         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9514         __ lsl(tmp2, tmp2, 1);
 9515       } else {
 9516         __ mov(ch2, 0xE);
 9517         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9518         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9519         __ lslv(tmp2, tmp2, tmp4);
 9520         __ add(tmp4, tmp4, 1);
 9521         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9522         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9523         __ lsl(tmp2, tmp2, 1);
 9524         __ sub(str2, str2, str2_chr_size);
 9525       }
 9526       __ cmp(ch1, ch2);
 9527       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9528       __ b(DONE);
 9529     __ align(OptoLoopAlignment);
 9530     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
 9531       // 1) Restore the "result" index. result was a multiple of
 9532       // wordSize/str2_chr_size until the L_HAS_ZERO block, and the byte octet
 9533       // analyzed in L_HAS_ZERO_LOOP increased it by at most
 9534       // wordSize/str2_chr_size - 1, so the respective higher bits are unchanged.
 9535       // L_LOOP_PROCEED will add the number of analyzed characters to result,
 9536       // so we can simply reset its lower bits here: clear the 2 lower bits
 9537       // for UU/UL and the 3 lower bits for LL.
 9538       // 2) Restore cnt1 and cnt2 from the "compressed" cnt2.
 9539       // 3) Advance str2 to the next octet: result & 7 (LL) or & 3 (UU/UL) indexes
 9540       // the last analyzed substring, so str2 must be advanced past that octet.
 9541       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9542       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9543       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9544       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9545       __ movw(cnt2, cnt2);
 9546       __ b(L_LOOP_PROCEED);
 9547     __ align(OptoLoopAlignment);
 9548     __ BIND(NOMATCH);
 9549       __ mov(result, -1);
 9550     __ BIND(DONE);
 9551       __ pop(spilled_regs, sp);
 9552       __ ret(lr);
 9553     return entry;
 9554   }
 9555 
 9556   void generate_string_indexof_stubs() {
 9557     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9558     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9559     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9560   }
 9561 
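        // Helper for generate_large_byte_array_inflate: inflate the 32 Latin-1 bytes
        // held in src1/src2 to 32 UTF-16 chars and store them at dst (r1). Zipping
        // each byte vector with the zero vector in v0 interleaves a zero high byte
        // after every data byte, which is exactly the little-endian char encoding;
        // st1 then writes the resulting 64 bytes with a single instruction.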
 9562   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9563       FloatRegister src1, FloatRegister src2) {
 9564     Register dst = r1;
 9565     __ zip1(v1, __ T16B, src1, v0);
 9566     __ zip2(v2, __ T16B, src1, v0);
 9567     if (generatePrfm) {
 9568       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9569     }
 9570     __ zip1(v3, __ T16B, src2, v0);
 9571     __ zip2(v4, __ T16B, src2, v0);
 9572     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9573   }
 9574 
 9575   // R0 = src
 9576   // R1 = dst
 9577   // R2 = len
 9578   // R3 = len >> 3
 9579   // V0 = 0
 9580   // V1 = loaded 8 bytes
 9581   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 9582   address generate_large_byte_array_inflate() {
 9583     __ align(CodeEntryAlignment);
 9584     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
 9585     StubCodeMark mark(this, stub_id);
 9586     address entry = __ pc();
 9587     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9588     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9589     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9590 
 9591     // do one more 8-byte read so that the address is 16-byte aligned in most
 9592     // cases; this also lets us use a single store instruction below
 9593     __ ldrd(v2, __ post(src, 8));
 9594     __ sub(octetCounter, octetCounter, 2);
 9595     __ zip1(v1, __ T16B, v1, v0);
 9596     __ zip1(v2, __ T16B, v2, v0);
 9597     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9598     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9599     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9600     __ br(__ LE, LOOP_START);
 9601     __ b(LOOP_PRFM_START);
 9602     __ bind(LOOP_PRFM);
 9603       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9604     __ bind(LOOP_PRFM_START);
 9605       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9606       __ sub(octetCounter, octetCounter, 8);
 9607       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9608       inflate_and_store_2_fp_registers(true, v3, v4);
 9609       inflate_and_store_2_fp_registers(true, v5, v6);
 9610       __ br(__ GT, LOOP_PRFM);
 9611       __ cmp(octetCounter, (u1)8);
 9612       __ br(__ LT, DONE);
 9613     __ bind(LOOP);
 9614       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9615       __ bind(LOOP_START);
 9616       __ sub(octetCounter, octetCounter, 8);
 9617       __ cmp(octetCounter, (u1)8);
 9618       inflate_and_store_2_fp_registers(false, v3, v4);
 9619       inflate_and_store_2_fp_registers(false, v5, v6);
 9620       __ br(__ GE, LOOP);
 9621     __ bind(DONE);
 9622       __ ret(lr);
 9623     return entry;
 9624   }
 9625 
 9626   /**
 9627    *  Arguments:
 9628    *
 9629    *  Input:
 9630    *  c_rarg0   - current state address
 9631    *  c_rarg1   - H key address
 9632    *  c_rarg2   - data address
 9633    *  c_rarg3   - number of blocks
 9634    *
 9635    *  Output:
 9636    *  Updated state at c_rarg0
 9637    */
 9638   address generate_ghash_processBlocks() {
 9639     // Bafflingly, GCM uses little-endian for the byte order, but
 9640     // big-endian for the bit order.  For example, the polynomial 1 is
 9641     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9642     //
 9643     // So, we must either reverse the bytes in each word and do
 9644     // everything big-endian or reverse the bits in each byte and do
 9645     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9646     // the bits in each byte (we have an instruction, RBIT, to do
 9647     // that) and keep the data in little-endian bit order through the
 9648     // calculation, bit-reversing the inputs and outputs.
 9649 
 9650     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
 9651     StubCodeMark mark(this, stub_id);
 9652     Label polynomial; // local data generated at end of stub
 9653     __ align(CodeEntryAlignment);
 9654     address start = __ pc();
 9655 
 9656     Register state   = c_rarg0;
 9657     Register subkeyH = c_rarg1;
 9658     Register data    = c_rarg2;
 9659     Register blocks  = c_rarg3;
 9660 
 9661     FloatRegister vzr = v30;
 9662     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9663 
 9664     __ adr(rscratch1, polynomial);
 9665     __ ldrq(v24, rscratch1);    // The field polynomial
 9666 
 9667     __ ldrq(v0, Address(state));
 9668     __ ldrq(v1, Address(subkeyH));
 9669 
 9670     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9671     __ rbit(v0, __ T16B, v0);
 9672     __ rev64(v1, __ T16B, v1);
 9673     __ rbit(v1, __ T16B, v1);
 9674 
 9675     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
 9676     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
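          // (Karatsuba over GF(2), where + is XOR: with A = A1:A0 and B = B1:B0
          // split into 64-bit halves, A*B = A1*B1*x^128 + [(A1+A0)*(B1+B0) + A1*B1
          // + A0*B0]*x^64 + A0*B0, so only three 64x64 carry-less multiplies are
          // needed per block; v4 holds A1+A0 for the middle term.)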
 9677 
 9678     {
 9679       Label L_ghash_loop;
 9680       __ bind(L_ghash_loop);
 9681 
 9682       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9683                                                  // reversing each byte
 9684       __ rbit(v2, __ T16B, v2);
 9685       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9686 
 9687       // Multiply state in v2 by subkey in v1
 9688       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9689                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9690                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9691       // Reduce v7:v5 by the field polynomial
 9692       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9693 
 9694       __ sub(blocks, blocks, 1);
 9695       __ cbnz(blocks, L_ghash_loop);
 9696     }
 9697 
 9698     // The bit-reversed result is at this point in v0
 9699     __ rev64(v0, __ T16B, v0);
 9700     __ rbit(v0, __ T16B, v0);
 9701 
 9702     __ st1(v0, __ T16B, state);
 9703     __ ret(lr);
 9704 
 9705     // bind label and generate local polynomial data
 9706     __ align(wordSize * 2);
 9707     __ bind(polynomial);
 9708     __ emit_int64(0x87);  // The low-order bits of the field
 9709                           // polynomial (i.e. p = z^7+z^2+z+1)
 9710                           // repeated in the low and high parts of a
 9711                           // 128-bit vector
 9712     __ emit_int64(0x87);
 9713 
 9714     return start;
 9715   }
 9716 
 9717   address generate_ghash_processBlocks_wide() {
 9718     address small = generate_ghash_processBlocks();
 9719 
 9720     StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
 9721     StubCodeMark mark(this, stub_id);
 9722     Label polynomial;           // local data generated after stub
 9723     __ align(CodeEntryAlignment);
 9724     address start = __ pc();
 9725 
 9726     Register state   = c_rarg0;
 9727     Register subkeyH = c_rarg1;
 9728     Register data    = c_rarg2;
 9729     Register blocks  = c_rarg3;
 9730 
 9731     const int unroll = 4;
 9732 
 9733     __ cmp(blocks, (unsigned char)(unroll * 2));
 9734     __ br(__ LT, small);
 9735 
 9736     if (unroll > 1) {
 9737       // Save the callee-saved vector registers (v8..v15) before entering the routine
 9738       __ sub(sp, sp, 4 * 16);
 9739       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9740       __ sub(sp, sp, 4 * 16);
 9741       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9742     }
 9743 
 9744     __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
 9745 
 9746     if (unroll > 1) {
 9747       // And restore state
 9748       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9749       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9750     }
 9751 
 9752     __ cmp(blocks, (unsigned char)0);
 9753     __ br(__ GT, small);
 9754 
 9755     __ ret(lr);
 9756 
 9757     // bind label and generate polynomial data
 9758     __ align(wordSize * 2);
 9759     __ bind(polynomial);
 9760     __ emit_int64(0x87);  // The low-order bits of the field
 9761                           // polynomial (i.e. p = z^7+z^2+z+1)
 9762                           // repeated in the low and high parts of a
 9763                           // 128-bit vector
 9764     __ emit_int64(0x87);
 9765 
 9766     return start;
 9767 
 9768   }
 9769 
 9770   void generate_base64_encode_simdround(Register src, Register dst,
 9771         FloatRegister codec, u8 size) {
 9772 
 9773     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9774     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9775     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 9776 
 9777     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9778 
 9779     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 9780 
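          // With bytes b0 = in0[k], b1 = in1[k], b2 = in2[k] for lane k, the four
          // 6-bit indices computed below are the usual Base64 split:
          //   ind0 =  b0 >> 2
          //   ind1 = (b0 & 0x03) << 4 | b1 >> 4
          //   ind2 = (b1 & 0x0f) << 2 | b2 >> 6
          //   ind3 =  b2 & 0x3f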
 9781     __ ushr(ind0, arrangement, in0,  2);
 9782 
 9783     __ ushr(ind1, arrangement, in1,  2);
 9784     __ shl(in0,   arrangement, in0,  6);
 9785     __ orr(ind1,  arrangement, ind1, in0);
 9786     __ ushr(ind1, arrangement, ind1, 2);
 9787 
 9788     __ ushr(ind2, arrangement, in2,  4);
 9789     __ shl(in1,   arrangement, in1,  4);
 9790     __ orr(ind2,  arrangement, in1,  ind2);
 9791     __ ushr(ind2, arrangement, ind2, 2);
 9792 
 9793     __ shl(ind3,  arrangement, in2,  2);
 9794     __ ushr(ind3, arrangement, ind3, 2);
 9795 
 9796     __ tbl(out0,  arrangement, codec,  4, ind0);
 9797     __ tbl(out1,  arrangement, codec,  4, ind1);
 9798     __ tbl(out2,  arrangement, codec,  4, ind2);
 9799     __ tbl(out3,  arrangement, codec,  4, ind3);
 9800 
 9801     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9802   }
 9803 
 9804    /**
 9805    *  Arguments:
 9806    *
 9807    *  Input:
 9808    *  c_rarg0   - src_start
 9809    *  c_rarg1   - src_offset
 9810    *  c_rarg2   - src_length
 9811    *  c_rarg3   - dest_start
 9812    *  c_rarg4   - dest_offset
 9813    *  c_rarg5   - isURL
 9814    *
 9815    */
 9816   address generate_base64_encodeBlock() {
 9817 
 9818     static const char toBase64[64] = {
 9819       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9820       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9821       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9822       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9823       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9824     };
 9825 
 9826     static const char toBase64URL[64] = {
 9827       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9828       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9829       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9830       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9831       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9832     };
 9833 
 9834     __ align(CodeEntryAlignment);
 9835     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
 9836     StubCodeMark mark(this, stub_id);
 9837     address start = __ pc();
 9838 
 9839     Register src   = c_rarg0;  // source array
 9840     Register soff  = c_rarg1;  // source start offset
 9841     Register send  = c_rarg2;  // source end offset
 9842     Register dst   = c_rarg3;  // dest array
 9843     Register doff  = c_rarg4;  // position for writing to dest array
 9844     Register isURL = c_rarg5;  // Base64 or URL character set
 9845 
 9846     // c_rarg6 and c_rarg7 are free to use as temps
 9847     Register codec  = c_rarg6;
 9848     Register length = c_rarg7;
 9849 
 9850     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9851 
 9852     __ add(src, src, soff);
 9853     __ add(dst, dst, doff);
 9854     __ sub(length, send, soff);
 9855 
 9856     // load the codec base address
 9857     __ lea(codec, ExternalAddress((address) toBase64));
 9858     __ cbz(isURL, ProcessData);
 9859     __ lea(codec, ExternalAddress((address) toBase64URL));
 9860 
 9861     __ BIND(ProcessData);
 9862 
 9863     // too short to form a SIMD loop; fall back to the scalar 3-byte loop
 9864     __ cmp(length, (u1)24);
 9865     __ br(Assembler::LT, Process3B);
 9866 
 9867     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9868 
 9869     __ BIND(Process48B);
 9870     __ cmp(length, (u1)48);
 9871     __ br(Assembler::LT, Process24B);
 9872     generate_base64_encode_simdround(src, dst, v0, 16);
 9873     __ sub(length, length, 48);
 9874     __ b(Process48B);
 9875 
 9876     __ BIND(Process24B);
 9877     __ cmp(length, (u1)24);
 9878     __ br(Assembler::LT, SIMDExit);
 9879     generate_base64_encode_simdround(src, dst, v0, 8);
 9880     __ sub(length, length, 24);
 9881 
 9882     __ BIND(SIMDExit);
 9883     __ cbz(length, Exit);
 9884 
 9885     __ BIND(Process3B);
 9886     //  3 src bytes, 24 bits
 9887     __ ldrb(r10, __ post(src, 1));
 9888     __ ldrb(r11, __ post(src, 1));
 9889     __ ldrb(r12, __ post(src, 1));
 9890     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9891     __ orrw(r12, r12, r11, Assembler::LSL, 8);
 9892     // codec index
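          // (r15 = bits 23:18, r14 = bits 17:12, r13 = bits 11:6 and r12 = bits 5:0
          //  of the 24-bit group b0 << 16 | b1 << 8 | b2 assembled in r12 above)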
 9893     __ ubfmw(r15, r12, 18, 23);
 9894     __ ubfmw(r14, r12, 12, 17);
 9895     __ ubfmw(r13, r12, 6,  11);
 9896     __ andw(r12,  r12, 63);
 9897     // get the code based on the codec
 9898     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9899     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9900     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9901     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9902     __ strb(r15, __ post(dst, 1));
 9903     __ strb(r14, __ post(dst, 1));
 9904     __ strb(r13, __ post(dst, 1));
 9905     __ strb(r12, __ post(dst, 1));
 9906     __ sub(length, length, 3);
 9907     __ cbnz(length, Process3B);
 9908 
 9909     __ BIND(Exit);
 9910     __ ret(lr);
 9911 
 9912     return start;
 9913   }
 9914 
 9915   void generate_base64_decode_simdround(Register src, Register dst,
 9916         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9917 
 9918     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9919     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9920 
 9921     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9922     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9923 
 9924     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9925 
 9926     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9927 
 9928     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9929 
 9930     // we need an unsigned saturating subtract to make sure all input values
 9931     // in the range [0, 63] map to index 0 in the higher-half lookup
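          // For example, with the standard (non-URL) table, '+' (43) gets its value
          // 62 from the first (tbl) lookup and contributes 0 from the second one
          // (uqsub yields index 0, whose table entry is 0), while 'B' (66) is out of
          // range for tbl (0) and gets its value 1 from the second (tbx) lookup at
          // index 66 - 63 = 3.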
 9932     __ uqsubv(decH0, __ T16B, in0, v27);
 9933     __ uqsubv(decH1, __ T16B, in1, v27);
 9934     __ uqsubv(decH2, __ T16B, in2, v27);
 9935     __ uqsubv(decH3, __ T16B, in3, v27);
 9936 
 9937     // lower half lookup
 9938     __ tbl(decL0, arrangement, codecL, 4, in0);
 9939     __ tbl(decL1, arrangement, codecL, 4, in1);
 9940     __ tbl(decL2, arrangement, codecL, 4, in2);
 9941     __ tbl(decL3, arrangement, codecL, 4, in3);
 9942 
 9943     // higher half lookup
 9944     __ tbx(decH0, arrangement, codecH, 4, decH0);
 9945     __ tbx(decH1, arrangement, codecH, 4, decH1);
 9946     __ tbx(decH2, arrangement, codecH, 4, decH2);
 9947     __ tbx(decH3, arrangement, codecH, 4, decH3);
 9948 
 9949     // combine lower and higher
 9950     __ orr(decL0, arrangement, decL0, decH0);
 9951     __ orr(decL1, arrangement, decL1, decH1);
 9952     __ orr(decL2, arrangement, decL2, decH2);
 9953     __ orr(decL3, arrangement, decL3, decH3);
 9954 
 9955     // check illegal inputs, value larger than 63 (maximum of 6 bits)
 9956     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 9957     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 9958     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 9959     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 9960     __ orr(in0, arrangement, decH0, decH1);
 9961     __ orr(in1, arrangement, decH2, decH3);
 9962     __ orr(in2, arrangement, in0,   in1);
 9963     __ umaxv(in3, arrangement, in2);
 9964     __ umov(rscratch2, in3, __ B, 0);
 9965 
 9966     // get the data to output
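          // (per lane: out0 = d0 << 2 | d1 >> 4, out1 = d1 << 4 | d2 >> 2,
          //  out2 = d2 << 6 | d3, where d0..d3 are the decoded 6-bit values held
          //  in decL0..decL3)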
 9967     __ shl(out0,  arrangement, decL0, 2);
 9968     __ ushr(out1, arrangement, decL1, 4);
 9969     __ orr(out0,  arrangement, out0,  out1);
 9970     __ shl(out1,  arrangement, decL1, 4);
 9971     __ ushr(out2, arrangement, decL2, 2);
 9972     __ orr(out1,  arrangement, out1,  out2);
 9973     __ shl(out2,  arrangement, decL2, 6);
 9974     __ orr(out2,  arrangement, out2,  decL3);
 9975 
 9976     __ cbz(rscratch2, NoIllegalData);
 9977 
 9978     // handle illegal input
 9979     __ umov(r10, in2, __ D, 0);
 9980     if (size == 16) {
 9981       __ cbnz(r10, ErrorInLowerHalf);
 9982 
 9983       // illegal input is in higher half, store the lower half now.
 9984       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 9985 
 9986       __ umov(r10, in2,  __ D, 1);
 9987       __ umov(r11, out0, __ D, 1);
 9988       __ umov(r12, out1, __ D, 1);
 9989       __ umov(r13, out2, __ D, 1);
 9990       __ b(StoreLegalData);
 9991 
 9992       __ BIND(ErrorInLowerHalf);
 9993     }
 9994     __ umov(r11, out0, __ D, 0);
 9995     __ umov(r12, out1, __ D, 0);
 9996     __ umov(r13, out2, __ D, 0);
 9997 
 9998     __ BIND(StoreLegalData);
 9999     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10000     __ strb(r11, __ post(dst, 1));
10001     __ strb(r12, __ post(dst, 1));
10002     __ strb(r13, __ post(dst, 1));
10003     __ lsr(r10, r10, 8);
10004     __ lsr(r11, r11, 8);
10005     __ lsr(r12, r12, 8);
10006     __ lsr(r13, r13, 8);
10007     __ b(StoreLegalData);
10008 
10009     __ BIND(NoIllegalData);
10010     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10011   }
10012 
10013 
10014    /**
10015    *  Arguments:
10016    *
10017    *  Input:
10018    *  c_rarg0   - src_start
10019    *  c_rarg1   - src_offset
10020    *  c_rarg2   - src_length
10021    *  c_rarg3   - dest_start
10022    *  c_rarg4   - dest_offset
10023    *  c_rarg5   - isURL
10024    *  c_rarg6   - isMIME
10025    *
10026    */
10027   address generate_base64_decodeBlock() {
10028 
10029     // The SIMD part of this Base64 decode intrinsic is based on the algorithm
10030     // outlined at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords,
10031     // in the section titled "Base64 decoding".
10032 
10033     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
10034     // java.util.Base64, except that the trailing character '=' is also treated as an
10035     // illegal value here: java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255.
10036     static const uint8_t fromBase64ForNoSIMD[256] = {
10037       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10038       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10039       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10040        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10041       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10042        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
10043       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10044        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10045       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10046       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10047       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10048       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10049       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10050       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10051       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10052       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10053     };
10054 
10055     static const uint8_t fromBase64URLForNoSIMD[256] = {
10056       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10057       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10058       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10059        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10060       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10061        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
10062       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10063        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10064       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10065       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10066       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10067       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10068       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10069       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10070       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10071       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10072     };
10073 
10074     // A legal Base64 code value is in the range [0, 127].  We need two table
10075     // lookups, combined to produce the decoded data. The 1st vector table lookup
10076     // uses tbl, which sets out-of-range indices to 0 in the destination. The 2nd
10077     // uses tbx, which leaves out-of-range indices unchanged in the destination.
10078     // Inputs [64, 126] are mapped to entries [65, 127] of the table in the 2nd
10079     // lookup. The entry at index 64 is set to 0, so that we know the decoded
10080     // data was already obtained by the 1st lookup.
10081     static const uint8_t fromBase64ForSIMD[128] = {
10082       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10083       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10084       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10085        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10086         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10087        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10088       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10089        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10090     };
10091 
10092     static const uint8_t fromBase64URLForSIMD[128] = {
10093       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10094       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10095       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10096        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10097         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10098        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10099        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10100        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10101     };
10102 
10103     __ align(CodeEntryAlignment);
10104     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10105     StubCodeMark mark(this, stub_id);
10106     address start = __ pc();
10107 
10108     Register src    = c_rarg0;  // source array
10109     Register soff   = c_rarg1;  // source start offset
10110     Register send   = c_rarg2;  // source end offset
10111     Register dst    = c_rarg3;  // dest array
10112     Register doff   = c_rarg4;  // position for writing to dest array
10113     Register isURL  = c_rarg5;  // Base64 or URL character set
10114     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10115 
10116     Register length = send;    // reuse send as length of source data to process
10117 
10118     Register simd_codec   = c_rarg6;
10119     Register nosimd_codec = c_rarg7;
10120 
10121     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10122 
10123     __ enter();
10124 
10125     __ add(src, src, soff);
10126     __ add(dst, dst, doff);
10127 
10128     __ mov(doff, dst);
10129 
10130     __ sub(length, send, soff);
10131     __ bfm(length, zr, 0, 1);
10132 
10133     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10134     __ cbz(isURL, ProcessData);
10135     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10136 
10137     __ BIND(ProcessData);
10138     __ mov(rscratch1, length);
10139     __ cmp(length, (u1)144); // 144 = 80 + 64
10140     __ br(Assembler::LT, Process4B);
10141 
10142     // In the MIME case, the line length cannot be more than 76
10143     // bytes (see RFC 2045). This is too short a block for SIMD
10144     // to be worthwhile, so we use non-SIMD here.
10145     __ movw(rscratch1, 79);
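          // (79 makes the Process4B loop below consume exactly twenty 4-byte groups,
          //  i.e. 80 bytes, before falling through to the SIMD path)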
10146 
10147     __ BIND(Process4B);
10148     __ ldrw(r14, __ post(src, 4));
10149     __ ubfxw(r10, r14, 0,  8);
10150     __ ubfxw(r11, r14, 8,  8);
10151     __ ubfxw(r12, r14, 16, 8);
10152     __ ubfxw(r13, r14, 24, 8);
10153     // look up the decoded values
10154     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10155     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10156     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10157     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10158     // error detection, 255u indicates an illegal input
10159     __ orrw(r14, r10, r11);
10160     __ orrw(r15, r12, r13);
10161     __ orrw(r14, r14, r15);
10162     __ tbnz(r14, 7, Exit);
10163     // recover the data: pack the four 6-bit values into 3 output bytes
10164     __ lslw(r14, r10, 10);
10165     __ bfiw(r14, r11, 4, 6);
10166     __ bfmw(r14, r12, 2, 5);
10167     __ rev16w(r14, r14);
10168     __ bfiw(r13, r12, 6, 2);
10169     __ strh(r14, __ post(dst, 2));
10170     __ strb(r13, __ post(dst, 1));
10171     // non-simd loop
10172     __ subsw(rscratch1, rscratch1, 4);
10173     __ br(Assembler::GT, Process4B);
10174 
10175     // if we exit from the 80-byte pre-processing above (rscratch1 started at 79),
10176     // rscratch1 == -1; otherwise rscratch1 == 0 and all input has been consumed.
10177     __ cbzw(rscratch1, Exit);
10178     __ sub(length, length, 80);
10179 
10180     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10181     __ cbz(isURL, SIMDEnter);
10182     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10183 
10184     __ BIND(SIMDEnter);
10185     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10186     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10187     __ mov(rscratch1, 63);
10188     __ dup(v27, __ T16B, rscratch1);
10189 
10190     __ BIND(Process64B);
10191     __ cmp(length, (u1)64);
10192     __ br(Assembler::LT, Process32B);
10193     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10194     __ sub(length, length, 64);
10195     __ b(Process64B);
10196 
10197     __ BIND(Process32B);
10198     __ cmp(length, (u1)32);
10199     __ br(Assembler::LT, SIMDExit);
10200     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10201     __ sub(length, length, 32);
10202     __ b(Process32B);
10203 
10204     __ BIND(SIMDExit);
10205     __ cbz(length, Exit);
10206     __ movw(rscratch1, length);
10207     __ b(Process4B);
10208 
10209     __ BIND(Exit);
10210     __ sub(c_rarg0, dst, doff);
10211 
10212     __ leave();
10213     __ ret(lr);
10214 
10215     return start;
10216   }
10217 
10218   // Support for spin waits.
10219   address generate_spin_wait() {
10220     __ align(CodeEntryAlignment);
10221     StubId stub_id = StubId::stubgen_spin_wait_id;
10222     StubCodeMark mark(this, stub_id);
10223     address start = __ pc();
10224 
10225     __ spin_wait();
10226     __ ret(lr);
10227 
10228     return start;
10229   }
10230 
10231   void generate_lookup_secondary_supers_table_stub() {
10232     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10233     StubCodeMark mark(this, stub_id);
10234 
10235     const Register
10236       r_super_klass  = r0,
10237       r_array_base   = r1,
10238       r_array_length = r2,
10239       r_array_index  = r3,
10240       r_sub_klass    = r4,
10241       r_bitmap       = rscratch2,
10242       result         = r5;
10243     const FloatRegister
10244       vtemp          = v0;
10245 
10246     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10247       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10248       Label L_success;
10249       __ enter();
10250       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10251                                              r_array_base, r_array_length, r_array_index,
10252                                              vtemp, result, slot,
10253                                              /*stub_is_near*/true);
10254       __ leave();
10255       __ ret(lr);
10256     }
10257   }
10258 
10259   // Slow path implementation for UseSecondarySupersTable.
10260   address generate_lookup_secondary_supers_table_slow_path_stub() {
10261     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10262     StubCodeMark mark(this, stub_id);
10263 
10264     address start = __ pc();
10265     const Register
10266       r_super_klass  = r0,        // argument
10267       r_array_base   = r1,        // argument
10268       temp1          = r2,        // temp
10269       r_array_index  = r3,        // argument
10270       r_bitmap       = rscratch2, // argument
10271       result         = r5;        // argument
10272 
10273     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10274     __ ret(lr);
10275 
10276     return start;
10277   }
10278 
10279 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10280 
10281   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10282   //
10283   // If LSE is in use, generate LSE versions of all the stubs. The
10284   // non-LSE versions are in atomic_aarch64.S.
10285 
10286   // class AtomicStubMark records the entry point of a stub and the
10287   // stub pointer which will point to it. The stub pointer is set to
10288   // the entry point when ~AtomicStubMark() is called, which must be
10289   // after ICache::invalidate_range. This ensures safe publication of
10290   // the generated code.
10291   class AtomicStubMark {
10292     address _entry_point;
10293     aarch64_atomic_stub_t *_stub;
10294     MacroAssembler *_masm;
10295   public:
10296     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10297       _masm = masm;
10298       __ align(32);
10299       _entry_point = __ pc();
10300       _stub = stub;
10301     }
10302     ~AtomicStubMark() {
10303       *_stub = (aarch64_atomic_stub_t)_entry_point;
10304     }
10305   };
10306 
10307   // NB: For memory_order_conservative we need a trailing membar after
10308   // LSE atomic operations but not a leading membar.
10309   //
10310   // We don't need a leading membar because a clause in the Arm ARM
10311   // says:
10312   //
10313   //   Barrier-ordered-before
10314   //
10315   //   Barrier instructions order prior Memory effects before subsequent
10316   //   Memory effects generated by the same Observer. A read or a write
10317   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
10318   //   Observer if and only if RW1 appears in program order before RW2
10319   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10320   //   instruction with both Acquire and Release semantics.
10321   //
10322   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10323   // and Release semantics, therefore we don't need a leading
10324   // barrier. However, there is no corresponding Barrier-ordered-after
10325   // relationship, therefore we need a trailing membar to prevent a
10326   // later store or load from being reordered with the store in an
10327   // atomic instruction.
10328   //
10329   // This was checked by using the herd7 consistency model simulator
10330   // (http://diy.inria.fr/) with this test case:
10331   //
10332   // AArch64 LseCas
10333   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10334   // P0 | P1;
10335   // LDR W4, [X2] | MOV W3, #0;
10336   // DMB LD       | MOV W4, #1;
10337   // LDR W3, [X1] | CASAL W3, W4, [X1];
10338   //              | DMB ISH;
10339   //              | STR W4, [X2];
10340   // exists
10341   // (0:X3=0 /\ 0:X4=1)
10342   //
10343   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10344   // with the store to x in P1. Without the DMB in P1 this may happen.
10345   //
10346   // At the time of writing we don't know of any AArch64 hardware that
10347   // reorders stores in this way, but the Reference Manual permits it.
10348 
10349   void gen_cas_entry(Assembler::operand_size size,
10350                      atomic_memory_order order) {
10351     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10352       exchange_val = c_rarg2;
10353     bool acquire, release;
10354     switch (order) {
10355       case memory_order_relaxed:
10356         acquire = false;
10357         release = false;
10358         break;
10359       case memory_order_release:
10360         acquire = false;
10361         release = true;
10362         break;
10363       default:
10364         acquire = true;
10365         release = true;
10366         break;
10367     }
10368     __ mov(prev, compare_val);
10369     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10370     if (order == memory_order_conservative) {
10371       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10372     }
10373     if (size == Assembler::xword) {
10374       __ mov(r0, prev);
10375     } else {
10376       __ movw(r0, prev);
10377     }
10378     __ ret(lr);
10379   }
10380 
10381   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10382     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10383     // If not relaxed, then default to conservative.  Relaxed is the only
10384     // case we use enough to be worth specializing.
10385     if (order == memory_order_relaxed) {
10386       __ ldadd(size, incr, prev, addr);
10387     } else {
10388       __ ldaddal(size, incr, prev, addr);
10389       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10390     }
10391     if (size == Assembler::xword) {
10392       __ mov(r0, prev);
10393     } else {
10394       __ movw(r0, prev);
10395     }
10396     __ ret(lr);
10397   }
10398 
10399   void gen_swpal_entry(Assembler::operand_size size) {
10400     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10401     __ swpal(size, incr, prev, addr);
10402     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10403     if (size == Assembler::xword) {
10404       __ mov(r0, prev);
10405     } else {
10406       __ movw(r0, prev);
10407     }
10408     __ ret(lr);
10409   }
10410 
10411   void generate_atomic_entry_points() {
10412     if (! UseLSE) {
10413       return;
10414     }
10415     __ align(CodeEntryAlignment);
10416     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10417     StubCodeMark mark(this, stub_id);
10418     address first_entry = __ pc();
10419 
10420     // ADD, memory_order_conservative
10421     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10422     gen_ldadd_entry(Assembler::word, memory_order_conservative);
10423     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10424     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10425 
10426     // ADD, memory_order_relaxed
10427     AtomicStubMark mark_fetch_add_4_relaxed
10428       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10429     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10430     AtomicStubMark mark_fetch_add_8_relaxed
10431       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10432     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10433 
10434     // XCHG, memory_order_conservative
10435     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10436     gen_swpal_entry(Assembler::word);
10437     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10438     gen_swpal_entry(Assembler::xword);
10439 
10440     // CAS, memory_order_conservative
10441     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10442     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10443     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10444     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10445     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10446     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10447 
10448     // CAS, memory_order_relaxed
10449     AtomicStubMark mark_cmpxchg_1_relaxed
10450       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10451     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10452     AtomicStubMark mark_cmpxchg_4_relaxed
10453       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10454     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10455     AtomicStubMark mark_cmpxchg_8_relaxed
10456       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10457     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10458 
10459     AtomicStubMark mark_cmpxchg_4_release
10460       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10461     gen_cas_entry(MacroAssembler::word, memory_order_release);
10462     AtomicStubMark mark_cmpxchg_8_release
10463       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10464     gen_cas_entry(MacroAssembler::xword, memory_order_release);
10465 
10466     AtomicStubMark mark_cmpxchg_4_seq_cst
10467       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10468     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10469     AtomicStubMark mark_cmpxchg_8_seq_cst
10470       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10471     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10472 
10473     ICache::invalidate_range(first_entry, __ pc() - first_entry);
10474   }
10475 #endif // LINUX && !__ARM_FEATURE_ATOMICS
10476 
10477   address generate_cont_thaw(Continuation::thaw_kind kind) {
10478     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10479     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10480 
10481     address start = __ pc();
10482 
10483     if (return_barrier) {
10484       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10485       __ mov(sp, rscratch1);
10486     }
10487     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10488 
10489     if (return_barrier) {
10490       // preserve possible return value from a method returning to the return barrier
10491       __ fmovd(rscratch1, v0);
10492       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10493     }
10494 
10495     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10496     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10497     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10498 
10499     if (return_barrier) {
10500       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10501       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10502       __ fmovd(v0, rscratch1);
10503     }
10504     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10505 
10506 
10507     Label thaw_success;
10508     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10509     __ cbnz(rscratch2, thaw_success);
10510     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10511     __ br(rscratch1);
10512     __ bind(thaw_success);
10513 
10514     // make room for the thawed frames
10515     __ sub(rscratch1, sp, rscratch2);
10516     __ andr(rscratch1, rscratch1, -16); // align
10517     __ mov(sp, rscratch1);
10518 
10519     if (return_barrier) {
10520       // save original return value -- again
10521       __ fmovd(rscratch1, v0);
10522       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10523     }
10524 
10525     // If we want, we can templatize thaw by kind, and have three different entries
10526     __ movw(c_rarg1, (uint32_t)kind);
10527 
10528     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10529     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10530 
10531     if (return_barrier) {
10532       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10533       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10534       __ fmovd(v0, rscratch1);
10535     } else {
10536       __ mov(r0, zr); // return 0 (success) from doYield
10537     }
10538 
10539     // we're now on the yield frame (which is at an address above us, because sp has been pushed down)
10540     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10541     __ mov(rfp, sp);
10542 
10543     if (return_barrier_exception) {
10544       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10545       __ authenticate_return_address(c_rarg1);
10546       __ verify_oop(r0);
10547       // save return value containing the exception oop in callee-saved R19
10548       __ mov(r19, r0);
10549 
10550       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10551 
10552       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10553       // __ reinitialize_ptrue();
10554 
10555       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10556 
10557       __ mov(r1, r0); // the exception handler
10558       __ mov(r0, r19); // restore return value containing the exception oop
10559       __ verify_oop(r0);
10560 
10561       __ leave();
10562       __ mov(r3, lr);
10563       __ br(r1); // the exception handler
10564     } else {
10565       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10566       __ leave();
10567       __ ret(lr);
10568     }
10569 
10570     return start;
10571   }
10572 
10573   address generate_cont_thaw() {
10574     if (!Continuations::enabled()) return nullptr;
10575 
10576     StubId stub_id = StubId::stubgen_cont_thaw_id;
10577     StubCodeMark mark(this, stub_id);
10578     address start = __ pc();
10579     generate_cont_thaw(Continuation::thaw_top);
10580     return start;
10581   }
10582 
10583   address generate_cont_returnBarrier() {
10584     if (!Continuations::enabled()) return nullptr;
10585 
10586     // TODO: will probably need multiple return barriers depending on return type
10587     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10588     StubCodeMark mark(this, stub_id);
10589     address start = __ pc();
10590 
10591     generate_cont_thaw(Continuation::thaw_return_barrier);
10592 
10593     return start;
10594   }
10595 
10596   address generate_cont_returnBarrier_exception() {
10597     if (!Continuations::enabled()) return nullptr;
10598 
10599     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10600     StubCodeMark mark(this, stub_id);
10601     address start = __ pc();
10602 
10603     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10604 
10605     return start;
10606   }
10607 
10608   address generate_cont_preempt_stub() {
10609     if (!Continuations::enabled()) return nullptr;
10610     StubId stub_id = StubId::stubgen_cont_preempt_id;
10611     StubCodeMark mark(this, stub_id);
10612     address start = __ pc();
10613 
10614     __ reset_last_Java_frame(true);
10615 
10616     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10617     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10618     __ mov(sp, rscratch2);
10619 
10620     Label preemption_cancelled;
10621     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10622     __ cbnz(rscratch1, preemption_cancelled);
10623 
10624     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10625     SharedRuntime::continuation_enter_cleanup(_masm);
10626     __ leave();
10627     __ ret(lr);
10628 
10629     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10630     __ bind(preemption_cancelled);
10631     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10632     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10633     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10634     __ ldr(rscratch1, Address(rscratch1));
10635     __ br(rscratch1);
10636 
10637     return start;
10638   }
10639 
10640   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10641   // are represented as long[5], with BITS_PER_LIMB = 26.
10642   // Pack five 26-bit limbs into three 64-bit registers.
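        // In scalar terms (each limb[i] assumed to fit in 26 bits, as the bit counts
        // below indicate):
        //   dest0 = limb[0] | limb[1] << 26 | limb[2] << 52       (low 12 bits of limb[2])
        //   dest1 = limb[2] >> 12 | limb[3] << 14 | limb[4] << 40 (low 24 bits of limb[4])
        //   dest2 = limb[4] >> 24                                 (at most 2 bits)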
10643   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10644     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10645     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10646     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10647     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10648 
10649     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10650     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10651     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10652     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10653 
10654     if (dest2->is_valid()) {
10655       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10656     } else {
10657 #ifdef ASSERT
10658       Label OK;
10659       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10660       __ br(__ EQ, OK);
10661       __ stop("high bits of Poly1305 integer should be zero");
10662       __ should_not_reach_here();
10663       __ bind(OK);
10664 #endif
10665     }
10666   }
10667 
10668   // As above, but return only a 128-bit integer, packed into two
10669   // 64-bit registers.
10670   void pack_26(Register dest0, Register dest1, Register src) {
10671     pack_26(dest0, dest1, noreg, src);
10672   }
10673 
10674   // Multiply and multiply-accumulate unsigned 64-bit registers.
10675   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10676     __ mul(prod_lo, n, m);
10677     __ umulh(prod_hi, n, m);
10678   }
10679   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10680     wide_mul(rscratch1, rscratch2, n, m);
10681     __ adds(sum_lo, sum_lo, rscratch1);
10682     __ adc(sum_hi, sum_hi, rscratch2);
10683   }
10684 
10685   // Poly1305, RFC 7539
10686 
10687   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10688   // description of the tricks used to simplify and accelerate this
10689   // computation.
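        //
        // The per-block recurrence implemented in the loop below is the one from the
        // RFC: for each 16-byte little-endian block n, extended with a set bit at
        // position 128 (hence the "add 1 to S_2" below),
        //
        //   acc <- ((acc + n) * r) mod (2^130 - 5)
        //
        // with only a partial reduction performed inside the loop.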
10690 
10691   address generate_poly1305_processBlocks() {
10692     __ align(CodeEntryAlignment);
10693     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10694     StubCodeMark mark(this, stub_id);
10695     address start = __ pc();
10696     Label here;
10697     __ enter();
10698     RegSet callee_saved = RegSet::range(r19, r28);
10699     __ push(callee_saved, sp);
10700 
10701     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10702 
10703     // Arguments
10704     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10705 
10706     // R_n is the 128-bit randomly-generated key, packed into two
10707     // registers.  The caller passes this key to us as long[5], with
10708     // BITS_PER_LIMB = 26.
10709     const Register R_0 = *++regs, R_1 = *++regs;
10710     pack_26(R_0, R_1, r_start);
10711 
10712     // RR_n is (R_n >> 2) * 5
10713     const Register RR_0 = *++regs, RR_1 = *++regs;
10714     __ lsr(RR_0, R_0, 2);
10715     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10716     __ lsr(RR_1, R_1, 2);
10717     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10718 
10719     // U_n is the current checksum
10720     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10721     pack_26(U_0, U_1, U_2, acc_start);
10722 
10723     static constexpr int BLOCK_LENGTH = 16;
10724     Label DONE, LOOP;
10725 
10726     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10727     __ br(Assembler::LT, DONE); {
10728       __ bind(LOOP);
10729 
10730       // S_n is to be the sum of U_n and the next block of data
10731       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10732       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10733       __ adds(S_0, U_0, S_0);
10734       __ adcs(S_1, U_1, S_1);
10735       __ adc(S_2, U_2, zr);
10736       __ add(S_2, S_2, 1);
10737 
10738       const Register U_0HI = *++regs, U_1HI = *++regs;
10739 
10740       // NB: this logic depends on some of the special properties of
10741       // Poly1305 keys. In particular, because we know that the top
10742       // four bits of R_0 and R_1 are zero, we can add together
10743       // partial products without any risk of needing to propagate a
10744       // carry out.
10745       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10746       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10747       __ andr(U_2, R_0, 3);
10748       __ mul(U_2, S_2, U_2);
10749 
10750       // Recycle registers S_0, S_1, S_2
10751       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10752 
10753       // Partial reduction mod 2**130 - 5
10754       __ adds(U_1, U_0HI, U_1);
10755       __ adc(U_2, U_1HI, U_2);
10756       // Sum now in U_2:U_1:U_0.
10757       // Dead: U_0HI, U_1HI.
10758       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10759 
10760       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10761 
10762       // First, U_2:U_1:U_0 += (U_2 >> 2)
10763       __ lsr(rscratch1, U_2, 2);
10764       __ andr(U_2, U_2, (u8)3);
10765       __ adds(U_0, U_0, rscratch1);
10766       __ adcs(U_1, U_1, zr);
10767       __ adc(U_2, U_2, zr);
10768       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10769       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10770       __ adcs(U_1, U_1, zr);
10771       __ adc(U_2, U_2, zr);
10772 
10773       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10774       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10775       __ br(~ Assembler::LT, LOOP);
10776     }
10777 
10778     // Further reduce modulo 2^130 - 5
10779     __ lsr(rscratch1, U_2, 2);
10780     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10781     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10782     __ adcs(U_1, U_1, zr);
10783     __ andr(U_2, U_2, (u1)3);
10784     __ adc(U_2, U_2, zr);
10785 
10786     // Unpack the sum into five 26-bit limbs and write to memory.
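    // In C, approximately (acc is the long[5] value at acc_start):
    //   acc[0] =  U_0        & ((1 << 26) - 1);
    //   acc[1] = (U_0 >> 26) & ((1 << 26) - 1);
    //   acc[2] = (U_0 >> 52) | ((U_1 & ((1 << 14) - 1)) << 12);
    //   acc[3] = (U_1 >> 14) & ((1 << 26) - 1);
    //   acc[4] = (U_1 >> 40) | ((U_2 & 7) << 24);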
10787     __ ubfiz(rscratch1, U_0, 0, 26);
10788     __ ubfx(rscratch2, U_0, 26, 26);
10789     __ stp(rscratch1, rscratch2, Address(acc_start));
10790     __ ubfx(rscratch1, U_0, 52, 12);
10791     __ bfi(rscratch1, U_1, 12, 14);
10792     __ ubfx(rscratch2, U_1, 14, 26);
10793     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10794     __ ubfx(rscratch1, U_1, 40, 24);
10795     __ bfi(rscratch1, U_2, 24, 3);
10796     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10797 
10798     __ bind(DONE);
10799     __ pop(callee_saved, sp);
10800     __ leave();
10801     __ ret(lr);
10802 
10803     return start;
10804   }
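  // In C, approximately (a sketch of one iteration of the main loop
  // only, using a 128-bit helper type; the names U, R, RR and blk are
  // illustrative and mirror the registers above, blk being the 16-byte
  // block read as two little-endian 64-bit words.  The final reduction
  // and the store back to long[5] follow the loop, as in the code above):
  //
  //   typedef unsigned __int128 u128;
  //   // U = (U + blk + 2^128) * R, partially reduced mod 2^130 - 5.
  //   // U[2]:U[1]:U[0] is the ~130-bit accumulator, R the packed key,
  //   // RR[n] == (R[n] >> 2) * 5.
  //   void poly1305_block(uint64_t U[3], const uint64_t R[2],
  //                       const uint64_t RR[2], const uint64_t blk[2]) {
  //     // S = U + blk + 2^128 (the appended one bit of a full block)
  //     u128 s0 = (u128)U[0] + blk[0];
  //     u128 s1 = (u128)U[1] + blk[1] + (uint64_t)(s0 >> 64);
  //     uint64_t S0 = (uint64_t)s0, S1 = (uint64_t)s1;
  //     uint64_t S2 = U[2] + (uint64_t)(s1 >> 64) + 1;
  //
  //     // U = S * R, folding 2^128 terms via RR (see the note above)
  //     u128 u0 = (u128)S0 * R[0] + (u128)S1 * RR[1] + (u128)S2 * RR[0];
  //     u128 u1 = (u128)S0 * R[1] + (u128)S1 * R[0]  + (u128)S2 * RR[1];
  //     uint64_t u2 = S2 * (R[0] & 3);
  //
  //     // Partial reduction mod 2^130 - 5
  //     u1 += (uint64_t)(u0 >> 64);
  //     u2 += (uint64_t)(u1 >> 64);
  //     uint64_t c = u2 >> 2;                      // bits at and above 2^130
  //     u2 &= 3;
  //     u128 t0 = (u128)(uint64_t)u0 + (u128)c * 5;   // U += c * 5
  //     u128 t1 = (u128)(uint64_t)u1 + (uint64_t)(t0 >> 64);
  //     U[0] = (uint64_t)t0;
  //     U[1] = (uint64_t)t1;
  //     U[2] = u2 + (uint64_t)(t1 >> 64);
  //   }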
10805 
10806   // exception handler for upcall stubs
10807   address generate_upcall_stub_exception_handler() {
10808     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10809     StubCodeMark mark(this, stub_id);
10810     address start = __ pc();
10811 
10812     // Native caller has no idea how to handle exceptions,
10813     // so we just crash here. Up to callee to catch exceptions.
10814     __ verify_oop(r0);
10815     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10816     __ blr(rscratch1);
10817     __ should_not_reach_here();
10818 
10819     return start;
10820   }
10821 
10822   // load Method* target of MethodHandle
10823   // j_rarg0 = jobject receiver
10824   // rmethod = result
10825   address generate_upcall_stub_load_target() {
10826     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10827     StubCodeMark mark(this, stub_id);
10828     address start = __ pc();
10829 
10830     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10831     // Load target method from receiver
10832     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10833     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10834     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10835     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10836                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10837                       noreg, noreg);
10838     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10839 
10840     __ ret(lr);
10841 
10842     return start;
10843   }
10844 
10845 #undef __
10846 #define __ masm->
10847 
10848   class MontgomeryMultiplyGenerator : public MacroAssembler {
10849 
10850     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10851       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10852 
10853     RegSet _toSave;
10854     bool _squaring;
10855 
10856   public:
10857     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10858       : MacroAssembler(as->code()), _squaring(squaring) {
10859 
10860       // Register allocation
10861 
10862       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10863       Pa_base = *regs;       // Argument registers
10864       if (squaring)
10865         Pb_base = Pa_base;
10866       else
10867         Pb_base = *++regs;
10868       Pn_base = *++regs;
10869       Rlen = *++regs;
10870       inv = *++regs;
10871       Pm_base = *++regs;
10872 
10873                           // Working registers:
10874       Ra =  *++regs;        // The current digit of a, b, n, and m.
10875       Rb =  *++regs;
10876       Rm =  *++regs;
10877       Rn =  *++regs;
10878 
10879       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10880       Pb =  *++regs;
10881       Pm =  *++regs;
10882       Pn =  *++regs;
10883 
10884       t0 =  *++regs;        // Three registers which form a
10885       t1 =  *++regs;        // triple-precision accumulator.
10886       t2 =  *++regs;
10887 
10888       Ri =  *++regs;        // Inner and outer loop indexes.
10889       Rj =  *++regs;
10890 
10891       Rhi_ab = *++regs;     // Product registers: low and high parts
10892       Rlo_ab = *++regs;     // of a*b and m*n.
10893       Rhi_mn = *++regs;
10894       Rlo_mn = *++regs;
10895 
10896       // r19 and up are callee-saved.
10897       _toSave = RegSet::range(r19, *regs) + Pm_base;
10898     }
10899 
10900   private:
10901     void save_regs() {
10902       push(_toSave, sp);
10903     }
10904 
10905     void restore_regs() {
10906       pop(_toSave, sp);
10907     }
10908 
10909     template <typename T>
10910     void unroll_2(Register count, T block) {
10911       Label loop, end, odd;
10912       tbnz(count, 0, odd);
10913       cbz(count, end);
10914       align(16);
10915       bind(loop);
10916       (this->*block)();
10917       bind(odd);
10918       (this->*block)();
10919       subs(count, count, 2);
10920       br(Assembler::GT, loop);
10921       bind(end);
10922     }
10923 
10924     template <typename T>
10925     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10926       Label loop, end, odd;
10927       tbnz(count, 0, odd);
10928       cbz(count, end);
10929       align(16);
10930       bind(loop);
10931       (this->*block)(d, s, tmp);
10932       bind(odd);
10933       (this->*block)(d, s, tmp);
10934       subs(count, count, 2);
10935       br(Assembler::GT, loop);
10936       bind(end);
10937     }
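    // Both unroll_2 variants emit the loop body twice per iteration;
    // the generated control flow is approximately:
    //
    //          if (count & 1) goto odd;   // enter at the second copy
    //          if (count == 0) goto end;
    //   loop:  block();
    //   odd:   block();
    //          count -= 2;
    //          if (count > 0) goto loop;
    //   end:   ;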
10938 
10939     void pre1(RegisterOrConstant i) {
10940       block_comment("pre1");
10941       // Pa = Pa_base;
10942       // Pb = Pb_base + i;
10943       // Pm = Pm_base;
10944       // Pn = Pn_base + i;
10945       // Ra = *Pa;
10946       // Rb = *Pb;
10947       // Rm = *Pm;
10948       // Rn = *Pn;
10949       ldr(Ra, Address(Pa_base));
10950       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10951       ldr(Rm, Address(Pm_base));
10952       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10953       lea(Pa, Address(Pa_base));
10954       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10955       lea(Pm, Address(Pm_base));
10956       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10957 
10958       // Zero the m*n result.
10959       mov(Rhi_mn, zr);
10960       mov(Rlo_mn, zr);
10961     }
10962 
10963     // The core multiply-accumulate step of a Montgomery
10964     // multiplication.  The idea is to schedule operations as a
10965     // pipeline so that instructions with long latencies (loads and
10966     // multiplies) have time to complete before their results are
10967     // used.  This most benefits in-order implementations of the
10968     // architecture, but out-of-order ones also benefit.
10969     void step() {
10970       block_comment("step");
10971       // MACC(Ra, Rb, t0, t1, t2);
10972       // Ra = *++Pa;
10973       // Rb = *--Pb;
10974       umulh(Rhi_ab, Ra, Rb);
10975       mul(Rlo_ab, Ra, Rb);
10976       ldr(Ra, pre(Pa, wordSize));
10977       ldr(Rb, pre(Pb, -wordSize));
10978       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
10979                                        // previous iteration.
10980       // MACC(Rm, Rn, t0, t1, t2);
10981       // Rm = *++Pm;
10982       // Rn = *--Pn;
10983       umulh(Rhi_mn, Rm, Rn);
10984       mul(Rlo_mn, Rm, Rn);
10985       ldr(Rm, pre(Pm, wordSize));
10986       ldr(Rn, pre(Pn, -wordSize));
10987       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10988     }
10989 
10990     void post1() {
10991       block_comment("post1");
10992 
10993       // MACC(Ra, Rb, t0, t1, t2);
10994       // Ra = *++Pa;
10995       // Rb = *--Pb;
10996       umulh(Rhi_ab, Ra, Rb);
10997       mul(Rlo_ab, Ra, Rb);
10998       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10999       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11000 
11001       // *Pm = Rm = t0 * inv;
11002       mul(Rm, t0, inv);
11003       str(Rm, Address(Pm));
11004 
11005       // MACC(Rm, Rn, t0, t1, t2);
11006       // t0 = t1; t1 = t2; t2 = 0;
11007       umulh(Rhi_mn, Rm, Rn);
11008 
11009 #ifndef PRODUCT
11010       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11011       {
11012         mul(Rlo_mn, Rm, Rn);
11013         add(Rlo_mn, t0, Rlo_mn);
11014         Label ok;
11015         cbz(Rlo_mn, ok); {
11016           stop("broken Montgomery multiply");
11017         } bind(ok);
11018       }
11019 #endif
11020       // We have very carefully set things up so that
11021       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11022       // the lower half of Rm * Rn because we know the result already:
11023       // it must be -t0.  t0 + (-t0) must generate a carry iff
11024       // t0 != 0.  So, rather than do a mul and an adds we just set
11025       // the carry flag iff t0 is nonzero.
11026       //
11027       // mul(Rlo_mn, Rm, Rn);
11028       // adds(zr, t0, Rlo_mn);
11029       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11030       adcs(t0, t1, Rhi_mn);
11031       adc(t1, t2, zr);
11032       mov(t2, zr);
11033     }
11034 
11035     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11036       block_comment("pre2");
11037       // Pa = Pa_base + i-len;
11038       // Pb = Pb_base + len;
11039       // Pm = Pm_base + i-len;
11040       // Pn = Pn_base + len;
11041 
11042       if (i.is_register()) {
11043         sub(Rj, i.as_register(), len);
11044       } else {
11045         mov(Rj, i.as_constant());
11046         sub(Rj, Rj, len);
11047       }
11048       // Rj == i-len
11049 
11050       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11051       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11052       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11053       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11054 
11055       // Ra = *++Pa;
11056       // Rb = *--Pb;
11057       // Rm = *++Pm;
11058       // Rn = *--Pn;
11059       ldr(Ra, pre(Pa, wordSize));
11060       ldr(Rb, pre(Pb, -wordSize));
11061       ldr(Rm, pre(Pm, wordSize));
11062       ldr(Rn, pre(Pn, -wordSize));
11063 
11064       mov(Rhi_mn, zr);
11065       mov(Rlo_mn, zr);
11066     }
11067 
11068     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11069       block_comment("post2");
11070       if (i.is_constant()) {
11071         mov(Rj, i.as_constant()-len.as_constant());
11072       } else {
11073         sub(Rj, i.as_register(), len);
11074       }
11075 
11076       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11077 
11078       // As soon as we know the least significant digit of our result,
11079       // store it.
11080       // Pm_base[i-len] = t0;
11081       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11082 
11083       // t0 = t1; t1 = t2; t2 = 0;
11084       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11085       adc(t1, t2, zr);
11086       mov(t2, zr);
11087     }
11088 
11089     // A carry in t0 after Montgomery multiplication means that we
11090     // should subtract multiples of n from our result in m.  We'll
11091     // keep doing that until there is no carry.
11092     void normalize(RegisterOrConstant len) {
11093       block_comment("normalize");
11094       // while (t0)
11095       //   t0 = sub(Pm_base, Pn_base, t0, len);
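      // Here "sub" subtracts n from m in place, with borrow
      // propagation, and returns t0 decremented by the final borrow.
      // The value being normalized is t0 * 2^(64*len) + m; each pass
      // of the loop below computes m -= n and then t0 -= borrow,
      // until t0 reaches zero.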
11096       Label loop, post, again;
11097       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11098       cbz(t0, post); {
11099         bind(again); {
11100           mov(i, zr);
11101           mov(cnt, len);
11102           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11103           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11104           subs(zr, zr, zr); // set carry flag, i.e. no borrow
11105           align(16);
11106           bind(loop); {
11107             sbcs(Rm, Rm, Rn);
11108             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11109             add(i, i, 1);
11110             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11111             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11112             sub(cnt, cnt, 1);
11113           } cbnz(cnt, loop);
11114           sbc(t0, t0, zr);
11115         } cbnz(t0, again);
11116       } bind(post);
11117     }
11118 
11119     // Move memory at s to d, reversing words.
11120     //    Increments d to end of copied memory
11121     //    Destroys tmp1, tmp2
11122     //    Preserves len
11123     //    Leaves s pointing to the address which was in d at start
11124     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11125       assert(tmp1->encoding() < r19->encoding(), "register corruption");
11126       assert(tmp2->encoding() < r19->encoding(), "register corruption");
11127 
11128       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11129       mov(tmp1, len);
11130       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11131       sub(s, d, len, ext::uxtw, LogBytesPerWord);
11132     }
11133     // where
11134     void reverse1(Register d, Register s, Register tmp) {
11135       ldr(tmp, pre(s, -wordSize));
11136       ror(tmp, tmp, 32);
11137       str(tmp, post(d, wordSize));
11138     }
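    // In C, approximately (a sketch of the two routines above; on
    // return d has advanced past the copied words and s holds the old
    // value of d):
    //
    //   void reverse(julong *d, julong *s, int len) {
    //     julong *from = s + len;           // read backwards from the end
    //     for (int i = 0; i < len; i++) {
    //       julong w = *--from;
    //       *d++ = (w << 32) | (w >> 32);   // swap the 32-bit halves
    //     }
    //     // now: s (out) == d (in), d (out) == d (in) + len
    //   }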
11139 
11140     void step_squaring() {
11141       // An extra ACC
11142       step();
11143       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11144     }
11145 
11146     void last_squaring(RegisterOrConstant i) {
11147       Label dont;
11148       // if ((i & 1) == 0) {
11149       tbnz(i.as_register(), 0, dont); {
11150         // MACC(Ra, Rb, t0, t1, t2);
11151         // Ra = *++Pa;
11152         // Rb = *--Pb;
11153         umulh(Rhi_ab, Ra, Rb);
11154         mul(Rlo_ab, Ra, Rb);
11155         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11156       } bind(dont);
11157     }
11158 
11159     void extra_step_squaring() {
11160       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11161 
11162       // MACC(Rm, Rn, t0, t1, t2);
11163       // Rm = *++Pm;
11164       // Rn = *--Pn;
11165       umulh(Rhi_mn, Rm, Rn);
11166       mul(Rlo_mn, Rm, Rn);
11167       ldr(Rm, pre(Pm, wordSize));
11168       ldr(Rn, pre(Pn, -wordSize));
11169     }
11170 
11171     void post1_squaring() {
11172       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11173 
11174       // *Pm = Rm = t0 * inv;
11175       mul(Rm, t0, inv);
11176       str(Rm, Address(Pm));
11177 
11178       // MACC(Rm, Rn, t0, t1, t2);
11179       // t0 = t1; t1 = t2; t2 = 0;
11180       umulh(Rhi_mn, Rm, Rn);
11181 
11182 #ifndef PRODUCT
11183       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11184       {
11185         mul(Rlo_mn, Rm, Rn);
11186         add(Rlo_mn, t0, Rlo_mn);
11187         Label ok;
11188         cbz(Rlo_mn, ok); {
11189           stop("broken Montgomery multiply");
11190         } bind(ok);
11191       }
11192 #endif
11193       // We have very carefully set things up so that
11194       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11195       // the lower half of Rm * Rn because we know the result already:
11196       // it must be -t0.  t0 + (-t0) must generate a carry iff
11197       // t0 != 0.  So, rather than do a mul and an adds we just set
11198       // the carry flag iff t0 is nonzero.
11199       //
11200       // mul(Rlo_mn, Rm, Rn);
11201       // adds(zr, t0, Rlo_mn);
11202       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11203       adcs(t0, t1, Rhi_mn);
11204       adc(t1, t2, zr);
11205       mov(t2, zr);
11206     }
11207 
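    // The triple-precision accumulate used throughout:
    // (t2:t1:t0) += (Rhi:Rlo).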
11208     void acc(Register Rhi, Register Rlo,
11209              Register t0, Register t1, Register t2) {
11210       adds(t0, t0, Rlo);
11211       adcs(t1, t1, Rhi);
11212       adc(t2, t2, zr);
11213     }
11214 
11215   public:
11216     /**
11217      * Fast Montgomery multiplication.  The derivation of the
11218      * algorithm is in A Cryptographic Library for the Motorola
11219      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11220      *
11221      * Arguments:
11222      *
11223      * Inputs for multiplication:
11224      *   c_rarg0   - int array elements a
11225      *   c_rarg1   - int array elements b
11226      *   c_rarg2   - int array elements n (the modulus)
11227      *   c_rarg3   - int length
11228      *   c_rarg4   - int inv
11229      *   c_rarg5   - int array elements m (the result)
11230      *
11231      * Inputs for squaring:
11232      *   c_rarg0   - int array elements a
11233      *   c_rarg1   - int array elements n (the modulus)
11234      *   c_rarg2   - int length
11235      *   c_rarg3   - int inv
11236      *   c_rarg4   - int array elements m (the result)
11237      *
11238      */
11239     address generate_multiply() {
11240       Label argh, nothing;
11241       bind(argh);
11242       stop("MontgomeryMultiply total_allocation must be <= 8192");
11243 
11244       align(CodeEntryAlignment);
11245       address entry = pc();
11246 
11247       cbzw(Rlen, nothing);
11248 
11249       enter();
11250 
11251       // Make room.
11252       cmpw(Rlen, 512);
11253       br(Assembler::HI, argh);
11254       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11255       andr(sp, Ra, -2 * wordSize);
11256 
11257       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11258 
11259       {
11260         // Copy input args, reversing as we go.  We use Ra as a
11261         // temporary variable.
11262         reverse(Ra, Pa_base, Rlen, t0, t1);
11263         if (!_squaring)
11264           reverse(Ra, Pb_base, Rlen, t0, t1);
11265         reverse(Ra, Pn_base, Rlen, t0, t1);
11266       }
11267 
11268       // Push all call-saved registers and also Pm_base which we'll need
11269       // at the end.
11270       save_regs();
11271 
11272 #ifndef PRODUCT
11273       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11274       {
11275         ldr(Rn, Address(Pn_base, 0));
11276         mul(Rlo_mn, Rn, inv);
11277         subs(zr, Rlo_mn, -1);
11278         Label ok;
11279         br(EQ, ok); {
11280           stop("broken inverse in Montgomery multiply");
11281         } bind(ok);
11282       }
11283 #endif
11284 
11285       mov(Pm_base, Ra);
11286 
11287       mov(t0, zr);
11288       mov(t1, zr);
11289       mov(t2, zr);
11290 
11291       block_comment("for (int i = 0; i < len; i++) {");
11292       mov(Ri, zr); {
11293         Label loop, end;
11294         cmpw(Ri, Rlen);
11295         br(Assembler::GE, end);
11296 
11297         bind(loop);
11298         pre1(Ri);
11299 
11300         block_comment("  for (j = i; j; j--) {"); {
11301           movw(Rj, Ri);
11302           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11303         } block_comment("  } // j");
11304 
11305         post1();
11306         addw(Ri, Ri, 1);
11307         cmpw(Ri, Rlen);
11308         br(Assembler::LT, loop);
11309         bind(end);
11310         block_comment("} // i");
11311       }
11312 
11313       block_comment("for (int i = len; i < 2*len; i++) {");
11314       mov(Ri, Rlen); {
11315         Label loop, end;
11316         cmpw(Ri, Rlen, Assembler::LSL, 1);
11317         br(Assembler::GE, end);
11318 
11319         bind(loop);
11320         pre2(Ri, Rlen);
11321 
11322         block_comment("  for (j = len*2-i-1; j; j--) {"); {
11323           lslw(Rj, Rlen, 1);
11324           subw(Rj, Rj, Ri);
11325           subw(Rj, Rj, 1);
11326           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11327         } block_comment("  } // j");
11328 
11329         post2(Ri, Rlen);
11330         addw(Ri, Ri, 1);
11331         cmpw(Ri, Rlen, Assembler::LSL, 1);
11332         br(Assembler::LT, loop);
11333         bind(end);
11334       }
11335       block_comment("} // i");
11336 
11337       normalize(Rlen);
11338 
11339       mov(Ra, Pm_base);  // Save Pm_base in Ra
11340       restore_regs();  // Restore caller's Pm_base
11341 
11342       // Copy our result into caller's Pm_base
11343       reverse(Pm_base, Ra, Rlen, t0, t1);
11344 
11345       leave();
11346       bind(nothing);
11347       ret(lr);
11348 
11349       return entry;
11350     }
11351     // In C, approximately:
11352 
11353     // void
11354     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11355     //                     julong Pn_base[], julong Pm_base[],
11356     //                     julong inv, int len) {
11357     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11358     //   julong *Pa, *Pb, *Pn, *Pm;
11359     //   julong Ra, Rb, Rn, Rm;
11360 
11361     //   int i;
11362 
11363     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11364 
11365     //   for (i = 0; i < len; i++) {
11366     //     int j;
11367 
11368     //     Pa = Pa_base;
11369     //     Pb = Pb_base + i;
11370     //     Pm = Pm_base;
11371     //     Pn = Pn_base + i;
11372 
11373     //     Ra = *Pa;
11374     //     Rb = *Pb;
11375     //     Rm = *Pm;
11376     //     Rn = *Pn;
11377 
11378     //     int iters = i;
11379     //     for (j = 0; iters--; j++) {
11380     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11381     //       MACC(Ra, Rb, t0, t1, t2);
11382     //       Ra = *++Pa;
11383     //       Rb = *--Pb;
11384     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11385     //       MACC(Rm, Rn, t0, t1, t2);
11386     //       Rm = *++Pm;
11387     //       Rn = *--Pn;
11388     //     }
11389 
11390     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11391     //     MACC(Ra, Rb, t0, t1, t2);
11392     //     *Pm = Rm = t0 * inv;
11393     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11394     //     MACC(Rm, Rn, t0, t1, t2);
11395 
11396     //     assert(t0 == 0, "broken Montgomery multiply");
11397 
11398     //     t0 = t1; t1 = t2; t2 = 0;
11399     //   }
11400 
11401     //   for (i = len; i < 2*len; i++) {
11402     //     int j;
11403 
11404     //     Pa = Pa_base + i-len;
11405     //     Pb = Pb_base + len;
11406     //     Pm = Pm_base + i-len;
11407     //     Pn = Pn_base + len;
11408 
11409     //     Ra = *++Pa;
11410     //     Rb = *--Pb;
11411     //     Rm = *++Pm;
11412     //     Rn = *--Pn;
11413 
11414     //     int iters = len*2-i-1;
11415     //     for (j = i-len+1; iters--; j++) {
11416     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11417     //       MACC(Ra, Rb, t0, t1, t2);
11418     //       Ra = *++Pa;
11419     //       Rb = *--Pb;
11420     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11421     //       MACC(Rm, Rn, t0, t1, t2);
11422     //       Rm = *++Pm;
11423     //       Rn = *--Pn;
11424     //     }
11425 
11426     //     Pm_base[i-len] = t0;
11427     //     t0 = t1; t1 = t2; t2 = 0;
11428     //   }
11429 
11430     //   while (t0)
11431     //     t0 = sub(Pm_base, Pn_base, t0, len);
11432     // }
11433 
11434     /**
11435      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
11436      * multiplies than Montgomery multiplication so it should be up to
11437      * 25% faster.  However, its loop control is more complex and it
11438      * may actually run slower on some machines.
11439      *
11440      * Arguments:
11441      *
11442      * Inputs:
11443      *   c_rarg0   - int array elements a
11444      *   c_rarg1   - int array elements n (the modulus)
11445      *   c_rarg2   - int length
11446      *   c_rarg3   - int inv
11447      *   c_rarg4   - int array elements m (the result)
11448      *
11449      */
11450     address generate_square() {
11451       Label argh;
11452       bind(argh);
11453       stop("MontgomeryMultiply total_allocation must be <= 8192");
11454 
11455       align(CodeEntryAlignment);
11456       address entry = pc();
11457 
11458       enter();
11459 
11460       // Make room.
11461       cmpw(Rlen, 512);
11462       br(Assembler::HI, argh);
11463       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11464       andr(sp, Ra, -2 * wordSize);
11465 
11466       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11467 
11468       {
11469         // Copy input args, reversing as we go.  We use Ra as a
11470         // temporary variable.
11471         reverse(Ra, Pa_base, Rlen, t0, t1);
11472         reverse(Ra, Pn_base, Rlen, t0, t1);
11473       }
11474 
11475       // Push all call-saved registers and also Pm_base which we'll need
11476       // at the end.
11477       save_regs();
11478 
11479       mov(Pm_base, Ra);
11480 
11481       mov(t0, zr);
11482       mov(t1, zr);
11483       mov(t2, zr);
11484 
11485       block_comment("for (int i = 0; i < len; i++) {");
11486       mov(Ri, zr); {
11487         Label loop, end;
11488         bind(loop);
11489         cmp(Ri, Rlen);
11490         br(Assembler::GE, end);
11491 
11492         pre1(Ri);
11493 
11494         block_comment("for (j = (i+1)/2; j; j--) {"); {
11495           add(Rj, Ri, 1);
11496           lsr(Rj, Rj, 1);
11497           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11498         } block_comment("  } // j");
11499 
11500         last_squaring(Ri);
11501 
11502         block_comment("  for (j = i/2; j; j--) {"); {
11503           lsr(Rj, Ri, 1);
11504           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11505         } block_comment("  } // j");
11506 
11507         post1_squaring();
11508         add(Ri, Ri, 1);
11509         cmp(Ri, Rlen);
11510         br(Assembler::LT, loop);
11511 
11512         bind(end);
11513         block_comment("} // i");
11514       }
11515 
11516       block_comment("for (int i = len; i < 2*len; i++) {");
11517       mov(Ri, Rlen); {
11518         Label loop, end;
11519         bind(loop);
11520         cmp(Ri, Rlen, Assembler::LSL, 1);
11521         br(Assembler::GE, end);
11522 
11523         pre2(Ri, Rlen);
11524 
11525         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11526           lsl(Rj, Rlen, 1);
11527           sub(Rj, Rj, Ri);
11528           sub(Rj, Rj, 1);
11529           lsr(Rj, Rj, 1);
11530           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11531         } block_comment("  } // j");
11532 
11533         last_squaring(Ri);
11534 
11535         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11536           lsl(Rj, Rlen, 1);
11537           sub(Rj, Rj, Ri);
11538           lsr(Rj, Rj, 1);
11539           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11540         } block_comment("  } // j");
11541 
11542         post2(Ri, Rlen);
11543         add(Ri, Ri, 1);
11544         cmp(Ri, Rlen, Assembler::LSL, 1);
11545 
11546         br(Assembler::LT, loop);
11547         bind(end);
11548         block_comment("} // i");
11549       }
11550 
11551       normalize(Rlen);
11552 
11553       mov(Ra, Pm_base);  // Save Pm_base in Ra
11554       restore_regs();  // Restore caller's Pm_base
11555 
11556       // Copy our result into caller's Pm_base
11557       reverse(Pm_base, Ra, Rlen, t0, t1);
11558 
11559       leave();
11560       ret(lr);
11561 
11562       return entry;
11563     }
11564     // In C, approximately:
11565 
11566     // void
11567     // montgomery_square(julong Pa_base[], julong Pn_base[],
11568     //                   julong Pm_base[], julong inv, int len) {
11569     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11570     //   julong *Pa, *Pb, *Pn, *Pm;
11571     //   julong Ra, Rb, Rn, Rm;
11572 
11573     //   int i;
11574 
11575     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11576 
11577     //   for (i = 0; i < len; i++) {
11578     //     int j;
11579 
11580     //     Pa = Pa_base;
11581     //     Pb = Pa_base + i;
11582     //     Pm = Pm_base;
11583     //     Pn = Pn_base + i;
11584 
11585     //     Ra = *Pa;
11586     //     Rb = *Pb;
11587     //     Rm = *Pm;
11588     //     Rn = *Pn;
11589 
11590     //     int iters = (i+1)/2;
11591     //     for (j = 0; iters--; j++) {
11592     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11593     //       MACC2(Ra, Rb, t0, t1, t2);
11594     //       Ra = *++Pa;
11595     //       Rb = *--Pb;
11596     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11597     //       MACC(Rm, Rn, t0, t1, t2);
11598     //       Rm = *++Pm;
11599     //       Rn = *--Pn;
11600     //     }
11601     //     if ((i & 1) == 0) {
11602     //       assert(Ra == Pa_base[j], "must be");
11603     //       MACC(Ra, Ra, t0, t1, t2);
11604     //     }
11605     //     iters = i/2;
11606     //     assert(iters == i-j, "must be");
11607     //     for (; iters--; j++) {
11608     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11609     //       MACC(Rm, Rn, t0, t1, t2);
11610     //       Rm = *++Pm;
11611     //       Rn = *--Pn;
11612     //     }
11613 
11614     //     *Pm = Rm = t0 * inv;
11615     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11616     //     MACC(Rm, Rn, t0, t1, t2);
11617 
11618     //     assert(t0 == 0, "broken Montgomery multiply");
11619 
11620     //     t0 = t1; t1 = t2; t2 = 0;
11621     //   }
11622 
11623     //   for (i = len; i < 2*len; i++) {
11624     //     int start = i-len+1;
11625     //     int end = start + (len - start)/2;
11626     //     int j;
11627 
11628     //     Pa = Pa_base + i-len;
11629     //     Pb = Pa_base + len;
11630     //     Pm = Pm_base + i-len;
11631     //     Pn = Pn_base + len;
11632 
11633     //     Ra = *++Pa;
11634     //     Rb = *--Pb;
11635     //     Rm = *++Pm;
11636     //     Rn = *--Pn;
11637 
11638     //     int iters = (2*len-i-1)/2;
11639     //     assert(iters == end-start, "must be");
11640     //     for (j = start; iters--; j++) {
11641     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11642     //       MACC2(Ra, Rb, t0, t1, t2);
11643     //       Ra = *++Pa;
11644     //       Rb = *--Pb;
11645     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11646     //       MACC(Rm, Rn, t0, t1, t2);
11647     //       Rm = *++Pm;
11648     //       Rn = *--Pn;
11649     //     }
11650     //     if ((i & 1) == 0) {
11651     //       assert(Ra == Pa_base[j], "must be");
11652     //       MACC(Ra, Ra, t0, t1, t2);
11653     //     }
11654     //     iters =  (2*len-i)/2;
11655     //     assert(iters == len-j, "must be");
11656     //     for (; iters--; j++) {
11657     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11658     //       MACC(Rm, Rn, t0, t1, t2);
11659     //       Rm = *++Pm;
11660     //       Rn = *--Pn;
11661     //     }
11662     //     Pm_base[i-len] = t0;
11663     //     t0 = t1; t1 = t2; t2 = 0;
11664     //   }
11665 
11666     //   while (t0)
11667     //     t0 = sub(Pm_base, Pn_base, t0, len);
11668     // }
11669   };
11670 
11671   // Initialization
11672   void generate_preuniverse_stubs() {
11673     // preuniverse stubs are not needed for aarch64
11674   }
11675 
11676   void generate_initial_stubs() {
11677     // Generate initial stubs and initialize the entry points
11678
11679     // Entry points that exist on all platforms. Note: This is code
11680     // that could be shared among different platforms - however the
11681     // benefit seems to be smaller than the disadvantage of having a
11682     // much more complicated generator structure. See also comment in
11683     // stubRoutines.hpp.
11684 
11685     StubRoutines::_forward_exception_entry = generate_forward_exception();
11686 
11687     StubRoutines::_call_stub_entry =
11688       generate_call_stub(StubRoutines::_call_stub_return_address);
11689 
11690     // is referenced by megamorphic call
11691     StubRoutines::_catch_exception_entry = generate_catch_exception();
11692 
11693     // Initialize table for copy memory (arraycopy) check.
11694     if (UnsafeMemoryAccess::_table == nullptr) {
11695       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11696     }
11697 
11698     if (UseCRC32Intrinsics) {
11699       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11700     }
11701 
11702     if (UseCRC32CIntrinsics) {
11703       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11704     }
11705 
11706     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11707       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11708     }
11709 
11710     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11711       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11712     }
11713 
11714     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11715         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11716       StubRoutines::_hf2f = generate_float16ToFloat();
11717       StubRoutines::_f2hf = generate_floatToFloat16();
11718     }
11719   }
11720 
11721   void generate_continuation_stubs() {
11722     // Continuation stubs:
11723     StubRoutines::_cont_thaw          = generate_cont_thaw();
11724     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11725     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11726     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11727   }
11728 
11729   void generate_final_stubs() {
11730     // support for verify_oop (must happen after universe_init)
11731     if (VerifyOops) {
11732       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11733     }
11734 
11735     // arraycopy stubs used by compilers
11736     generate_arraycopy_stubs();
11737 
11738     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11739 
11740     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11741 
11742     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11743     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11744 
11745 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11746 
11747     generate_atomic_entry_points();
11748 
11749 #endif // LINUX && !__ARM_FEATURE_ATOMICS
11750 
11751 #ifdef COMPILER2
11752     if (UseSecondarySupersTable) {
11753       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11754       if (! InlineSecondarySupersTest) {
11755         generate_lookup_secondary_supers_table_stub();
11756       }
11757     }
11758 #endif
11759 
11760     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11761 
11762     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11763   }
11764 
11765   void generate_compiler_stubs() {
11766 #if COMPILER2_OR_JVMCI
11767 
11768     if (UseSVE == 0) {
11769       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11770     }
11771 
11772     // array equals stub for large arrays.
11773     if (!UseSimpleArrayEquals) {
11774       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11775     }
11776 
11777     // arrays_hashcode stubs for large arrays.
11778     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11779     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11780     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11781     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11782     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11783 
11784     // byte_array_inflate stub for large arrays.
11785     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11786 
11787     // countPositives stub for large arrays.
11788     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11789 
11790     generate_compare_long_strings();
11791 
11792     generate_string_indexof_stubs();
11793 
11794 #ifdef COMPILER2
11795     if (UseMultiplyToLenIntrinsic) {
11796       StubRoutines::_multiplyToLen = generate_multiplyToLen();
11797     }
11798 
11799     if (UseSquareToLenIntrinsic) {
11800       StubRoutines::_squareToLen = generate_squareToLen();
11801     }
11802 
11803     if (UseMulAddIntrinsic) {
11804       StubRoutines::_mulAdd = generate_mulAdd();
11805     }
11806 
11807     if (UseSIMDForBigIntegerShiftIntrinsics) {
11808       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11809       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
11810     }
11811 
11812     if (UseMontgomeryMultiplyIntrinsic) {
11813       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
11814       StubCodeMark mark(this, stub_id);
11815       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11816       StubRoutines::_montgomeryMultiply = g.generate_multiply();
11817     }
11818 
11819     if (UseMontgomerySquareIntrinsic) {
11820       StubId stub_id = StubId::stubgen_montgomerySquare_id;
11821       StubCodeMark mark(this, stub_id);
11822       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11823       // We use generate_multiply() rather than generate_square()
11824       // because it's faster for the sizes of modulus we care about.
11825       StubRoutines::_montgomerySquare = g.generate_multiply();
11826     }
11827 
11828 #endif // COMPILER2
11829 
11830     if (UseChaCha20Intrinsics) {
11831       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11832     }
11833 
11834     if (UseKyberIntrinsics) {
11835       StubRoutines::_kyberNtt = generate_kyberNtt();
11836       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11837       StubRoutines::_kyberNttMult = generate_kyberNttMult();
11838       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11839       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11840       StubRoutines::_kyber12To16 = generate_kyber12To16();
11841       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11842     }
11843 
11844     if (UseDilithiumIntrinsics) {
11845       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11846       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11847       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11848       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11849       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11850     }
11851 
11852     if (UseBASE64Intrinsics) {
11853       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11854       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11855     }
11856 
11857     // data cache line writeback
11858     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11859     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11860 
11861     if (UseAESIntrinsics) {
11862       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11863       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11864       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11865       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11866       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11867     }
11868     if (UseGHASHIntrinsics) {
11869       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11870       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11871     }
11872     if (UseAESIntrinsics && UseGHASHIntrinsics) {
11873       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11874     }
11875 
11876     if (UseMD5Intrinsics) {
11877       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
11878       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
11879     }
11880     if (UseSHA1Intrinsics) {
11881       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
11882       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
11883     }
11884     if (UseSHA256Intrinsics) {
11885       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
11886       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
11887     }
11888     if (UseSHA512Intrinsics) {
11889       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
11890       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
11891     }
11892     if (UseSHA3Intrinsics) {
11893 
11894       StubRoutines::_double_keccak         = generate_double_keccak();
11895       if (UseSIMDForSHA3Intrinsic) {
11896          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
11897          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
11898       } else {
11899          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
11900          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
11901       }
11902     }
11903 
11904     if (UsePoly1305Intrinsics) {
11905       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11906     }
11907 
11908     // generate Adler32 intrinsics code
11909     if (UseAdler32Intrinsics) {
11910       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11911     }
11912 
11913 #endif // COMPILER2_OR_JVMCI
11914   }
11915 
11916  public:
11917   StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
11918     switch(blob_id) {
11919     case BlobId::stubgen_preuniverse_id:
11920       generate_preuniverse_stubs();
11921       break;
11922     case BlobId::stubgen_initial_id:
11923       generate_initial_stubs();
11924       break;
11925     case BlobId::stubgen_continuation_id:
11926       generate_continuation_stubs();
11927       break;
11928     case BlobId::stubgen_compiler_id:
11929       generate_compiler_stubs();
11930       break;
11931     case BlobId::stubgen_final_id:
11932       generate_final_stubs();
11933       break;
11934     default:
11935       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
11936       break;
11937     };
11938   }
11939 }; // end class declaration
11940 
11941 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
11942   StubGenerator g(code, blob_id);
11943 }
11944 
11945 
11946 #if defined (LINUX)
11947 
11948 // Define pointers to atomic stubs and initialize them to point to the
11949 // code in atomic_aarch64.S.
11950 
11951 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
11952   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
11953     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
11954   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
11955     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
11956 
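// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) expands,
// modulo whitespace, to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_relaxed_impl
//     = aarch64_atomic_fetch_add_4_relaxed_default_impl;
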
11957 DEFAULT_ATOMIC_OP(fetch_add, 4, )
11958 DEFAULT_ATOMIC_OP(fetch_add, 8, )
11959 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
11960 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
11961 DEFAULT_ATOMIC_OP(xchg, 4, )
11962 DEFAULT_ATOMIC_OP(xchg, 8, )
11963 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
11964 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
11965 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
11966 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
11967 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
11968 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
11969 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
11970 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
11971 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
11972 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
11973 
11974 #undef DEFAULT_ATOMIC_OP
11975 
11976 #endif // LINUX