1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "code/aotCodeCache.hpp"
   31 #include "compiler/oopMap.hpp"
   32 #include "gc/shared/barrierSet.hpp"
   33 #include "gc/shared/barrierSetAssembler.hpp"
   34 #include "gc/shared/gc_globals.hpp"
   35 #include "gc/shared/tlab_globals.hpp"
   36 #include "interpreter/interpreter.hpp"
   37 #include "memory/universe.hpp"
   38 #include "nativeInst_aarch64.hpp"
   39 #include "oops/instanceOop.hpp"
   40 #include "oops/method.hpp"
   41 #include "oops/objArrayKlass.hpp"
   42 #include "oops/oop.inline.hpp"
   43 #include "prims/methodHandles.hpp"
   44 #include "prims/upcallLinker.hpp"
   45 #include "runtime/arguments.hpp"
   46 #include "runtime/atomic.hpp"
   47 #include "runtime/continuation.hpp"
   48 #include "runtime/continuationEntry.inline.hpp"
   49 #include "runtime/frame.inline.hpp"
   50 #include "runtime/handles.inline.hpp"
   51 #include "runtime/javaThread.hpp"
   52 #include "runtime/sharedRuntime.hpp"
   53 #include "runtime/stubCodeGenerator.hpp"
   54 #include "runtime/stubRoutines.hpp"
   55 #include "utilities/align.hpp"
   56 #include "utilities/checkedCast.hpp"
   57 #include "utilities/debug.hpp"
   58 #include "utilities/globalDefinitions.hpp"
   59 #include "utilities/intpow.hpp"
   60 #include "utilities/powerOfTwo.hpp"
   61 #ifdef COMPILER2
   62 #include "opto/runtime.hpp"
   63 #endif
   64 #if INCLUDE_ZGC
   65 #include "gc/z/zThreadLocalData.hpp"
   66 #endif
   67 
   68 // Declaration and definition of StubGenerator (no .hpp file).
   69 // For a more detailed description of the stub routine structure
   70 // see the comment in stubRoutines.hpp
   71 
   72 #undef __
   73 #define __ _masm->
   74 
   75 #ifdef PRODUCT
   76 #define BLOCK_COMMENT(str) /* nothing */
   77 #else
   78 #define BLOCK_COMMENT(str) __ block_comment(str)
   79 #endif
   80 
   81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   82 
   83 // Stub Code definitions
   84 
   85 class StubGenerator: public StubCodeGenerator {
   86  private:
   87 
   88 #ifdef PRODUCT
   89 #define inc_counter_np(counter) ((void)0)
   90 #else
   91   void inc_counter_np_(uint& counter) {
   92     __ incrementw(ExternalAddress((address)&counter));
   93   }
   94 #define inc_counter_np(counter) \
   95   BLOCK_COMMENT("inc_counter " #counter); \
   96   inc_counter_np_(counter);
   97 #endif
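
        // Example use (an illustrative sketch, assuming one of the
        // non-product SharedRuntime counters such as
        // SharedRuntime::_jbyte_array_copy_ctr):
        //
        //   inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr);
        //
        // In non-PRODUCT builds this emits a block comment and bumps the
        // 32-bit counter in place; in PRODUCT builds it compiles to nothing.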
   98 
   99   // Call stubs are used to call Java from C
  100   //
  101   // Arguments:
  102   //    c_rarg0:   call wrapper address                   address
  103   //    c_rarg1:   result                                 address
  104   //    c_rarg2:   result type                            BasicType
  105   //    c_rarg3:   method                                 Method*
  106   //    c_rarg4:   (interpreter) entry point              address
  107   //    c_rarg5:   parameters                             intptr_t*
  108   //    c_rarg6:   parameter size (in words)              int
  109   //    c_rarg7:   thread                                 Thread*
  110   //
  111   // There is no return from the stub itself as any Java result
  112   // is written to result
  113   //
  114   // we save r30 (lr) as the return PC at the base of the frame and
  115   // link r29 (fp) below it as the frame pointer installing sp (r31)
  116   // into fp.
  117   //
  118   // we save r0-r7, which accounts for all the c arguments.
  119   //
  120   // TODO: strictly do we need to save them all? they are treated as
  121   // volatile by C so could we omit saving the ones we are going to
  122   // place in global registers (thread? method?) or those we only use
  123   // during setup of the Java call?
  124   //
  125   // we don't need to save r8 which C uses as an indirect result location
  126   // return register.
  127   //
  128   // we don't need to save r9-r15 which both C and Java treat as
  129   // volatile
  130   //
  131   // we don't need to save r16-r18 because Java does not use them
  132   //
  133   // we save r19-r28 which Java uses as scratch registers and C
  134   // expects to be callee-save
  135   //
  136   // we save the bottom 64 bits of each value stored in v8-v15; it is
  137   // the responsibility of the caller to preserve larger values.
  138   //
  139   // so the stub frame looks like this when we enter Java code
  140   //
  141   //     [ return_from_Java     ] <--- sp
  142   //     [ argument word n      ]
  143   //      ...
  144   // -29 [ argument word 1      ]
  145   // -28 [ saved Floating-point Control Register ]
  146   // -26 [ saved v15            ] <--- sp_after_call
  147   // -25 [ saved v14            ]
  148   // -24 [ saved v13            ]
  149   // -23 [ saved v12            ]
  150   // -22 [ saved v11            ]
  151   // -21 [ saved v10            ]
  152   // -20 [ saved v9             ]
  153   // -19 [ saved v8             ]
  154   // -18 [ saved r28            ]
  155   // -17 [ saved r27            ]
  156   // -16 [ saved r26            ]
  157   // -15 [ saved r25            ]
  158   // -14 [ saved r24            ]
  159   // -13 [ saved r23            ]
  160   // -12 [ saved r22            ]
  161   // -11 [ saved r21            ]
  162   // -10 [ saved r20            ]
  163   //  -9 [ saved r19            ]
  164   //  -8 [ call wrapper    (r0) ]
  165   //  -7 [ result          (r1) ]
  166   //  -6 [ result type     (r2) ]
  167   //  -5 [ method          (r3) ]
  168   //  -4 [ entry point     (r4) ]
  169   //  -3 [ parameters      (r5) ]
  170   //  -2 [ parameter size  (r6) ]
  171   //  -1 [ thread (r7)          ]
  172   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  173   //   1 [ saved lr       (r30) ]
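        //
        // For reference, these eight incoming registers mirror the CallStub
        // function pointer type declared in stubRoutines.hpp, through which
        // JavaCalls::call_helper reaches this stub from C++. Roughly (see
        // that header for the authoritative declaration):
        //
        //   typedef void (*CallStub)(address   link,
        //                            intptr_t* result,
        //                            int       result_type,
        //                            Method*   method,
        //                            address   entry_point,
        //                            intptr_t* parameters,
        //                            int       size_of_parameters,
        //                            TRAPS);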
  174 
  175   // Call stub stack layout word offsets from fp
  176   enum call_stub_layout {
  177     sp_after_call_off  = -28,
  178 
  179     fpcr_off           = sp_after_call_off,
  180     d15_off            = -26,
  181     d13_off            = -24,
  182     d11_off            = -22,
  183     d9_off             = -20,
  184 
  185     r28_off            = -18,
  186     r26_off            = -16,
  187     r24_off            = -14,
  188     r22_off            = -12,
  189     r20_off            = -10,
  190     call_wrapper_off   =  -8,
  191     result_off         =  -7,
  192     result_type_off    =  -6,
  193     method_off         =  -5,
  194     entry_point_off    =  -4,
  195     parameter_size_off =  -2,
  196     thread_off         =  -1,
  197     fp_f               =   0,
  198     retaddr_off        =   1,
  199   };
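
        // Worked example of the offsets above (wordSize == 8 on AArch64):
        // r20_off == -10 places the saved r20/r19 pair at [rfp - 80] and
        // [rfp - 72], thread_off == -1 places the saved thread argument at
        // [rfp - 8], and sp_after_call_off == -28 is where the stub parks sp
        // (rfp - 224) so the whole save area sits above the stack pointer.
        // The Address constants in generate_call_stub() below encode exactly
        // this arithmetic.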
  200 
  201   address generate_call_stub(address& return_address) {
  202     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  203            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  204            "adjust this code");
  205 
  206     StubGenStubId stub_id = StubGenStubId::call_stub_id;
  207     StubCodeMark mark(this, stub_id);
  208     address start = __ pc();
  209 
  210     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  211 
  212     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  213     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  214     const Address result        (rfp, result_off         * wordSize);
  215     const Address result_type   (rfp, result_type_off    * wordSize);
  216     const Address method        (rfp, method_off         * wordSize);
  217     const Address entry_point   (rfp, entry_point_off    * wordSize);
  218     const Address parameter_size(rfp, parameter_size_off * wordSize);
  219 
  220     const Address thread        (rfp, thread_off         * wordSize);
  221 
  222     const Address d15_save      (rfp, d15_off * wordSize);
  223     const Address d13_save      (rfp, d13_off * wordSize);
  224     const Address d11_save      (rfp, d11_off * wordSize);
  225     const Address d9_save       (rfp, d9_off * wordSize);
  226 
  227     const Address r28_save      (rfp, r28_off * wordSize);
  228     const Address r26_save      (rfp, r26_off * wordSize);
  229     const Address r24_save      (rfp, r24_off * wordSize);
  230     const Address r22_save      (rfp, r22_off * wordSize);
  231     const Address r20_save      (rfp, r20_off * wordSize);
  232 
  233     // stub code
  234 
  235     address aarch64_entry = __ pc();
  236 
  237     // set up frame and move sp to end of save area
  238     __ enter();
  239     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  240 
  241     // save register parameters and Java scratch/global registers
  242     // n.b. we save thread even though it gets installed in
  243     // rthread because we want to sanity check rthread later
  244     __ str(c_rarg7,  thread);
  245     __ strw(c_rarg6, parameter_size);
  246     __ stp(c_rarg4, c_rarg5,  entry_point);
  247     __ stp(c_rarg2, c_rarg3,  result_type);
  248     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  249 
  250     __ stp(r20, r19,   r20_save);
  251     __ stp(r22, r21,   r22_save);
  252     __ stp(r24, r23,   r24_save);
  253     __ stp(r26, r25,   r26_save);
  254     __ stp(r28, r27,   r28_save);
  255 
  256     __ stpd(v9,  v8,   d9_save);
  257     __ stpd(v11, v10,  d11_save);
  258     __ stpd(v13, v12,  d13_save);
  259     __ stpd(v15, v14,  d15_save);
  260 
  261     __ get_fpcr(rscratch1);
  262     __ str(rscratch1, fpcr_save);
  263     // Set FPCR to the state we need. We do want Round to Nearest. We
  264     // don't want non-IEEE rounding modes or floating-point traps.
  265     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  266     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  267     __ set_fpcr(rscratch1);
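          // (Field layout note, per the Arm architecture reference: the four
          // bits starting at bit 22 cover RMode, FZ and DN, and the five
          // bits starting at bit 8 are the IOE/DZE/OFE/UFE/IXE trap enables.)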
  268 
  269     // install Java thread in global register now we have saved
  270     // whatever value it held
  271     __ mov(rthread, c_rarg7);
  272     // And method
  273     __ mov(rmethod, c_rarg3);
  274 
  275     // set up the heapbase register
  276     __ reinit_heapbase();
  277 
  278 #ifdef ASSERT
  279     // make sure we have no pending exceptions
  280     {
  281       Label L;
  282       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  283       __ cmp(rscratch1, (u1)NULL_WORD);
  284       __ br(Assembler::EQ, L);
  285       __ stop("StubRoutines::call_stub: entered with pending exception");
  286       __ BIND(L);
  287     }
  288 #endif
  289     // pass parameters if any
  290     __ mov(esp, sp);
  291     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  292     __ andr(sp, rscratch1, -2 * wordSize);
  293 
  294     BLOCK_COMMENT("pass parameters if any");
  295     Label parameters_done;
  296     // parameter count is still in c_rarg6
  297     // and parameter pointer identifying param 1 is in c_rarg5
  298     __ cbzw(c_rarg6, parameters_done);
  299 
  300     address loop = __ pc();
  301     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  302     __ subsw(c_rarg6, c_rarg6, 1);
  303     __ push(rscratch1);
  304     __ br(Assembler::GT, loop);
  305 
  306     __ BIND(parameters_done);
  307 
  308     // call Java entry -- passing the Method*, and current sp
  309     //      rmethod: Method*
  310     //      r19_sender_sp: sender sp
  311     BLOCK_COMMENT("call Java function");
  312     __ mov(r19_sender_sp, sp);
  313     __ blr(c_rarg4);
  314 
  315     // we do this here because the notify will already have been done
  316     // if we get to the next instruction via an exception
  317     //
  318     // n.b. adding this instruction here affects the calculation of
  319     // whether or not a routine returns to the call stub (used when
  320     // doing stack walks) since the normal test is to check the return
  321     // pc against the address saved below. so we may need to allow for
  322     // this extra instruction in the check.
  323 
  324     // save current address for use by exception handling code
  325 
  326     return_address = __ pc();
  327 
  328     // store result depending on type (everything that is not
  329     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  330     // n.b. this assumes Java returns an integral result in r0
  331     // and a floating result in j_farg0
  332     __ ldr(j_rarg2, result);
  333     Label is_long, is_float, is_double, exit;
  334     __ ldr(j_rarg1, result_type);
  335     __ cmp(j_rarg1, (u1)T_OBJECT);
  336     __ br(Assembler::EQ, is_long);
  337     __ cmp(j_rarg1, (u1)T_LONG);
  338     __ br(Assembler::EQ, is_long);
  339     __ cmp(j_rarg1, (u1)T_FLOAT);
  340     __ br(Assembler::EQ, is_float);
  341     __ cmp(j_rarg1, (u1)T_DOUBLE);
  342     __ br(Assembler::EQ, is_double);
  343 
  344     // handle T_INT case
  345     __ strw(r0, Address(j_rarg2));
  346 
  347     __ BIND(exit);
  348 
  349     // pop parameters
  350     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  351 
  352 #ifdef ASSERT
  353     // verify that threads correspond
  354     {
  355       Label L, S;
  356       __ ldr(rscratch1, thread);
  357       __ cmp(rthread, rscratch1);
  358       __ br(Assembler::NE, S);
  359       __ get_thread(rscratch1);
  360       __ cmp(rthread, rscratch1);
  361       __ br(Assembler::EQ, L);
  362       __ BIND(S);
  363       __ stop("StubRoutines::call_stub: threads must correspond");
  364       __ BIND(L);
  365     }
  366 #endif
  367 
  368     __ pop_cont_fastpath(rthread);
  369 
  370     // restore callee-save registers
  371     __ ldpd(v15, v14,  d15_save);
  372     __ ldpd(v13, v12,  d13_save);
  373     __ ldpd(v11, v10,  d11_save);
  374     __ ldpd(v9,  v8,   d9_save);
  375 
  376     __ ldp(r28, r27,   r28_save);
  377     __ ldp(r26, r25,   r26_save);
  378     __ ldp(r24, r23,   r24_save);
  379     __ ldp(r22, r21,   r22_save);
  380     __ ldp(r20, r19,   r20_save);
  381 
  382     // restore fpcr
  383     __ ldr(rscratch1,  fpcr_save);
  384     __ set_fpcr(rscratch1);
  385 
  386     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  387     __ ldrw(c_rarg2, result_type);
  388     __ ldr(c_rarg3,  method);
  389     __ ldp(c_rarg4, c_rarg5,  entry_point);
  390     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  391 
  392     // leave frame and return to caller
  393     __ leave();
  394     __ ret(lr);
  395 
  396     // handle return types different from T_INT
  397 
  398     __ BIND(is_long);
  399     __ str(r0, Address(j_rarg2, 0));
  400     __ br(Assembler::AL, exit);
  401 
  402     __ BIND(is_float);
  403     __ strs(j_farg0, Address(j_rarg2, 0));
  404     __ br(Assembler::AL, exit);
  405 
  406     __ BIND(is_double);
  407     __ strd(j_farg0, Address(j_rarg2, 0));
  408     __ br(Assembler::AL, exit);
  409 
  410     return start;
  411   }
  412 
  413   // Return point for a Java call if there's an exception thrown in
  414   // Java code.  The exception is caught and transformed into a
  415   // pending exception stored in JavaThread that can be tested from
  416   // within the VM.
  417   //
  418   // Note: Usually the parameters are removed by the callee. In case
  419   // of an exception crossing an activation frame boundary, that is
  420   // not the case if the callee is compiled code => need to set up the
  421   // sp.
  422   //
  423   // r0: exception oop
  424 
  425   address generate_catch_exception() {
  426     StubGenStubId stub_id = StubGenStubId::catch_exception_id;
  427     StubCodeMark mark(this, stub_id);
  428     address start = __ pc();
  429 
  430     // same as in generate_call_stub():
  431     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  432     const Address thread        (rfp, thread_off         * wordSize);
  433 
  434 #ifdef ASSERT
  435     // verify that threads correspond
  436     {
  437       Label L, S;
  438       __ ldr(rscratch1, thread);
  439       __ cmp(rthread, rscratch1);
  440       __ br(Assembler::NE, S);
  441       __ get_thread(rscratch1);
  442       __ cmp(rthread, rscratch1);
  443       __ br(Assembler::EQ, L);
  444       __ bind(S);
  445       __ stop("StubRoutines::catch_exception: threads must correspond");
  446       __ bind(L);
  447     }
  448 #endif
  449 
  450     // set pending exception
  451     __ verify_oop(r0);
  452 
  453     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  454     __ mov(rscratch1, (address)__FILE__);
  455     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  456     __ movw(rscratch1, (int)__LINE__);
  457     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  458 
  459     // complete return to VM
  460     assert(StubRoutines::_call_stub_return_address != nullptr,
  461            "_call_stub_return_address must have been generated before");
  462     __ b(StubRoutines::_call_stub_return_address);
  463 
  464     return start;
  465   }
  466 
  467   // Continuation point for runtime calls returning with a pending
  468   // exception.  The pending exception check happened in the runtime
  469   // or native call stub.  The pending exception in Thread is
  470   // converted into a Java-level exception.
  471   //
  472   // Contract with Java-level exception handlers:
  473   // r0: exception
  474   // r3: throwing pc
  475   //
  476   // NOTE: At entry of this stub, exception-pc must be in LR !!
  477 
  478   // NOTE: this is always used as a jump target within generated code
  479   // so it just needs to be generated code with no prolog
  480 
  481   address generate_forward_exception() {
  482     StubGenStubId stub_id = StubGenStubId::forward_exception_id;
  483     StubCodeMark mark(this, stub_id);
  484     address start = __ pc();
  485 
  486     // Upon entry, LR points to the return address returning into
  487     // Java (interpreted or compiled) code; i.e., the return address
  488     // becomes the throwing pc.
  489     //
  490     // Arguments pushed before the runtime call are still on the stack
  491     // but the exception handler will reset the stack pointer ->
  492     // ignore them.  A potential result in registers can be ignored as
  493     // well.
  494 
  495 #ifdef ASSERT
  496     // make sure this code is only executed if there is a pending exception
  497     {
  498       Label L;
  499       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  500       __ cbnz(rscratch1, L);
  501       __ stop("StubRoutines::forward exception: no pending exception (1)");
  502       __ bind(L);
  503     }
  504 #endif
  505 
  506     // compute exception handler into r19
  507 
  508     // call the VM to find the handler address associated with the
  509     // caller address. pass thread in r0 and caller pc (ret address)
  510     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  511     // the stack.
  512     __ mov(c_rarg1, lr);
  513     // lr will be trashed by the VM call so we move it to R19
  514     // (callee-saved) because we also need to pass it to the handler
  515     // returned by this call.
  516     __ mov(r19, lr);
  517     BLOCK_COMMENT("call exception_handler_for_return_address");
  518     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  519                          SharedRuntime::exception_handler_for_return_address),
  520                     rthread, c_rarg1);
  521     // Reinitialize the ptrue predicate register, in case the external runtime
  522     // call clobbers ptrue reg, as we may return to SVE compiled code.
  523     __ reinitialize_ptrue();
  524 
  525     // we should not really care that lr is no longer the callee
  526     // address. we saved the value the handler needs in r19 so we can
  527     // just copy it to r3. however, the C2 handler will push its own
  528     // frame and then calls into the VM and the VM code asserts that
  529     // the PC for the frame above the handler belongs to a compiled
  530     // Java method. So, we restore lr here to satisfy that assert.
  531     __ mov(lr, r19);
  532     // setup r0 & r3 & clear pending exception
  533     __ mov(r3, r19);
  534     __ mov(r19, r0);
  535     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  536     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  537 
  538 #ifdef ASSERT
  539     // make sure exception is set
  540     {
  541       Label L;
  542       __ cbnz(r0, L);
  543       __ stop("StubRoutines::forward exception: no pending exception (2)");
  544       __ bind(L);
  545     }
  546 #endif
  547 
  548     // continue at exception handler
  549     // r0: exception
  550     // r3: throwing pc
  551     // r19: exception handler
  552     __ verify_oop(r0);
  553     __ br(r19);
  554 
  555     return start;
  556   }
  557 
  558   // Non-destructive plausibility checks for oops
  559   //
  560   // Arguments:
  561   //    r0: oop to verify
  562   //    rscratch1: error message
  563   //
  564   // Stack after saving c_rarg3:
  565   //    [tos + 0]: saved c_rarg3
  566   //    [tos + 1]: saved c_rarg2
  567   //    [tos + 2]: saved lr
  568   //    [tos + 3]: saved rscratch2
  569   //    [tos + 4]: saved r0
  570   //    [tos + 5]: saved rscratch1
  571   address generate_verify_oop() {
  572     StubGenStubId stub_id = StubGenStubId::verify_oop_id;
  573     StubCodeMark mark(this, stub_id);
  574     address start = __ pc();
  575 
  576     Label exit, error;
  577 
  578     // save c_rarg2 and c_rarg3
  579     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  580 
  581     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  582     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  583     __ ldr(c_rarg3, Address(c_rarg2));
  584     __ add(c_rarg3, c_rarg3, 1);
  585     __ str(c_rarg3, Address(c_rarg2));
  586 
  587     // object is in r0
  588     // make sure object is 'reasonable'
  589     __ cbz(r0, exit); // if obj is null it is OK
  590 
  591     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  592     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  593 
  594     // return if everything seems ok
  595     __ bind(exit);
  596 
  597     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  598     __ ret(lr);
  599 
  600     // handle errors
  601     __ bind(error);
  602     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  603 
  604     __ push(RegSet::range(r0, r29), sp);
  605     // debug(char* msg, int64_t pc, int64_t regs[])
  606     __ mov(c_rarg0, rscratch1);      // pass address of error message
  607     __ mov(c_rarg1, lr);             // pass return address
  608     __ mov(c_rarg2, sp);             // pass address of regs on stack
  609 #ifndef PRODUCT
  610     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  611 #endif
  612     BLOCK_COMMENT("call MacroAssembler::debug");
  613     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  614     __ blr(rscratch1);
  615     __ hlt(0);
  616 
  617     return start;
  618   }
  619 
  620   // Generate indices for iota vector.
  621   address generate_iota_indices(StubGenStubId stub_id) {
  622     __ align(CodeEntryAlignment);
  623     StubCodeMark mark(this, stub_id);
  624     address start = __ pc();
  625     // B
  626     __ emit_data64(0x0706050403020100, relocInfo::none);
  627     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  628     // H
  629     __ emit_data64(0x0003000200010000, relocInfo::none);
  630     __ emit_data64(0x0007000600050004, relocInfo::none);
  631     // S
  632     __ emit_data64(0x0000000100000000, relocInfo::none);
  633     __ emit_data64(0x0000000300000002, relocInfo::none);
  634     // D
  635     __ emit_data64(0x0000000000000000, relocInfo::none);
  636     __ emit_data64(0x0000000000000001, relocInfo::none);
  637     // S - FP
  638     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  639     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  640     // D - FP
  641     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  642     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  643     return start;
  644   }
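
        // Note on the table above: emit_data64 lays each constant out
        // little-endian, so the first "H" entry 0x0003000200010000 yields
        // halfword lanes 0,1,2,3 in ascending memory order, and the packed
        // pair 0x3F80000000000000 reads back as {0.0f, 1.0f}, matching the
        // inline comments.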
  645 
  646   // The inner part of zero_words().  This is the bulk operation,
  647   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  648   // caller is responsible for zeroing the last few words.
  649   //
  650   // Inputs:
  651   // r10: the HeapWord-aligned base address of an array to zero.
  652   // r11: the count in HeapWords, r11 > 0.
  653   //
  654   // Returns r10 and r11, adjusted for the caller to clear.
  655   // r10: the base address of the tail of words left to clear.
  656   // r11: the number of words in the tail.
  657   //      r11 < MacroAssembler::zero_words_block_size.
  658 
  659   address generate_zero_blocks() {
  660     Label done;
  661     Label base_aligned;
  662 
  663     Register base = r10, cnt = r11;
  664 
  665     __ align(CodeEntryAlignment);
  666     StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
  667     StubCodeMark mark(this, stub_id);
  668     address start = __ pc();
  669 
  670     if (UseBlockZeroing) {
  671       int zva_length = VM_Version::zva_length();
  672 
  673       // Ensure ZVA length can be divided by 16. This is required by
  674       // the subsequent operations.
  675       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  676 
  677       __ tbz(base, 3, base_aligned);
  678       __ str(zr, Address(__ post(base, 8)));
  679       __ sub(cnt, cnt, 1);
  680       __ bind(base_aligned);
  681 
  682       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  683       // alignment.
  684       Label small;
  685       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
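            // Note: low_limit is in bytes while cnt is in 8-byte words,
            // hence the >> 3 in the comparison below.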
  686       __ subs(rscratch1, cnt, low_limit >> 3);
  687       __ br(Assembler::LT, small);
  688       __ zero_dcache_blocks(base, cnt);
  689       __ bind(small);
  690     }
  691 
  692     {
  693       // Number of stp instructions we'll unroll
  694       const int unroll =
  695         MacroAssembler::zero_words_block_size / 2;
  696       // Clear the remaining blocks.
  697       Label loop;
  698       __ subs(cnt, cnt, unroll * 2);
  699       __ br(Assembler::LT, done);
  700       __ bind(loop);
  701       for (int i = 0; i < unroll; i++)
  702         __ stp(zr, zr, __ post(base, 16));
  703       __ subs(cnt, cnt, unroll * 2);
  704       __ br(Assembler::GE, loop);
  705       __ bind(done);
  706       __ add(cnt, cnt, unroll * 2);
  707     }
  708 
  709     __ ret(lr);
  710 
  711     return start;
  712   }
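
        // A rough C-style sketch of the stub above (illustrative only; the
        // real work is the hand-scheduled code generated above):
        //
        //   // base == r10, cnt == r11 (in words); may use DC ZVA if enabled
        //   while (cnt >= MacroAssembler::zero_words_block_size) {
        //     emit one unrolled block of stp(zr, zr) stores (or DC ZVA);
        //     cnt -= MacroAssembler::zero_words_block_size;
        //   }
        //   return with base advanced and cnt < zero_words_block_size,
        //   leaving the tail for the caller (see zero_words()).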
  713 
  714 
  715   typedef enum {
  716     copy_forwards = 1,
  717     copy_backwards = -1
  718   } copy_direction;
  719 
  720   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  721   // for arraycopy stubs.
  722   class ArrayCopyBarrierSetHelper : StackObj {
  723     BarrierSetAssembler* _bs_asm;
  724     MacroAssembler* _masm;
  725     DecoratorSet _decorators;
  726     BasicType _type;
  727     Register _gct1;
  728     Register _gct2;
  729     Register _gct3;
  730     FloatRegister _gcvt1;
  731     FloatRegister _gcvt2;
  732     FloatRegister _gcvt3;
  733 
  734   public:
  735     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  736                               DecoratorSet decorators,
  737                               BasicType type,
  738                               Register gct1,
  739                               Register gct2,
  740                               Register gct3,
  741                               FloatRegister gcvt1,
  742                               FloatRegister gcvt2,
  743                               FloatRegister gcvt3)
  744       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  745         _masm(masm),
  746         _decorators(decorators),
  747         _type(type),
  748         _gct1(gct1),
  749         _gct2(gct2),
  750         _gct3(gct3),
  751         _gcvt1(gcvt1),
  752         _gcvt2(gcvt2),
  753         _gcvt3(gcvt3) {
  754     }
  755 
  756     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  757       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  758                             dst1, dst2, src,
  759                             _gct1, _gct2, _gcvt1);
  760     }
  761 
  762     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  763       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  764                              dst, src1, src2,
  765                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  766     }
  767 
  768     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  769       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  770                             dst1, dst2, src,
  771                             _gct1);
  772     }
  773 
  774     void copy_store_at_16(Address dst, Register src1, Register src2) {
  775       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  776                              dst, src1, src2,
  777                              _gct1, _gct2, _gct3);
  778     }
  779 
  780     void copy_load_at_8(Register dst, Address src) {
  781       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  782                             dst, noreg, src,
  783                             _gct1);
  784     }
  785 
  786     void copy_store_at_8(Address dst, Register src) {
  787       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  788                              dst, src, noreg,
  789                              _gct1, _gct2, _gct3);
  790     }
  791   };
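
        // Typical use, as in generate_copy_longs() and copy_memory() below:
        // build one helper per stub with the temporaries it may clobber and
        // route every load/store of the copied payload through it, e.g.
        //
        //   ArrayCopyBarrierSetHelper bs(_masm, decorators, type,
        //                                gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
        //   bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        //   bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        //
        // so that any GC barrier logic sees each access.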
  792 
  793   // Bulk copy of blocks of 8 words.
  794   //
  795   // count is a count of words.
  796   //
  797   // Precondition: count >= 8
  798   //
  799   // Postconditions:
  800   //
  801   // The least significant bit of count contains the remaining count
  802   // of words to copy.  The rest of count is trash.
  803   //
  804   // s and d are adjusted to point to the remaining words to copy
  805   //
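        // For example, a forwards call with count == 19 copies 16 words in
        // the main 8-word blocks plus one 2-word subblock, and returns with
        // bit 0 of count set so that the caller copies the one remaining
        // word.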
  806   void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
  807     BasicType type;
  808     copy_direction direction;
  809 
  810     switch (stub_id) {
  811     case copy_byte_f_id:
  812       direction = copy_forwards;
  813       type = T_BYTE;
  814       break;
  815     case copy_byte_b_id:
  816       direction = copy_backwards;
  817       type = T_BYTE;
  818       break;
  819     case copy_oop_f_id:
  820       direction = copy_forwards;
  821       type = T_OBJECT;
  822       break;
  823     case copy_oop_b_id:
  824       direction = copy_backwards;
  825       type = T_OBJECT;
  826       break;
  827     case copy_oop_uninit_f_id:
  828       direction = copy_forwards;
  829       type = T_OBJECT;
  830       break;
  831     case copy_oop_uninit_b_id:
  832       direction = copy_backwards;
  833       type = T_OBJECT;
  834       break;
  835     default:
  836       ShouldNotReachHere();
  837     }
  838 
  839     int unit = wordSize * direction;
  840     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
  841 
  842     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  843       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  844     const Register stride = r14;
  845     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  846     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  847     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  848 
  849     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  850     assert_different_registers(s, d, count, rscratch1, rscratch2);
  851 
  852     Label again, drain;
  853 
  854     __ align(CodeEntryAlignment);
  855 
  856     StubCodeMark mark(this, stub_id);
  857 
  858     __ bind(start);
  859 
  860     Label unaligned_copy_long;
  861     if (AvoidUnalignedAccesses) {
  862       __ tbnz(d, 3, unaligned_copy_long);
  863     }
  864 
  865     if (direction == copy_forwards) {
  866       __ sub(s, s, bias);
  867       __ sub(d, d, bias);
  868     }
  869 
  870 #ifdef ASSERT
  871     // Make sure we are never given < 8 words
  872     {
  873       Label L;
  874       __ cmp(count, (u1)8);
  875       __ br(Assembler::GE, L);
  876       __ stop("genrate_copy_longs called with < 8 words");
  877       __ bind(L);
  878     }
  879 #endif
  880 
  881     // Fill 8 registers
  882     if (UseSIMDForMemoryOps) {
  883       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  884       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  885     } else {
  886       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  887       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  888       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  889       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  890     }
  891 
  892     __ subs(count, count, 16);
  893     __ br(Assembler::LO, drain);
  894 
  895     int prefetch = PrefetchCopyIntervalInBytes;
  896     bool use_stride = false;
  897     if (direction == copy_backwards) {
  898        use_stride = prefetch > 256;
  899        prefetch = -prefetch;
  900        if (use_stride) __ mov(stride, prefetch);
  901     }
  902 
  903     __ bind(again);
  904 
  905     if (PrefetchCopyIntervalInBytes > 0)
  906       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  907 
  908     if (UseSIMDForMemoryOps) {
  909       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  910       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  911       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  912       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  913     } else {
  914       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  915       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  916       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  917       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  918       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  919       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  920       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  921       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  922     }
  923 
  924     __ subs(count, count, 8);
  925     __ br(Assembler::HS, again);
  926 
  927     // Drain
  928     __ bind(drain);
  929     if (UseSIMDForMemoryOps) {
  930       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  931       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  932     } else {
  933       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  934       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  935       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  936       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  937     }
  938 
  939     {
  940       Label L1, L2;
  941       __ tbz(count, exact_log2(4), L1);
  942       if (UseSIMDForMemoryOps) {
  943         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  944         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  945       } else {
  946         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  947         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  948         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  949         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  950       }
  951       __ bind(L1);
  952 
  953       if (direction == copy_forwards) {
  954         __ add(s, s, bias);
  955         __ add(d, d, bias);
  956       }
  957 
  958       __ tbz(count, 1, L2);
  959       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  960       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  961       __ bind(L2);
  962     }
  963 
  964     __ ret(lr);
  965 
  966     if (AvoidUnalignedAccesses) {
  967       Label drain, again;
  968       // Register order for storing. Order is different for backward copy.
  969 
  970       __ bind(unaligned_copy_long);
  971 
  972       // source address is even aligned, target odd aligned
  973       //
  974       // when forward copying word pairs we read long pairs at offsets
  975       // {0, 2, 4, 6} (in long words). when backwards copying we read
  976       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  977       // address by -2 in the forwards case so we can compute the
  978       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  979       // or -1.
  980       //
  981       // when forward copying we need to store 1 word, 3 pairs and
  982       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
  983       // zero offset we adjust the destination by -1, which means we
  984       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
  985       //
  986       // When backwards copying we need to store 1 word, 3 pairs and
  987       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
  988       // offsets {1, 3, 5, 7, 8} * unit.
  989 
  990       if (direction == copy_forwards) {
  991         __ sub(s, s, 16);
  992         __ sub(d, d, 8);
  993       }
  994 
  995       // Fill 8 registers
  996       //
  997       // for forwards copy s was offset by -16 from the original input
  998       // value of s so the register contents are at these offsets
  999       // relative to the 64 bit block addressed by that original input
 1000       // and so on for each successive 64 byte block when s is updated
 1001       //
 1002       // t0 at offset 0,  t1 at offset 8
 1003       // t2 at offset 16, t3 at offset 24
 1004       // t4 at offset 32, t5 at offset 40
 1005       // t6 at offset 48, t7 at offset 56
 1006 
 1007       // for backwards copy s was not offset so the register contents
 1008       // are at these offsets into the preceding 64 byte block
 1009       // relative to that original input and so on for each successive
 1010       // preceding 64 byte block when s is updated. this explains the
 1011       // slightly counter-intuitive looking pattern of register usage
 1012       // in the stp instructions for backwards copy.
 1013       //
 1014       // t0 at offset -16, t1 at offset -8
 1015       // t2 at offset -32, t3 at offset -24
 1016       // t4 at offset -48, t5 at offset -40
 1017       // t6 at offset -64, t7 at offset -56
 1018 
 1019       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1020       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1021       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1022       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1023 
 1024       __ subs(count, count, 16);
 1025       __ br(Assembler::LO, drain);
 1026 
 1027       int prefetch = PrefetchCopyIntervalInBytes;
 1028       bool use_stride = false;
 1029       if (direction == copy_backwards) {
 1030          use_stride = prefetch > 256;
 1031          prefetch = -prefetch;
 1032          if (use_stride) __ mov(stride, prefetch);
 1033       }
 1034 
 1035       __ bind(again);
 1036 
 1037       if (PrefetchCopyIntervalInBytes > 0)
 1038         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1039 
 1040       if (direction == copy_forwards) {
 1041        // allowing for the offset of -8 the store instructions place
 1042        // registers into the target 64 bit block at the following
 1043        // offsets
 1044        //
 1045        // t0 at offset 0
 1046        // t1 at offset 8,  t2 at offset 16
 1047        // t3 at offset 24, t4 at offset 32
 1048        // t5 at offset 40, t6 at offset 48
 1049        // t7 at offset 56
 1050 
 1051         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1052         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1053         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1054         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1055         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1056         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1057         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1058         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1059         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1060       } else {
 1061        // d was not offset when we started so the registers are
 1062        // written into the 64 bit block preceding d with the following
 1063        // offsets
 1064        //
 1065        // t1 at offset -8
 1066        // t3 at offset -24, t0 at offset -16
 1067        // t5 at offset -40, t2 at offset -32
 1068        // t7 at offset -56, t4 at offset -48
 1069        //                   t6 at offset -64
 1070        //
 1071        // note that this matches the offsets previously noted for the
 1072        // loads
 1073 
 1074         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1075         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1076         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1077         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1078         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1079         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1080         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1081         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1082         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1083       }
 1084 
 1085       __ subs(count, count, 8);
 1086       __ br(Assembler::HS, again);
 1087 
 1088       // Drain
 1089       //
 1090       // this uses the same pattern of offsets and register arguments
 1091       // as above
 1092       __ bind(drain);
 1093       if (direction == copy_forwards) {
 1094         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1095         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1096         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1097         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1098         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1099       } else {
 1100         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1101         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1102         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1103         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1104         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1105       }
 1106       // now we need to copy any remaining part block which may
 1107       // include a 4 word subblock and/or a 2 word subblock.
 1108       // bits 2 and 1 in the count are the tell-tale for whether we
 1109       // have each such subblock
 1110       {
 1111         Label L1, L2;
 1112         __ tbz(count, exact_log2(4), L1);
 1113        // this is the same as above but copying only 4 longs hence
 1114        // with only one intervening stp between the str instructions
 1115        // but note that the offsets and registers still follow the
 1116        // same pattern
 1117         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1118         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1119         if (direction == copy_forwards) {
 1120           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1121           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1122           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1123         } else {
 1124           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1125           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1126           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1127         }
 1128         __ bind(L1);
 1129 
 1130         __ tbz(count, 1, L2);
 1131        // this is the same as above but copying only 2 longs hence
 1132        // there is no intervening stp between the str instructions
 1133        // but note that the offset and register patterns are still
 1134        // the same
 1135         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1136         if (direction == copy_forwards) {
 1137           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1138           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1139         } else {
 1140           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1141           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1142         }
 1143         __ bind(L2);
 1144 
 1145        // for forwards copy we need to re-adjust the offsets we
 1146        // applied so that s and d follow the last words written
 1147 
 1148        if (direction == copy_forwards) {
 1149          __ add(s, s, 16);
 1150          __ add(d, d, 8);
 1151        }
 1152 
 1153       }
 1154 
 1155       __ ret(lr);
 1156     }
 1157   }
 1158 
 1159   // Small copy: less than 16 bytes.
 1160   //
 1161   // NB: Ignores all of the bits of count which represent more than 15
 1162   // bytes, so a caller doesn't have to mask them.
 1163 
 1164   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1165     bool is_backwards = step < 0;
 1166     size_t granularity = uabs(step);
 1167     int direction = is_backwards ? -1 : 1;
 1168 
 1169     Label Lword, Lint, Lshort, Lbyte;
 1170 
 1171     assert(granularity
 1172            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1173 
 1174     const Register t0 = r3;
 1175     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1176     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1177 
 1178     // ??? I don't know if this bit-test-and-branch is the right thing
 1179     // to do.  It does a lot of jumping, resulting in several
 1180     // mispredicted branches.  It might make more sense to do this
 1181     // with something like Duff's device with a single computed branch.
 1182 
 1183     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1184     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1185     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1186     __ bind(Lword);
 1187 
 1188     if (granularity <= sizeof (jint)) {
 1189       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1190       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1191       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1192       __ bind(Lint);
 1193     }
 1194 
 1195     if (granularity <= sizeof (jshort)) {
 1196       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1197       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1198       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1199       __ bind(Lshort);
 1200     }
 1201 
 1202     if (granularity <= sizeof (jbyte)) {
 1203       __ tbz(count, 0, Lbyte);
 1204       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1205       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1206       __ bind(Lbyte);
 1207     }
 1208   }
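
        // Worked example (illustrative): a byte copy (granularity == 1,
        // step == 1) with count == 13 == 0b1101 copies 8, then 4, then 1
        // bytes (bits 3, 2 and 0 of count), and, as the NB above says, any
        // bits above bit 3 are simply ignored.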
 1209 
 1210   Label copy_f, copy_b;
 1211   Label copy_obj_f, copy_obj_b;
 1212   Label copy_obj_uninit_f, copy_obj_uninit_b;
 1213 
 1214   // All-singing all-dancing memory copy.
 1215   //
 1216   // Copy count units of memory from s to d.  The size of a unit is
 1217   // step, which can be positive or negative depending on the direction
 1218   // of copy.  If is_aligned is false, we align the source address.
 1219   //
 1220 
 1221   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1222                    Register s, Register d, Register count, int step) {
 1223     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1224     bool is_backwards = step < 0;
 1225     unsigned int granularity = uabs(step);
 1226     const Register t0 = r3, t1 = r4;
 1227 
 1228     // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't matter because we always
 1229     // load all the data before writing anything
 1230     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1231     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1232     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1233     const Register send = r17, dend = r16;
 1234     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1235     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1236     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1237 
 1238     if (PrefetchCopyIntervalInBytes > 0)
 1239       __ prfm(Address(s, 0), PLDL1KEEP);
 1240     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1241     __ br(Assembler::HI, copy_big);
 1242 
 1243     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1244     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1245 
 1246     __ cmp(count, u1(16/granularity));
 1247     __ br(Assembler::LS, copy16);
 1248 
 1249     __ cmp(count, u1(64/granularity));
 1250     __ br(Assembler::HI, copy80);
 1251 
 1252     __ cmp(count, u1(32/granularity));
 1253     __ br(Assembler::LS, copy32);
 1254 
 1255     // 33..64 bytes
 1256     if (UseSIMDForMemoryOps) {
 1257       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1258       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1259       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1260       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1261     } else {
 1262       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1263       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1264       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1265       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1266 
 1267       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1268       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1269       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1270       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1271     }
 1272     __ b(finish);
 1273 
 1274     // 17..32 bytes
 1275     __ bind(copy32);
 1276     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1277     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1278 
 1279     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1280     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1281     __ b(finish);
 1282 
 1283     // 65..80/96 bytes
 1284     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1285     __ bind(copy80);
 1286     if (UseSIMDForMemoryOps) {
 1287       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1288       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1289       // Unaligned pointers can be an issue for copying.
 1290       // The issue has more chances to happen when granularity of data is
 1291       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
 1292       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1293       // The most performance drop has been seen for the range 65-80 bytes.
 1294       // For such cases using the pair of ldp/stp instead of the third pair of
 1295       // ldpq/stpq fixes the performance issue.
 1296       if (granularity < sizeof (jint)) {
 1297         Label copy96;
 1298         __ cmp(count, u1(80/granularity));
 1299         __ br(Assembler::HI, copy96);
 1300         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1301 
 1302         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1303         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1304 
 1305         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1306         __ b(finish);
 1307 
 1308         __ bind(copy96);
 1309       }
 1310       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1311 
 1312       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1313       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1314 
 1315       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1316     } else {
 1317       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1318       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1319       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1320       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1321       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1322 
 1323       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1324       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1325       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1326       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1327       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1328     }
 1329     __ b(finish);
 1330 
 1331     // 0..16 bytes
 1332     __ bind(copy16);
 1333     __ cmp(count, u1(8/granularity));
 1334     __ br(Assembler::LO, copy8);
 1335 
 1336     // 8..16 bytes
 1337     bs.copy_load_at_8(t0, Address(s, 0));
 1338     bs.copy_load_at_8(t1, Address(send, -8));
 1339     bs.copy_store_at_8(Address(d, 0), t0);
 1340     bs.copy_store_at_8(Address(dend, -8), t1);
 1341     __ b(finish);
 1342 
 1343     if (granularity < 8) {
 1344       // 4..7 bytes
 1345       __ bind(copy8);
 1346       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1347       __ ldrw(t0, Address(s, 0));
 1348       __ ldrw(t1, Address(send, -4));
 1349       __ strw(t0, Address(d, 0));
 1350       __ strw(t1, Address(dend, -4));
 1351       __ b(finish);
 1352       if (granularity < 4) {
 1353         // 0..3 bytes
 1354         __ bind(copy4);
 1355         __ cbz(count, finish); // get rid of 0 case
 1356         if (granularity == 2) {
 1357           __ ldrh(t0, Address(s, 0));
 1358           __ strh(t0, Address(d, 0));
 1359         } else { // granularity == 1
 1360           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1361           // the first and last byte.
 1362           // Handle the 3 byte case by loading and storing base + count/2
 1363           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1364           // This does mean that in the 1 byte case we load/store the same
 1365           // byte 3 times.
 1366           __ lsr(count, count, 1);
 1367           __ ldrb(t0, Address(s, 0));
 1368           __ ldrb(t1, Address(send, -1));
 1369           __ ldrb(t2, Address(s, count));
 1370           __ strb(t0, Address(d, 0));
 1371           __ strb(t1, Address(dend, -1));
 1372           __ strb(t2, Address(d, count));
 1373         }
 1374         __ b(finish);
 1375       }
 1376     }
 1377 
 1378     __ bind(copy_big);
 1379     if (is_backwards) {
 1380       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1381       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1382     }
 1383 
 1384     // Now that we've got the small case out of the way, we can align the
 1385     // source address on a 2-word boundary.
 1386 
 1387     // Here we will materialize a count in r15, which is used by copy_memory_small
 1388     // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
 1389     // Up until here, we have used t9, which aliases r15, but from here on, that register
 1390     // can not be used as a temp register, as it contains the count.
 1391 
 1392     Label aligned;
 1393 
 1394     if (is_aligned) {
 1395       // We may have to adjust by 1 word to get s 2-word-aligned.
 1396       __ tbz(s, exact_log2(wordSize), aligned);
 1397       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1398       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1399       __ sub(count, count, wordSize/granularity);
 1400     } else {
 1401       if (is_backwards) {
 1402         __ andr(r15, s, 2 * wordSize - 1);
 1403       } else {
 1404         __ neg(r15, s);
 1405         __ andr(r15, r15, 2 * wordSize - 1);
 1406       }
 1407       // r15 is the byte adjustment needed to align s.
 1408       __ cbz(r15, aligned);
 1409       int shift = exact_log2(granularity);
 1410       if (shift > 0) {
 1411         __ lsr(r15, r15, shift);
 1412       }
 1413       __ sub(count, count, r15);
 1414 
 1415 #if 0
 1416       // ?? This code is only correct for a disjoint copy.  It may or
 1417       // may not make sense to use it in that case.
 1418 
 1419       // Copy the first pair; s and d may not be aligned.
 1420       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1421       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1422 
 1423       // Align s and d, adjust count
 1424       if (is_backwards) {
 1425         __ sub(s, s, r15);
 1426         __ sub(d, d, r15);
 1427       } else {
 1428         __ add(s, s, r15);
 1429         __ add(d, d, r15);
 1430       }
 1431 #else
 1432       copy_memory_small(decorators, type, s, d, r15, step);
 1433 #endif
 1434     }
 1435 
 1436     __ bind(aligned);
 1437 
 1438     // s is now 2-word-aligned.
 1439 
 1440     // We have a count of units and some trailing bytes. Adjust the
 1441     // count and do a bulk copy of words. If the shift is zero,
 1442     // perform a move instead to benefit from zero-latency moves.
 1443     int shift = exact_log2(wordSize/granularity);
 1444     if (shift > 0) {
 1445       __ lsr(r15, count, shift);
 1446     } else {
 1447       __ mov(r15, count);
 1448     }
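          // r15 now holds the number of 8-byte words to copy in bulk; call the
          // forward or backward long-copy stub matching the element type and
          // destination-initialization state.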
 1449     if (direction == copy_forwards) {
 1450       if (type != T_OBJECT) {
 1451         __ bl(copy_f);
 1452       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1453         __ bl(copy_obj_uninit_f);
 1454       } else {
 1455         __ bl(copy_obj_f);
 1456       }
 1457     } else {
 1458       if (type != T_OBJECT) {
 1459         __ bl(copy_b);
 1460       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1461         __ bl(copy_obj_uninit_b);
 1462       } else {
 1463         __ bl(copy_obj_b);
 1464       }
 1465     }
 1466 
 1467     // And the tail.
 1468     copy_memory_small(decorators, type, s, d, count, step);
 1469 
 1470     if (granularity >= 8) __ bind(copy8);
 1471     if (granularity >= 4) __ bind(copy4);
 1472     __ bind(finish);
 1473   }
 1474 
 1475 
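        // Debug-only helper: fill the call-clobbered general purpose registers
        // with the poison value 0xdeadbeefdeadbeef so that stale contents are
        // not silently relied on after a stub call.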
 1476   void clobber_registers() {
 1477 #ifdef ASSERT
 1478     RegSet clobbered
 1479       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1480     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1481     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1482     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1483       __ mov(*it, rscratch1);
 1484     }
 1485 #endif
 1486 
 1487   }
 1488 
 1489   // Scan over array at a for count oops, verifying each one.
 1490   // Preserves a and count, clobbers rscratch1 and rscratch2.
 1491   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1492     Label loop, end;
 1493     __ mov(rscratch1, a);
 1494     __ mov(rscratch2, zr);
 1495     __ bind(loop);
 1496     __ cmp(rscratch2, count);
 1497     __ br(Assembler::HS, end);
 1498     if (size == wordSize) {
 1499       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1500       __ verify_oop(temp);
 1501     } else {
 1502       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1503       __ decode_heap_oop(temp); // calls verify_oop
 1504     }
 1505     __ add(rscratch2, rscratch2, 1);
 1506     __ b(loop);
 1507     __ bind(end);
 1508   }
 1509 
 1510   // Arguments:
 1511   //   stub_id - is used to name the stub and identify all details of
 1512   //             how to perform the copy.
 1513   //
 1514   //   entry - is assigned to the stub's post push entry point unless
 1515   //           it is null
 1516   //
 1517   // Inputs:
 1518   //   c_rarg0   - source array address
 1519   //   c_rarg1   - destination array address
 1520   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1521   //
 1522   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1523   // the hardware handle it.  The two dwords within qwords that span
 1524   // cache line boundaries will still be loaded and stored atomically.
 1525   //
 1526   // Side Effects: entry is set to the (post push) entry point so it
 1527   //               can be used by the corresponding conjoint copy
 1528   //               method
 1529   //
 1530   address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
 1531     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1532     RegSet saved_reg = RegSet::of(s, d, count);
 1533     int size;
 1534     bool aligned;
 1535     bool is_oop;
 1536     bool dest_uninitialized;
 1537     switch (stub_id) {
 1538     case jbyte_disjoint_arraycopy_id:
 1539       size = sizeof(jbyte);
 1540       aligned = false;
 1541       is_oop = false;
 1542       dest_uninitialized = false;
 1543       break;
 1544     case arrayof_jbyte_disjoint_arraycopy_id:
 1545       size = sizeof(jbyte);
 1546       aligned = true;
 1547       is_oop = false;
 1548       dest_uninitialized = false;
 1549       break;
 1550     case jshort_disjoint_arraycopy_id:
 1551       size = sizeof(jshort);
 1552       aligned = false;
 1553       is_oop = false;
 1554       dest_uninitialized = false;
 1555       break;
 1556     case arrayof_jshort_disjoint_arraycopy_id:
 1557       size = sizeof(jshort);
 1558       aligned = true;
 1559       is_oop = false;
 1560       dest_uninitialized = false;
 1561       break;
 1562     case jint_disjoint_arraycopy_id:
 1563       size = sizeof(jint);
 1564       aligned = false;
 1565       is_oop = false;
 1566       dest_uninitialized = false;
 1567       break;
 1568     case arrayof_jint_disjoint_arraycopy_id:
 1569       size = sizeof(jint);
 1570       aligned = true;
 1571       is_oop = false;
 1572       dest_uninitialized = false;
 1573       break;
 1574     case jlong_disjoint_arraycopy_id:
 1575       // since this is always aligned we can (should!) use the same
 1576       // stub as for case arrayof_jlong_disjoint_arraycopy
 1577       ShouldNotReachHere();
 1578       break;
 1579     case arrayof_jlong_disjoint_arraycopy_id:
 1580       size = sizeof(jlong);
 1581       aligned = true;
 1582       is_oop = false;
 1583       dest_uninitialized = false;
 1584       break;
 1585     case oop_disjoint_arraycopy_id:
 1586       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1587       aligned = !UseCompressedOops;
 1588       is_oop = true;
 1589       dest_uninitialized = false;
 1590       break;
 1591     case arrayof_oop_disjoint_arraycopy_id:
 1592       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1593       aligned = !UseCompressedOops;
 1594       is_oop = true;
 1595       dest_uninitialized = false;
 1596       break;
 1597     case oop_disjoint_arraycopy_uninit_id:
 1598       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1599       aligned = !UseCompressedOops;
 1600       is_oop = true;
 1601       dest_uninitialized = true;
 1602       break;
 1603     case arrayof_oop_disjoint_arraycopy_uninit_id:
 1604       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1605       aligned = !UseCompressedOops;
 1606       is_oop = true;
 1607       dest_uninitialized = true;
 1608       break;
 1609     default:
 1610       ShouldNotReachHere();
 1611       break;
 1612     }
 1613 
 1614     __ align(CodeEntryAlignment);
 1615     StubCodeMark mark(this, stub_id);
 1616     address start = __ pc();
 1617     __ enter();
 1618 
 1619     if (entry != nullptr) {
 1620       *entry = __ pc();
 1621       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1622       BLOCK_COMMENT("Entry:");
 1623     }
 1624 
 1625     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1626     if (dest_uninitialized) {
 1627       decorators |= IS_DEST_UNINITIALIZED;
 1628     }
 1629     if (aligned) {
 1630       decorators |= ARRAYCOPY_ALIGNED;
 1631     }
 1632 
 1633     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1634     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1635 
 1636     if (is_oop) {
 1637       // save regs before copy_memory
 1638       __ push(RegSet::of(d, count), sp);
 1639     }
 1640     {
 1641       // UnsafeMemoryAccess page error: continue after unsafe access
 1642       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1643       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1644       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1645     }
 1646 
 1647     if (is_oop) {
 1648       __ pop(RegSet::of(d, count), sp);
 1649       if (VerifyOops)
 1650         verify_oop_array(size, d, count, r16);
 1651     }
 1652 
 1653     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1654 
 1655     __ leave();
 1656     __ mov(r0, zr); // return 0
 1657     __ ret(lr);
 1658     return start;
 1659   }
 1660 
 1661   // Arguments:
 1662   //   stub_id - is used to name the stub and identify all details of
 1663   //             how to perform the copy.
 1664   //
 1665   //   nooverlap_target - identifies the (post push) entry for the
 1666   //             corresponding disjoint copy routine which can be
 1667   //             jumped to if the ranges do not actually overlap
 1668   //
 1669   //   entry - is assigned to the stub's post push entry point unless
 1670   //           it is null
 1671   //
 1672   //
 1673   // Inputs:
 1674   //   c_rarg0   - source array address
 1675   //   c_rarg1   - destination array address
 1676   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1677   //
 1678   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1679   // the hardware handle it.  The two dwords within qwords that span
 1680   // cache line boundaries will still be loaded and stored atomically.
 1681   //
 1682   // Side Effects:
 1683   //   entry is set to the no-overlap entry point so it can be used by
 1684   //   some other conjoint copy method
 1685   //
 1686   address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
 1687     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1688     RegSet saved_regs = RegSet::of(s, d, count);
 1689     int size;
 1690     bool aligned;
 1691     bool is_oop;
 1692     bool dest_uninitialized;
 1693     switch (stub_id) {
 1694     case jbyte_arraycopy_id:
 1695       size = sizeof(jbyte);
 1696       aligned = false;
 1697       is_oop = false;
 1698       dest_uninitialized = false;
 1699       break;
 1700     case arrayof_jbyte_arraycopy_id:
 1701       size = sizeof(jbyte);
 1702       aligned = true;
 1703       is_oop = false;
 1704       dest_uninitialized = false;
 1705       break;
 1706     case jshort_arraycopy_id:
 1707       size = sizeof(jshort);
 1708       aligned = false;
 1709       is_oop = false;
 1710       dest_uninitialized = false;
 1711       break;
 1712     case arrayof_jshort_arraycopy_id:
 1713       size = sizeof(jshort);
 1714       aligned = true;
 1715       is_oop = false;
 1716       dest_uninitialized = false;
 1717       break;
 1718     case jint_arraycopy_id:
 1719       size = sizeof(jint);
 1720       aligned = false;
 1721       is_oop = false;
 1722       dest_uninitialized = false;
 1723       break;
 1724     case arrayof_jint_arraycopy_id:
 1725       size = sizeof(jint);
 1726       aligned = true;
 1727       is_oop = false;
 1728       dest_uninitialized = false;
 1729       break;
 1730     case jlong_arraycopy_id:
 1731       // since this is always aligned we can (should!) use the same
 1732       // stub as for case arrayof_jlong_arraycopy
 1733       ShouldNotReachHere();
 1734       break;
 1735     case arrayof_jlong_arraycopy_id:
 1736       size = sizeof(jlong);
 1737       aligned = true;
 1738       is_oop = false;
 1739       dest_uninitialized = false;
 1740       break;
 1741     case oop_arraycopy_id:
 1742       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1743       aligned = !UseCompressedOops;
 1744       is_oop = true;
 1745       dest_uninitialized = false;
 1746       break;
 1747     case arrayof_oop_arraycopy_id:
 1748       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1749       aligned = !UseCompressedOops;
 1750       is_oop = true;
 1751       dest_uninitialized = false;
 1752       break;
 1753     case oop_arraycopy_uninit_id:
 1754       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1755       aligned = !UseCompressedOops;
 1756       is_oop = true;
 1757       dest_uninitialized = true;
 1758       break;
 1759     case arrayof_oop_arraycopy_uninit_id:
 1760       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1761       aligned = !UseCompressedOops;
 1762       is_oop = true;
 1763       dest_uninitialized = true;
 1764       break;
 1765     default:
 1766       ShouldNotReachHere();
 1767     }
 1768 
 1769     StubCodeMark mark(this, stub_id);
 1770     address start = __ pc();
 1771     __ enter();
 1772 
 1773     if (entry != nullptr) {
 1774       *entry = __ pc();
 1775       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1776       BLOCK_COMMENT("Entry:");
 1777     }
 1778 
 1779     // use fwd copy when (d-s) above_equal (count*size)
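          // The unsigned comparison covers both safe cases: either d < s (a
          // forward copy then never overwrites source bytes before they are
          // read) or d >= s + count*size (the regions do not overlap at all).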
 1780     __ sub(rscratch1, d, s);
 1781     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1782     __ br(Assembler::HS, nooverlap_target);
 1783 
 1784     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1785     if (dest_uninitialized) {
 1786       decorators |= IS_DEST_UNINITIALIZED;
 1787     }
 1788     if (aligned) {
 1789       decorators |= ARRAYCOPY_ALIGNED;
 1790     }
 1791 
 1792     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1793     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1794 
 1795     if (is_oop) {
 1796       // save regs before copy_memory
 1797       __ push(RegSet::of(d, count), sp);
 1798     }
 1799     {
 1800       // UnsafeMemoryAccess page error: continue after unsafe access
 1801       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1802       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1803       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1804     }
 1805     if (is_oop) {
 1806       __ pop(RegSet::of(d, count), sp);
 1807       if (VerifyOops)
 1808         verify_oop_array(size, d, count, r16);
 1809     }
 1810     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1811     __ leave();
 1812     __ mov(r0, zr); // return 0
 1813     __ ret(lr);
 1814     return start;
 1815   }
 1816 
 1817   // Helper for generating a dynamic type check.
 1818   // Smashes rscratch1, rscratch2.
 1819   void generate_type_check(Register sub_klass,
 1820                            Register super_check_offset,
 1821                            Register super_klass,
 1822                            Register temp1,
 1823                            Register temp2,
 1824                            Register result,
 1825                            Label& L_success) {
 1826     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1827 
 1828     BLOCK_COMMENT("type_check:");
 1829 
 1830     Label L_miss;
 1831 
 1832     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1833                                      super_check_offset);
 1834     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1835 
 1836     // Fall through on failure!
 1837     __ BIND(L_miss);
 1838   }
 1839 
 1840   //
 1841   //  Generate checkcasting array copy stub
 1842   //
 1843   //  Input:
 1844   //    c_rarg0   - source array address
 1845   //    c_rarg1   - destination array address
 1846   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1847   //    c_rarg3   - size_t ckoff (super_check_offset)
 1848   //    c_rarg4   - oop ckval (super_klass)
 1849   //
 1850   //  Output:
 1851   //    r0 ==  0  -  success
 1852   //    r0 == -1^K - failure, where K is partial transfer count
 1853   //
 1854   address generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
 1855     bool dest_uninitialized;
 1856     switch (stub_id) {
 1857     case checkcast_arraycopy_id:
 1858       dest_uninitialized = false;
 1859       break;
 1860     case checkcast_arraycopy_uninit_id:
 1861       dest_uninitialized = true;
 1862       break;
 1863     default:
 1864       ShouldNotReachHere();
 1865     }
 1866 
 1867     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1868 
 1869     // Input registers (after setup_arg_regs)
 1870     const Register from        = c_rarg0;   // source array address
 1871     const Register to          = c_rarg1;   // destination array address
 1872     const Register count       = c_rarg2;   // elements count
 1873     const Register ckoff       = c_rarg3;   // super_check_offset
 1874     const Register ckval       = c_rarg4;   // super_klass
 1875 
 1876     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1877     RegSet wb_post_saved_regs = RegSet::of(count);
 1878 
 1879     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1880     const Register copied_oop  = r22;       // actual oop copied
 1881     const Register count_save  = r21;       // orig elements count
 1882     const Register start_to    = r20;       // destination array start address
 1883     const Register r19_klass   = r19;       // oop._klass
 1884 
 1885     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1886     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1887 
 1888     //---------------------------------------------------------------
 1889     // Assembler stub will be used for this call to arraycopy
 1890     // if the two arrays are subtypes of Object[] but the
 1891     // destination array type is not equal to or a supertype
 1892     // of the source type.  Each element must be separately
 1893     // checked.
 1894 
 1895     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1896                                copied_oop, r19_klass, count_save);
 1897 
 1898     __ align(CodeEntryAlignment);
 1899     StubCodeMark mark(this, stub_id);
 1900     address start = __ pc();
 1901 
 1902     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1903 
 1904 #ifdef ASSERT
 1905     // caller guarantees that the arrays really are different
 1906     // otherwise, we would have to make conjoint checks
 1907     { Label L;
 1908       __ b(L);                  // conjoint check not yet implemented
 1909       __ stop("checkcast_copy within a single array");
 1910       __ bind(L);
 1911     }
 1912 #endif //ASSERT
 1913 
 1914     // Caller of this entry point must set up the argument registers.
 1915     if (entry != nullptr) {
 1916       *entry = __ pc();
 1917       BLOCK_COMMENT("Entry:");
 1918     }
 1919 
 1920      // Empty array:  Nothing to do.
 1921     __ cbz(count, L_done);
 1922     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1923 
 1924 #ifdef ASSERT
 1925     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1926     // The ckoff and ckval must be mutually consistent,
 1927     // even though caller generates both.
 1928     { Label L;
 1929       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1930       __ ldrw(start_to, Address(ckval, sco_offset));
 1931       __ cmpw(ckoff, start_to);
 1932       __ br(Assembler::EQ, L);
 1933       __ stop("super_check_offset inconsistent");
 1934       __ bind(L);
 1935     }
 1936 #endif //ASSERT
 1937 
 1938     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1939     bool is_oop = true;
 1940     int element_size = UseCompressedOops ? 4 : 8;
 1941     if (dest_uninitialized) {
 1942       decorators |= IS_DEST_UNINITIALIZED;
 1943     }
 1944 
 1945     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1946     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1947 
 1948     // save the original count
 1949     __ mov(count_save, count);
 1950 
 1951     // Copy from low to high addresses
 1952     __ mov(start_to, to);              // Save destination array start address
 1953     __ b(L_load_element);
 1954 
 1955     // ======== begin loop ========
 1956     // (Loop is rotated; its entry is L_load_element.)
 1957     // Loop control:
 1958     //   for (; count != 0; count--) {
 1959     //     copied_oop = load_heap_oop(from++);
 1960     //     ... generate_type_check ...;
 1961     //     store_heap_oop(to++, copied_oop);
 1962     //   }
 1963     __ align(OptoLoopAlignment);
 1964 
 1965     __ BIND(L_store_element);
 1966     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1967                       __ post(to, element_size), copied_oop, noreg,
 1968                       gct1, gct2, gct3);
 1969     __ sub(count, count, 1);
 1970     __ cbz(count, L_do_card_marks);
 1971 
 1972     // ======== loop entry is here ========
 1973     __ BIND(L_load_element);
 1974     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1975                      copied_oop, noreg, __ post(from, element_size),
 1976                      gct1);
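          // A null element needs no type check; store it as-is.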
 1977     __ cbz(copied_oop, L_store_element);
 1978 
 1979     __ load_klass(r19_klass, copied_oop);// query the object klass
 1980 
 1981     BLOCK_COMMENT("type_check:");
 1982     generate_type_check(/*sub_klass*/r19_klass,
 1983                         /*super_check_offset*/ckoff,
 1984                         /*super_klass*/ckval,
 1985                         /*r_array_base*/gct1,
 1986                         /*temp2*/gct2,
 1987                         /*result*/r10, L_store_element);
 1988 
 1989     // Fall through on failure!
 1990 
 1991     // ======== end loop ========
 1992 
 1993     // It was a real error; we must depend on the caller to finish the job.
 1994     // Register count = remaining oops, count_orig = total oops.
 1995     // Emit GC store barriers for the oops we have copied and report
 1996     // their number to the caller.
 1997 
 1998     __ subs(count, count_save, count);     // K = partially copied oop count
 1999     __ eon(count, count, zr);              // report (-1^K) to caller
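          // The flags are still those of the subs above: EQ means K == 0, i.e. no
          // oops were copied, so the card-marking epilogue can be skipped.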
 2000     __ br(Assembler::EQ, L_done_pop);
 2001 
 2002     __ BIND(L_do_card_marks);
 2003     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2004 
 2005     __ bind(L_done_pop);
 2006     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2007     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2008 
 2009     __ bind(L_done);
 2010     __ mov(r0, count);
 2011     __ leave();
 2012     __ ret(lr);
 2013 
 2014     return start;
 2015   }
 2016 
 2017   // Perform range checks on the proposed arraycopy.
 2018   // Kills temp, but nothing else.
 2019   // Also, clean the sign bits of src_pos and dst_pos.
 2020   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2021                               Register src_pos, // source position (c_rarg1)
 2022                               Register dst,     // destination array oop (c_rarg2)
 2023                               Register dst_pos, // destination position (c_rarg3)
 2024                               Register length,
 2025                               Register temp,
 2026                               Label& L_failed) {
 2027     BLOCK_COMMENT("arraycopy_range_checks:");
 2028 
 2029     assert_different_registers(rscratch1, temp);
 2030 
 2031     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2032     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2033     __ addw(temp, length, src_pos);
 2034     __ cmpw(temp, rscratch1);
 2035     __ br(Assembler::HI, L_failed);
 2036 
 2037     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2038     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2039     __ addw(temp, length, dst_pos);
 2040     __ cmpw(temp, rscratch1);
 2041     __ br(Assembler::HI, L_failed);
 2042 
 2043     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
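          // (A 32-bit register write zero-extends into bits 63:32, so movw with
          //  the same source and destination clears the high half.)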
 2044     __ movw(src_pos, src_pos);
 2045     __ movw(dst_pos, dst_pos);
 2046 
 2047     BLOCK_COMMENT("arraycopy_range_checks done");
 2048   }
 2049 
 2050   // These stubs get called from some dumb test routine.
 2051   // I'll write them properly when they're called from
 2052   // something that's actually doing something.
 2053   static void fake_arraycopy_stub(address src, address dst, int count) {
 2054     assert(count == 0, "huh?");
 2055   }
 2056 
 2057 
 2058   //
 2059   //  Generate 'unsafe' array copy stub
 2060   //  Though just as safe as the other stubs, it takes an unscaled
 2061   //  size_t argument instead of an element count.
 2062   //
 2063   //  Input:
 2064   //    c_rarg0   - source array address
 2065   //    c_rarg1   - destination array address
 2066   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2067   //
 2068   // Examines the alignment of the operands and dispatches
 2069   // to a long, int, short, or byte copy loop.
 2070   //
 2071   address generate_unsafe_copy(address byte_copy_entry,
 2072                                address short_copy_entry,
 2073                                address int_copy_entry,
 2074                                address long_copy_entry) {
 2075     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
 2076 
 2077     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2078     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2079 
 2080     __ align(CodeEntryAlignment);
 2081     StubCodeMark mark(this, stub_id);
 2082     address start = __ pc();
 2083     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2084 
 2085     // bump this on entry, not on exit:
 2086     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2087 
 2088     __ orr(rscratch1, s, d);
 2089     __ orr(rscratch1, rscratch1, count);
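          // rscratch1 is now s | d | count, so the alignment tests below cover
          // source, destination and length with a single value.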
 2090 
 2091     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2092     __ cbz(rscratch1, L_long_aligned);
 2093     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2094     __ cbz(rscratch1, L_int_aligned);
 2095     __ tbz(rscratch1, 0, L_short_aligned);
 2096     __ b(RuntimeAddress(byte_copy_entry));
 2097 
 2098     __ BIND(L_short_aligned);
 2099     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2100     __ b(RuntimeAddress(short_copy_entry));
 2101     __ BIND(L_int_aligned);
 2102     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2103     __ b(RuntimeAddress(int_copy_entry));
 2104     __ BIND(L_long_aligned);
 2105     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2106     __ b(RuntimeAddress(long_copy_entry));
 2107 
 2108     return start;
 2109   }
 2110 
 2111   //
 2112   //  Generate generic array copy stubs
 2113   //
 2114   //  Input:
 2115   //    c_rarg0    -  src oop
 2116   //    c_rarg1    -  src_pos (32-bits)
 2117   //    c_rarg2    -  dst oop
 2118   //    c_rarg3    -  dst_pos (32-bits)
 2119   //    c_rarg4    -  element count (32-bits)
 2120   //
 2121   //  Output:
 2122   //    r0 ==  0  -  success
 2123   //    r0 == -1^K - failure, where K is partial transfer count
 2124   //
 2125   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2126                                 address int_copy_entry, address oop_copy_entry,
 2127                                 address long_copy_entry, address checkcast_copy_entry) {
 2128     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
 2129 
 2130     Label L_failed, L_objArray;
 2131     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2132 
 2133     // Input registers
 2134     const Register src        = c_rarg0;  // source array oop
 2135     const Register src_pos    = c_rarg1;  // source position
 2136     const Register dst        = c_rarg2;  // destination array oop
 2137     const Register dst_pos    = c_rarg3;  // destination position
 2138     const Register length     = c_rarg4;
 2139 
 2140 
 2141     // Registers used as temps
 2142     const Register dst_klass  = c_rarg5;
 2143 
 2144     __ align(CodeEntryAlignment);
 2145 
 2146     StubCodeMark mark(this, stub_id);
 2147 
 2148     address start = __ pc();
 2149 
 2150     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2151 
 2152     // bump this on entry, not on exit:
 2153     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2154 
 2155     //-----------------------------------------------------------------------
 2156     // Assembler stub will be used for this call to arraycopy
 2157     // if the following conditions are met:
 2158     //
 2159     // (1) src and dst must not be null.
 2160     // (2) src_pos must not be negative.
 2161     // (3) dst_pos must not be negative.
 2162     // (4) length  must not be negative.
 2163     // (5) src klass and dst klass should be the same and not null.
 2164     // (6) src and dst should be arrays.
 2165     // (7) src_pos + length must not exceed length of src.
 2166     // (8) dst_pos + length must not exceed length of dst.
 2167     //
 2168 
 2169     //  if (src == nullptr) return -1;
 2170     __ cbz(src, L_failed);
 2171 
 2172     //  if (src_pos < 0) return -1;
 2173     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2174 
 2175     //  if (dst == nullptr) return -1;
 2176     __ cbz(dst, L_failed);
 2177 
 2178     //  if (dst_pos < 0) return -1;
 2179     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2180 
 2181     // registers used as temp
 2182     const Register scratch_length    = r16; // elements count to copy
 2183     const Register scratch_src_klass = r17; // array klass
 2184     const Register lh                = r15; // layout helper
 2185 
 2186     //  if (length < 0) return -1;
 2187     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2188     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2189 
 2190     __ load_klass(scratch_src_klass, src);
 2191 #ifdef ASSERT
 2192     //  assert(src->klass() != nullptr);
 2193     {
 2194       BLOCK_COMMENT("assert klasses not null {");
 2195       Label L1, L2;
 2196       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2197       __ bind(L1);
 2198       __ stop("broken null klass");
 2199       __ bind(L2);
 2200       __ load_klass(rscratch1, dst);
 2201       __ cbz(rscratch1, L1);     // this would be broken also
 2202       BLOCK_COMMENT("} assert klasses not null done");
 2203     }
 2204 #endif
 2205 
 2206     // Load layout helper (32-bits)
 2207     //
 2208     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2209     // 32        30    24            16              8     2                 0
 2210     //
 2211     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2212     //
 2213 
 2214     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2215 
 2216     // Handle objArrays completely differently...
 2217     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
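          // The eorw/cbzw pair below is an equality test: the result is zero
          // exactly when src's layout helper matches the objArray pattern.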
 2218     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2219     __ movw(rscratch1, objArray_lh);
 2220     __ eorw(rscratch2, lh, rscratch1);
 2221     __ cbzw(rscratch2, L_objArray);
 2222 
 2223     //  if (src->klass() != dst->klass()) return -1;
 2224     __ load_klass(rscratch2, dst);
 2225     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2226     __ cbnz(rscratch2, L_failed);
 2227 
 2228     //  if (!src->is_Array()) return -1;
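          // Array klasses have negative layout helpers, so a clear sign bit
          // means src is not an array at all.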
 2229     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2230 
 2231     // At this point, it is known to be a typeArray (array_tag 0x3).
 2232 #ifdef ASSERT
 2233     {
 2234       BLOCK_COMMENT("assert primitive array {");
 2235       Label L;
 2236       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2237       __ cmpw(lh, rscratch2);
 2238       __ br(Assembler::GE, L);
 2239       __ stop("must be a primitive array");
 2240       __ bind(L);
 2241       BLOCK_COMMENT("} assert primitive array done");
 2242     }
 2243 #endif
 2244 
 2245     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2246                            rscratch2, L_failed);
 2247 
 2248     // TypeArrayKlass
 2249     //
 2250     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2251     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2252     //
 2253 
 2254     const Register rscratch1_offset = rscratch1;    // array offset
 2255     const Register r15_elsize = lh; // element size
 2256 
 2257     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2258            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2259     __ add(src, src, rscratch1_offset);           // src array offset
 2260     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2261     BLOCK_COMMENT("choose copy loop based on element size");
 2262 
 2263     // next registers should be set before the jump to corresponding stub
 2264     const Register from     = c_rarg0;  // source array address
 2265     const Register to       = c_rarg1;  // destination array address
 2266     const Register count    = c_rarg2;  // elements count
 2267 
 2268     // 'from', 'to' and 'count' must be set in this order, since they
 2269     // alias 'src', 'src_pos' and 'dst' respectively.
 2270 
 2271     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2272 
 2273     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2274     // size in bytes).  We do a simple bitwise binary search.
 2275   __ BIND(L_copy_bytes);
 2276     __ tbnz(r15_elsize, 1, L_copy_ints);
 2277     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2278     __ lea(from, Address(src, src_pos));// src_addr
 2279     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2280     __ movw(count, scratch_length); // length
 2281     __ b(RuntimeAddress(byte_copy_entry));
 2282 
 2283   __ BIND(L_copy_shorts);
 2284     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2285     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2286     __ movw(count, scratch_length); // length
 2287     __ b(RuntimeAddress(short_copy_entry));
 2288 
 2289   __ BIND(L_copy_ints);
 2290     __ tbnz(r15_elsize, 0, L_copy_longs);
 2291     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2292     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2293     __ movw(count, scratch_length); // length
 2294     __ b(RuntimeAddress(int_copy_entry));
 2295 
 2296   __ BIND(L_copy_longs);
 2297 #ifdef ASSERT
 2298     {
 2299       BLOCK_COMMENT("assert long copy {");
 2300       Label L;
 2301       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2302       __ cmpw(r15_elsize, LogBytesPerLong);
 2303       __ br(Assembler::EQ, L);
 2304       __ stop("must be long copy, but elsize is wrong");
 2305       __ bind(L);
 2306       BLOCK_COMMENT("} assert long copy done");
 2307     }
 2308 #endif
 2309     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2310     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2311     __ movw(count, scratch_length); // length
 2312     __ b(RuntimeAddress(long_copy_entry));
 2313 
 2314     // ObjArrayKlass
 2315   __ BIND(L_objArray);
 2316     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2317 
 2318     Label L_plain_copy, L_checkcast_copy;
 2319     //  test array classes for subtyping
 2320     __ load_klass(r15, dst);
 2321     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2322     __ br(Assembler::NE, L_checkcast_copy);
 2323 
 2324     // Identically typed arrays can be copied without element-wise checks.
 2325     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2326                            rscratch2, L_failed);
 2327 
 2328     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2329     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2330     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2331     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2332     __ movw(count, scratch_length); // length
 2333   __ BIND(L_plain_copy);
 2334     __ b(RuntimeAddress(oop_copy_entry));
 2335 
 2336   __ BIND(L_checkcast_copy);
 2337     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2338     {
 2339       // Before looking at dst.length, make sure dst is also an objArray.
 2340       __ ldrw(rscratch1, Address(r15, lh_offset));
 2341       __ movw(rscratch2, objArray_lh);
 2342       __ eorw(rscratch1, rscratch1, rscratch2);
 2343       __ cbnzw(rscratch1, L_failed);
 2344 
 2345       // It is safe to examine both src.length and dst.length.
 2346       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2347                              r15, L_failed);
 2348 
 2349       __ load_klass(dst_klass, dst); // reload
 2350 
 2351       // Marshal the base address arguments now, freeing registers.
 2352       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2353       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2354       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2355       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2356       __ movw(count, length);           // length (reloaded)
 2357       Register sco_temp = c_rarg3;      // this register is free now
 2358       assert_different_registers(from, to, count, sco_temp,
 2359                                  dst_klass, scratch_src_klass);
 2360       // assert_clean_int(count, sco_temp);
 2361 
 2362       // Generate the type check.
 2363       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2364       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2365 
 2366       // Smashes rscratch1, rscratch2
 2367       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2368                           L_plain_copy);
 2369 
 2370       // Fetch destination element klass from the ObjArrayKlass header.
 2371       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2372       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2373       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2374 
 2375       // the checkcast_copy loop needs two extra arguments:
 2376       assert(c_rarg3 == sco_temp, "#3 already in place");
 2377       // Set up arguments for checkcast_copy_entry.
 2378       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2379       __ b(RuntimeAddress(checkcast_copy_entry));
 2380     }
 2381 
 2382   __ BIND(L_failed);
 2383     __ mov(r0, -1);
 2384     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2385     __ ret(lr);
 2386 
 2387     return start;
 2388   }
 2389 
 2390   //
 2391   // Generate stub for array fill. If "aligned" is true, the
 2392   // "to" address is assumed to be heapword aligned.
 2393   //
 2394   // Arguments for generated stub:
 2395   //   to:    c_rarg0
 2396   //   value: c_rarg1
 2397   //   count: c_rarg2 treated as signed
 2398   //
 2399   address generate_fill(StubGenStubId stub_id) {
 2400     BasicType t;
 2401     bool aligned;
 2402 
 2403     switch (stub_id) {
 2404     case jbyte_fill_id:
 2405       t = T_BYTE;
 2406       aligned = false;
 2407       break;
 2408     case jshort_fill_id:
 2409       t = T_SHORT;
 2410       aligned = false;
 2411       break;
 2412     case jint_fill_id:
 2413       t = T_INT;
 2414       aligned = false;
 2415       break;
 2416     case arrayof_jbyte_fill_id:
 2417       t = T_BYTE;
 2418       aligned = true;
 2419       break;
 2420     case arrayof_jshort_fill_id:
 2421       t = T_SHORT;
 2422       aligned = true;
 2423       break;
 2424     case arrayof_jint_fill_id:
 2425       t = T_INT;
 2426       aligned = true;
 2427       break;
 2428     default:
 2429       ShouldNotReachHere();
 2430     };
 2431 
 2432     __ align(CodeEntryAlignment);
 2433     StubCodeMark mark(this, stub_id);
 2434     address start = __ pc();
 2435 
 2436     BLOCK_COMMENT("Entry:");
 2437 
 2438     const Register to        = c_rarg0;  // destination array address
 2439     const Register value     = c_rarg1;  // value
 2440     const Register count     = c_rarg2;  // elements count
 2441 
 2442     const Register bz_base = r10;        // base for block_zero routine
 2443     const Register cnt_words = r11;      // temp register
 2444 
 2445     __ enter();
 2446 
 2447     Label L_fill_elements, L_exit1;
 2448 
 2449     int shift = -1;
 2450     switch (t) {
 2451       case T_BYTE:
 2452         shift = 0;
 2453         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2454         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2455         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2456         __ br(Assembler::LO, L_fill_elements);
 2457         break;
 2458       case T_SHORT:
 2459         shift = 1;
 2460         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2461         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2462         __ br(Assembler::LO, L_fill_elements);
 2463         break;
 2464       case T_INT:
 2465         shift = 2;
 2466         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2467         __ br(Assembler::LO, L_fill_elements);
 2468         break;
 2469       default: ShouldNotReachHere();
 2470     }
 2471 
 2472     // Align the destination address to an 8-byte boundary.
 2473     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2474     if (!aligned) {
 2475       switch (t) {
 2476         case T_BYTE:
 2477           // One byte misalignment happens only for byte arrays.
 2478           __ tbz(to, 0, L_skip_align1);
 2479           __ strb(value, Address(__ post(to, 1)));
 2480           __ subw(count, count, 1);
 2481           __ bind(L_skip_align1);
 2482           // Fallthrough
 2483         case T_SHORT:
 2484           // Two bytes misalignment happens only for byte and short (char) arrays.
 2485           __ tbz(to, 1, L_skip_align2);
 2486           __ strh(value, Address(__ post(to, 2)));
 2487           __ subw(count, count, 2 >> shift);
 2488           __ bind(L_skip_align2);
 2489           // Fallthrough
 2490         case T_INT:
 2491           // Align to 8 bytes; we know we are 4-byte aligned to start.
 2492           __ tbz(to, 2, L_skip_align4);
 2493           __ strw(value, Address(__ post(to, 4)));
 2494           __ subw(count, count, 4 >> shift);
 2495           __ bind(L_skip_align4);
 2496           break;
 2497         default: ShouldNotReachHere();
 2498       }
 2499     }
 2500 
 2501     //
 2502     //  Fill large chunks
 2503     //
 2504     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2505     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2506     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
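          // 'value' now holds the fill pattern replicated across a full 64-bit
          // word; cnt_words is the number of 8-byte words to fill and count the
          // elements left over.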
 2507     if (UseBlockZeroing) {
 2508       Label non_block_zeroing, rest;
 2509       // If the fill value is zero we can use the fast zero_words().
 2510       __ cbnz(value, non_block_zeroing);
 2511       __ mov(bz_base, to);
 2512       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2513       address tpc = __ zero_words(bz_base, cnt_words);
 2514       if (tpc == nullptr) {
 2515         fatal("CodeCache is full at generate_fill");
 2516       }
 2517       __ b(rest);
 2518       __ bind(non_block_zeroing);
 2519       __ fill_words(to, cnt_words, value);
 2520       __ bind(rest);
 2521     } else {
 2522       __ fill_words(to, cnt_words, value);
 2523     }
 2524 
 2525     // Remaining count is less than 8 bytes. Fill it by a single store.
 2526     // Note that the total length is no less than 8 bytes.
 2527     if (t == T_BYTE || t == T_SHORT) {
 2528       Label L_exit1;
 2529       __ cbzw(count, L_exit1);
 2530       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2531       __ str(value, Address(to, -8));    // overwrite some elements
 2532       __ bind(L_exit1);
 2533       __ leave();
 2534       __ ret(lr);
 2535     }
 2536 
 2537     // Handle fills of less than 8 bytes.
 2538     Label L_fill_2, L_fill_4, L_exit2;
 2539     __ bind(L_fill_elements);
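          // Fewer than 8 bytes remain; the set bits of count select which of the
          // 1-, 2- and 4-byte stores below are needed.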
 2540     switch (t) {
 2541       case T_BYTE:
 2542         __ tbz(count, 0, L_fill_2);
 2543         __ strb(value, Address(__ post(to, 1)));
 2544         __ bind(L_fill_2);
 2545         __ tbz(count, 1, L_fill_4);
 2546         __ strh(value, Address(__ post(to, 2)));
 2547         __ bind(L_fill_4);
 2548         __ tbz(count, 2, L_exit2);
 2549         __ strw(value, Address(to));
 2550         break;
 2551       case T_SHORT:
 2552         __ tbz(count, 0, L_fill_4);
 2553         __ strh(value, Address(__ post(to, 2)));
 2554         __ bind(L_fill_4);
 2555         __ tbz(count, 1, L_exit2);
 2556         __ strw(value, Address(to));
 2557         break;
 2558       case T_INT:
 2559         __ cbzw(count, L_exit2);
 2560         __ strw(value, Address(to));
 2561         break;
 2562       default: ShouldNotReachHere();
 2563     }
 2564     __ bind(L_exit2);
 2565     __ leave();
 2566     __ ret(lr);
 2567     return start;
 2568   }
 2569 
 2570   address generate_data_cache_writeback() {
 2571     const Register line        = c_rarg0;  // address of line to write back
 2572 
 2573     __ align(CodeEntryAlignment);
 2574 
 2575     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id;
 2576     StubCodeMark mark(this, stub_id);
 2577 
 2578     address start = __ pc();
 2579     __ enter();
 2580     __ cache_wb(Address(line, 0));
 2581     __ leave();
 2582     __ ret(lr);
 2583 
 2584     return start;
 2585   }
 2586 
 2587   address generate_data_cache_writeback_sync() {
 2588     const Register is_pre     = c_rarg0;  // pre or post sync
 2589 
 2590     __ align(CodeEntryAlignment);
 2591 
 2592     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id;
 2593     StubCodeMark mark(this, stub_id);
 2594 
 2595     // pre wbsync is a no-op
 2596     // post wbsync requires a store barrier (the AArch64 counterpart of x86's sfence)
 2597 
 2598     Label skip;
 2599     address start = __ pc();
 2600     __ enter();
 2601     __ cbnz(is_pre, skip);
 2602     __ cache_wbsync(false);
 2603     __ bind(skip);
 2604     __ leave();
 2605     __ ret(lr);
 2606 
 2607     return start;
 2608   }
 2609 
 2610   void generate_arraycopy_stubs() {
 2611     address entry;
 2612     address entry_jbyte_arraycopy;
 2613     address entry_jshort_arraycopy;
 2614     address entry_jint_arraycopy;
 2615     address entry_oop_arraycopy;
 2616     address entry_jlong_arraycopy;
 2617     address entry_checkcast_arraycopy;
 2618 
 2619     generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
 2620     generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
 2621 
 2622     generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
 2623     generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
 2624 
 2625     generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
 2626     generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
 2627 
 2628     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2629 
 2630     //*** jbyte
 2631     // Always need aligned and unaligned versions
 2632     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
 2633     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
 2634     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
 2635     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
 2636 
 2637     //*** jshort
 2638     // Always need aligned and unaligned versions
 2639     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
 2640     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
 2641     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
 2642     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
 2643 
 2644     //*** jint
 2645     // Aligned versions
 2646     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
 2647     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2648     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2649     // entry_jint_arraycopy always points to the unaligned version
 2650     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
 2651     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2652 
 2653     //*** jlong
 2654     // It is always aligned
 2655     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
 2656     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
 2657     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2658     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2659 
 2660     //*** oops
 2661     {
 2662       // With compressed oops we need unaligned versions; notice that
 2663       // we overwrite entry_oop_arraycopy.
 2664       bool aligned = !UseCompressedOops;
 2665 
 2666       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2667         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
 2668       StubRoutines::_arrayof_oop_arraycopy
 2669         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
 2670       // Aligned versions without pre-barriers
 2671       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2672         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
 2673       StubRoutines::_arrayof_oop_arraycopy_uninit
 2674         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
 2675     }
 2676 
 2677     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2678     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2679     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2680     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2681 
 2682     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 2683     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
 2684 
 2685     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 2686                                                               entry_jshort_arraycopy,
 2687                                                               entry_jint_arraycopy,
 2688                                                               entry_jlong_arraycopy);
 2689 
 2690     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 2691                                                                entry_jshort_arraycopy,
 2692                                                                entry_jint_arraycopy,
 2693                                                                entry_oop_arraycopy,
 2694                                                                entry_jlong_arraycopy,
 2695                                                                entry_checkcast_arraycopy);
 2696 
 2697     StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
 2698     StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
 2699     StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
 2700     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
 2701     StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
 2702     StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
 2703   }
 2704 
 2705   void generate_math_stubs() { Unimplemented(); }
 2706 
 2707   // Arguments:
 2708   //
 2709   // Inputs:
 2710   //   c_rarg0   - source byte array address
 2711   //   c_rarg1   - destination byte array address
 2712   //   c_rarg2   - K (key) in little endian int array
 2713   //
 2714   address generate_aescrypt_encryptBlock() {
 2715     __ align(CodeEntryAlignment);
 2716     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
 2717     StubCodeMark mark(this, stub_id);
 2718 
 2719     const Register from        = c_rarg0;  // source array address
 2720     const Register to          = c_rarg1;  // destination array address
 2721     const Register key         = c_rarg2;  // key array address
 2722     const Register keylen      = rscratch1;
 2723 
 2724     address start = __ pc();
 2725     __ enter();
 2726 
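          // 'key' addresses the first element of the key int array, so the array
          // length field is found at a negative offset from it.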
 2727     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2728 
 2729     __ aesenc_loadkeys(key, keylen);
 2730     __ aesecb_encrypt(from, to, keylen);
 2731 
 2732     __ mov(r0, 0);
 2733 
 2734     __ leave();
 2735     __ ret(lr);
 2736 
 2737     return start;
 2738   }
 2739 
 2740   // Arguments:
 2741   //
 2742   // Inputs:
 2743   //   c_rarg0   - source byte array address
 2744   //   c_rarg1   - destination byte array address
 2745   //   c_rarg2   - K (key) in little endian int array
 2746   //
 2747   address generate_aescrypt_decryptBlock() {
 2748     assert(UseAES, "need AES cryptographic extension support");
 2749     __ align(CodeEntryAlignment);
 2750     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
 2751     StubCodeMark mark(this, stub_id);
 2752     Label L_doLast;
 2753 
 2754     const Register from        = c_rarg0;  // source array address
 2755     const Register to          = c_rarg1;  // destination array address
 2756     const Register key         = c_rarg2;  // key array address
 2757     const Register keylen      = rscratch1;
 2758 
 2759     address start = __ pc();
 2760     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2761 
 2762     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2763 
 2764     __ aesecb_decrypt(from, to, key, keylen);
 2765 
 2766     __ mov(r0, 0);
 2767 
 2768     __ leave();
 2769     __ ret(lr);
 2770 
 2771     return start;
 2772   }
 2773 
 2774   // Arguments:
 2775   //
 2776   // Inputs:
 2777   //   c_rarg0   - source byte array address
 2778   //   c_rarg1   - destination byte array address
 2779   //   c_rarg2   - K (key) in little endian int array
 2780   //   c_rarg3   - r vector byte array address
 2781   //   c_rarg4   - input length
 2782   //
 2783   // Output:
 2784   //   r0        - input length
 2785   //
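        // A rough reference sketch of the CBC chaining this stub implements
        // (pseudocode only; AES_encrypt stands for the AESE/AESMC round
        // sequence generated below):
        //
        //   r = rvec;                      // 16-byte IV / carry block
        //   for (i = 0; i < len; i += 16) {
        //     r = AES_encrypt(src[i..i+15] ^ r, key);
        //     dst[i..i+15] = r;
        //   }
        //   rvec = r;                      // carried into the next call
        //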
 2786   address generate_cipherBlockChaining_encryptAESCrypt() {
 2787     assert(UseAES, "need AES cryptographic extension support");
 2788     __ align(CodeEntryAlignment);
 2789     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
 2790     StubCodeMark mark(this, stub_id);
 2791 
 2792     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2793 
 2794     const Register from        = c_rarg0;  // source array address
 2795     const Register to          = c_rarg1;  // destination array address
 2796     const Register key         = c_rarg2;  // key array address
 2797     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector;
 2798                                            // on exit it holds the last encrypted block
 2799     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2800     const Register keylen      = rscratch1;
 2801 
 2802     address start = __ pc();
 2803 
 2804       __ enter();
 2805 
 2806       __ movw(rscratch2, len_reg);
 2807 
 2808       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2809 
 2810       __ ld1(v0, __ T16B, rvec);
 2811 
 2812       __ cmpw(keylen, 52);
 2813       __ br(Assembler::CC, L_loadkeys_44);
 2814       __ br(Assembler::EQ, L_loadkeys_52);
 2815 
 2816       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2817       __ rev32(v17, __ T16B, v17);
 2818       __ rev32(v18, __ T16B, v18);
 2819     __ BIND(L_loadkeys_52);
 2820       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2821       __ rev32(v19, __ T16B, v19);
 2822       __ rev32(v20, __ T16B, v20);
 2823     __ BIND(L_loadkeys_44);
 2824       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2825       __ rev32(v21, __ T16B, v21);
 2826       __ rev32(v22, __ T16B, v22);
 2827       __ rev32(v23, __ T16B, v23);
 2828       __ rev32(v24, __ T16B, v24);
 2829       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2830       __ rev32(v25, __ T16B, v25);
 2831       __ rev32(v26, __ T16B, v26);
 2832       __ rev32(v27, __ T16B, v27);
 2833       __ rev32(v28, __ T16B, v28);
 2834       __ ld1(v29, v30, v31, __ T16B, key);
 2835       __ rev32(v29, __ T16B, v29);
 2836       __ rev32(v30, __ T16B, v30);
 2837       __ rev32(v31, __ T16B, v31);
 2838 
 2839     __ BIND(L_aes_loop);
 2840       __ ld1(v1, __ T16B, __ post(from, 16));
 2841       __ eor(v0, __ T16B, v0, v1);
 2842 
 2843       __ br(Assembler::CC, L_rounds_44);
 2844       __ br(Assembler::EQ, L_rounds_52);
 2845 
 2846       __ aese(v0, v17); __ aesmc(v0, v0);
 2847       __ aese(v0, v18); __ aesmc(v0, v0);
 2848     __ BIND(L_rounds_52);
 2849       __ aese(v0, v19); __ aesmc(v0, v0);
 2850       __ aese(v0, v20); __ aesmc(v0, v0);
 2851     __ BIND(L_rounds_44);
 2852       __ aese(v0, v21); __ aesmc(v0, v0);
 2853       __ aese(v0, v22); __ aesmc(v0, v0);
 2854       __ aese(v0, v23); __ aesmc(v0, v0);
 2855       __ aese(v0, v24); __ aesmc(v0, v0);
 2856       __ aese(v0, v25); __ aesmc(v0, v0);
 2857       __ aese(v0, v26); __ aesmc(v0, v0);
 2858       __ aese(v0, v27); __ aesmc(v0, v0);
 2859       __ aese(v0, v28); __ aesmc(v0, v0);
 2860       __ aese(v0, v29); __ aesmc(v0, v0);
 2861       __ aese(v0, v30);
 2862       __ eor(v0, __ T16B, v0, v31);
 2863 
 2864       __ st1(v0, __ T16B, __ post(to, 16));
 2865 
 2866       __ subw(len_reg, len_reg, 16);
 2867       __ cbnzw(len_reg, L_aes_loop);
 2868 
 2869       __ st1(v0, __ T16B, rvec);
 2870 
 2871       __ mov(r0, rscratch2);
 2872 
 2873       __ leave();
 2874       __ ret(lr);
 2875 
 2876       return start;
 2877   }
 2878 
 2879   // Arguments:
 2880   //
 2881   // Inputs:
 2882   //   c_rarg0   - source byte array address
 2883   //   c_rarg1   - destination byte array address
 2884   //   c_rarg2   - K (key) in little endian int array
 2885   //   c_rarg3   - r vector byte array address
 2886   //   c_rarg4   - input length
 2887   //
 2888   // Output:
 2889   //   r0        - input length
 2890   //
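        // A rough reference sketch of the CBC decryption this stub implements
        // (pseudocode only; AES_decrypt stands for the AESD/AESIMC round
        // sequence generated below):
        //
        //   r = rvec;                      // 16-byte IV / carry block
        //   for (i = 0; i < len; i += 16) {
        //     c = src[i..i+15];
        //     dst[i..i+15] = AES_decrypt(c, key) ^ r;
        //     r = c;
        //   }
        //   rvec = r;                      // last ciphertext block
        //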
 2891   address generate_cipherBlockChaining_decryptAESCrypt() {
 2892     assert(UseAES, "need AES cryptographic extension support");
 2893     __ align(CodeEntryAlignment);
 2894     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
 2895     StubCodeMark mark(this, stub_id);
 2896 
 2897     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2898 
 2899     const Register from        = c_rarg0;  // source array address
 2900     const Register to          = c_rarg1;  // destination array address
 2901     const Register key         = c_rarg2;  // key array address
 2902     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector;
 2903                                            // on exit it holds the last ciphertext block
 2904     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2905     const Register keylen      = rscratch1;
 2906 
 2907     address start = __ pc();
 2908 
 2909       __ enter();
 2910 
 2911       __ movw(rscratch2, len_reg);
 2912 
 2913       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2914 
 2915       __ ld1(v2, __ T16B, rvec);
 2916 
 2917       __ ld1(v31, __ T16B, __ post(key, 16));
 2918       __ rev32(v31, __ T16B, v31);
 2919 
 2920       __ cmpw(keylen, 52);
 2921       __ br(Assembler::CC, L_loadkeys_44);
 2922       __ br(Assembler::EQ, L_loadkeys_52);
 2923 
 2924       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2925       __ rev32(v17, __ T16B, v17);
 2926       __ rev32(v18, __ T16B, v18);
 2927     __ BIND(L_loadkeys_52);
 2928       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2929       __ rev32(v19, __ T16B, v19);
 2930       __ rev32(v20, __ T16B, v20);
 2931     __ BIND(L_loadkeys_44);
 2932       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2933       __ rev32(v21, __ T16B, v21);
 2934       __ rev32(v22, __ T16B, v22);
 2935       __ rev32(v23, __ T16B, v23);
 2936       __ rev32(v24, __ T16B, v24);
 2937       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2938       __ rev32(v25, __ T16B, v25);
 2939       __ rev32(v26, __ T16B, v26);
 2940       __ rev32(v27, __ T16B, v27);
 2941       __ rev32(v28, __ T16B, v28);
 2942       __ ld1(v29, v30, __ T16B, key);
 2943       __ rev32(v29, __ T16B, v29);
 2944       __ rev32(v30, __ T16B, v30);
 2945 
 2946     __ BIND(L_aes_loop);
 2947       __ ld1(v0, __ T16B, __ post(from, 16));
 2948       __ orr(v1, __ T16B, v0, v0);
 2949 
 2950       __ br(Assembler::CC, L_rounds_44);
 2951       __ br(Assembler::EQ, L_rounds_52);
 2952 
 2953       __ aesd(v0, v17); __ aesimc(v0, v0);
 2954       __ aesd(v0, v18); __ aesimc(v0, v0);
 2955     __ BIND(L_rounds_52);
 2956       __ aesd(v0, v19); __ aesimc(v0, v0);
 2957       __ aesd(v0, v20); __ aesimc(v0, v0);
 2958     __ BIND(L_rounds_44);
 2959       __ aesd(v0, v21); __ aesimc(v0, v0);
 2960       __ aesd(v0, v22); __ aesimc(v0, v0);
 2961       __ aesd(v0, v23); __ aesimc(v0, v0);
 2962       __ aesd(v0, v24); __ aesimc(v0, v0);
 2963       __ aesd(v0, v25); __ aesimc(v0, v0);
 2964       __ aesd(v0, v26); __ aesimc(v0, v0);
 2965       __ aesd(v0, v27); __ aesimc(v0, v0);
 2966       __ aesd(v0, v28); __ aesimc(v0, v0);
 2967       __ aesd(v0, v29); __ aesimc(v0, v0);
 2968       __ aesd(v0, v30);
 2969       __ eor(v0, __ T16B, v0, v31);
 2970       __ eor(v0, __ T16B, v0, v2);
 2971 
 2972       __ st1(v0, __ T16B, __ post(to, 16));
 2973       __ orr(v2, __ T16B, v1, v1);
 2974 
 2975       __ subw(len_reg, len_reg, 16);
 2976       __ cbnzw(len_reg, L_aes_loop);
 2977 
 2978       __ st1(v2, __ T16B, rvec);
 2979 
 2980       __ mov(r0, rscratch2);
 2981 
 2982       __ leave();
 2983       __ ret(lr);
 2984 
 2985     return start;
 2986   }
 2987 
 2988   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 2989   // Inputs: in (the 128-bit addend) is preserved.
 2990   // The least-significant 64-bit word is in the upper dword of each vector.
 2991   // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
 2992   // Output: result
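        //
        // Scalar equivalent, for illustration only (ls/ms denote the logical
        // least/most significant 64-bit halves, independent of the lane
        // layout described above):
        //
        //   uint64_t ls = in_ls + inc;
        //   uint64_t ms = in_ms + (ls < inc);   // carry out of the low half
        //   result = (ms, ls);
        //
        // The vector code derives the carry with an unsigned compare
        // (all-ones on overflow) and then subtracts that -1 from the most
        // significant dword.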
 2993   void be_add_128_64(FloatRegister result, FloatRegister in,
 2994                      FloatRegister inc, FloatRegister tmp) {
 2995     assert_different_registers(result, tmp, inc);
 2996 
 2997     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 2998                                            // input
 2999     __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
 3000     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3001                                            // MSD == 0 (must be!) to LSD
 3002     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3003   }
 3004 
 3005   // CTR AES crypt.
 3006   // Arguments:
 3007   //
 3008   // Inputs:
 3009   //   c_rarg0   - source byte array address
 3010   //   c_rarg1   - destination byte array address
 3011   //   c_rarg2   - K (key) in little endian int array
 3012   //   c_rarg3   - counter vector byte array address
 3013   //   c_rarg4   - input length
 3014   //   c_rarg5   - saved encryptedCounter start
 3015   //   c_rarg6   - saved used length
 3016   //
 3017   // Output:
 3018   //   r0       - input length
 3019   //
 3020   address generate_counterMode_AESCrypt() {
 3021     const Register in = c_rarg0;
 3022     const Register out = c_rarg1;
 3023     const Register key = c_rarg2;
 3024     const Register counter = c_rarg3;
 3025     const Register saved_len = c_rarg4, len = r10;
 3026     const Register saved_encrypted_ctr = c_rarg5;
 3027     const Register used_ptr = c_rarg6, used = r12;
 3028 
 3029     const Register offset = r7;
 3030     const Register keylen = r11;
 3031 
 3032     const unsigned char block_size = 16;
 3033     const int bulk_width = 4;
 3034     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3035     // performance with larger data sizes, but it also means that the
 3036     // fast path isn't used until you have at least 8 blocks, and up
 3037     // to 127 bytes of data will be executed on the slow path. For
 3038     // that reason, and also so as not to blow away too much icache, 4
 3039     // blocks seems like a sensible compromise.
 3040 
 3041     // Algorithm:
 3042     //
 3043     //    if (len == 0) {
 3044     //        goto DONE;
 3045     //    }
 3046     //    int result = len;
 3047     //    do {
 3048     //        if (used >= blockSize) {
 3049     //            if (len >= bulk_width * blockSize) {
 3050     //                CTR_large_block();
 3051     //                if (len == 0)
 3052     //                    goto DONE;
 3053     //            }
 3054     //            for (;;) {
 3055     //                16ByteVector v0 = counter;
 3056     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3057     //                used = 0;
 3058     //                if (len < blockSize)
 3059     //                    break;    /* goto NEXT */
 3060     //                16ByteVector v1 = load16Bytes(in, offset);
 3061     //                v1 = v1 ^ encryptedCounter;
 3062     //                store16Bytes(out, offset);
 3063     //                used = blockSize;
 3064     //                offset += blockSize;
 3065     //                len -= blockSize;
 3066     //                if (len == 0)
 3067     //                    goto DONE;
 3068     //            }
 3069     //        }
 3070     //      NEXT:
 3071     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3072     //        len--;
 3073     //    } while (len != 0);
 3074     //  DONE:
 3075     //    return result;
 3076     //
 3077     // CTR_large_block()
 3078     //    Wide bulk encryption of whole blocks.
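          //    (Sketch) Generate bulk_width consecutive counter values, encrypt
          //    them with a single aesecb_encrypt call (unrolls == bulk_width),
          //    XOR the resulting keystream with bulk_width input blocks, and
          //    repeat while at least bulk_width * blockSize bytes remain; any
          //    remainder falls back to the per-block / per-byte path above.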
 3079 
 3080     __ align(CodeEntryAlignment);
 3081     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
 3082     StubCodeMark mark(this, stub_id);
 3083     const address start = __ pc();
 3084     __ enter();
 3085 
 3086     Label DONE, CTR_large_block, large_block_return;
 3087     __ ldrw(used, Address(used_ptr));
 3088     __ cbzw(saved_len, DONE);
 3089 
 3090     __ mov(len, saved_len);
 3091     __ mov(offset, 0);
 3092 
 3093     // Compute #rounds for AES based on the length of the key array
 3094     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3095 
 3096     __ aesenc_loadkeys(key, keylen);
 3097 
 3098     {
 3099       Label L_CTR_loop, NEXT;
 3100 
 3101       __ bind(L_CTR_loop);
 3102 
 3103       __ cmp(used, block_size);
 3104       __ br(__ LO, NEXT);
 3105 
 3106       // Maybe we have a lot of data
 3107       __ subsw(rscratch1, len, bulk_width * block_size);
 3108       __ br(__ HS, CTR_large_block);
 3109       __ BIND(large_block_return);
 3110       __ cbzw(len, DONE);
 3111 
 3112       // Setup the counter
 3113       __ movi(v4, __ T4S, 0);
 3114       __ movi(v5, __ T4S, 1);
 3115       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3116 
 3117       // 128-bit big-endian increment
 3118       __ ld1(v0, __ T16B, counter);
 3119       __ rev64(v16, __ T16B, v0);
 3120       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3121       __ rev64(v16, __ T16B, v16);
 3122       __ st1(v16, __ T16B, counter);
 3123       // Previous counter value is in v0
 3124       // v4 contains { 0, 1 }
 3125 
 3126       {
 3127         // We have fewer than bulk_width blocks of data left. Encrypt
 3128         // them one by one until there is less than a full block
 3129         // remaining, being careful to save both the encrypted counter
 3130         // and the counter.
 3131 
 3132         Label inner_loop;
 3133         __ bind(inner_loop);
 3134         // Counter to encrypt is in v0
 3135         __ aesecb_encrypt(noreg, noreg, keylen);
 3136         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3137 
 3138         // Do we have a remaining full block?
 3139 
 3140         __ mov(used, 0);
 3141         __ cmp(len, block_size);
 3142         __ br(__ LO, NEXT);
 3143 
 3144         // Yes, we have a full block
 3145         __ ldrq(v1, Address(in, offset));
 3146         __ eor(v1, __ T16B, v1, v0);
 3147         __ strq(v1, Address(out, offset));
 3148         __ mov(used, block_size);
 3149         __ add(offset, offset, block_size);
 3150 
 3151         __ subw(len, len, block_size);
 3152         __ cbzw(len, DONE);
 3153 
 3154         // Increment the counter, store it back
 3155         __ orr(v0, __ T16B, v16, v16);
 3156         __ rev64(v16, __ T16B, v16);
 3157         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3158         __ rev64(v16, __ T16B, v16);
 3159         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3160 
 3161         __ b(inner_loop);
 3162       }
 3163 
 3164       __ BIND(NEXT);
 3165 
 3166       // Encrypt a single byte, and loop.
 3167       // We expect this to be a rare event.
 3168       __ ldrb(rscratch1, Address(in, offset));
 3169       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3170       __ eor(rscratch1, rscratch1, rscratch2);
 3171       __ strb(rscratch1, Address(out, offset));
 3172       __ add(offset, offset, 1);
 3173       __ add(used, used, 1);
 3174       __ subw(len, len, 1);
 3175       __ cbnzw(len, L_CTR_loop);
 3176     }
 3177 
 3178     __ bind(DONE);
 3179     __ strw(used, Address(used_ptr));
 3180     __ mov(r0, saved_len);
 3181 
 3182     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3183     __ ret(lr);
 3184 
 3185     // Bulk encryption
 3186 
 3187     __ BIND (CTR_large_block);
 3188     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3189 
 3190     if (bulk_width == 8) {
 3191       __ sub(sp, sp, 4 * 16);
 3192       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3193     }
 3194     __ sub(sp, sp, 4 * 16);
 3195     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3196     RegSet saved_regs = (RegSet::of(in, out, offset)
 3197                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3198     __ push(saved_regs, sp);
 3199     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3200     __ add(in, in, offset);
 3201     __ add(out, out, offset);
 3202 
 3203     // Keys should already be loaded into the correct registers
 3204 
 3205     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3206     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3207 
 3208     // AES/CTR loop
 3209     {
 3210       Label L_CTR_loop;
 3211       __ BIND(L_CTR_loop);
 3212 
 3213       // Setup the counters
 3214       __ movi(v8, __ T4S, 0);
 3215       __ movi(v9, __ T4S, 1);
 3216       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3217 
 3218       for (int i = 0; i < bulk_width; i++) {
 3219         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3220         __ rev64(v0_ofs, __ T16B, v16);
 3221         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3222       }
 3223 
 3224       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3225 
 3226       // Encrypt the counters
 3227       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3228 
 3229       if (bulk_width == 8) {
 3230         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3231       }
 3232 
 3233       // XOR the encrypted counters with the inputs
 3234       for (int i = 0; i < bulk_width; i++) {
 3235         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3236         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3237         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3238       }
 3239 
 3240       // Write the encrypted data
 3241       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3242       if (bulk_width == 8) {
 3243         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3244       }
 3245 
 3246       __ subw(len, len, 16 * bulk_width);
 3247       __ cbnzw(len, L_CTR_loop);
 3248     }
 3249 
 3250     // Save the counter back where it goes
 3251     __ rev64(v16, __ T16B, v16);
 3252     __ st1(v16, __ T16B, counter);
 3253 
 3254     __ pop(saved_regs, sp);
 3255 
 3256     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3257     if (bulk_width == 8) {
 3258       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3259     }
 3260 
 3261     __ andr(rscratch1, len, -16 * bulk_width);
 3262     __ sub(len, len, rscratch1);
 3263     __ add(offset, offset, rscratch1);
 3264     __ mov(used, 16);
 3265     __ strw(used, Address(used_ptr));
 3266     __ b(large_block_return);
 3267 
 3268     return start;
 3269   }
 3270 
 3271   // Vector AES Galois Counter Mode implementation. Parameters:
 3272   //
 3273   // in = c_rarg0
 3274   // len = c_rarg1
 3275   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3276   // out = c_rarg3
 3277   // key = c_rarg4
 3278   // state = c_rarg5 - GHASH.state
 3279   // subkeyHtbl = c_rarg6 - powers of H
 3280   // counter = c_rarg7 - 16 bytes of CTR
 3281   // return - number of processed bytes
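        //
        // High-level sketch of what this stub computes (illustration only;
        // pseudocode, not the generated code):
        //
        //   while (len >= 8 * 16) {
        //     keystream[0..7] = AES_encrypt(counter++, key);   // CTR mode
        //     out[0..7]       = in[0..7] ^ keystream[0..7];
        //     len -= 8 * 16;
        //   }
        //   state = GHASH(state, subkeyHtbl, ct);              // over the ciphertext
        //   return bytes_processed;  // a multiple of 128; any remaining tail is left to the caller
        //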
 3282   address generate_galoisCounterMode_AESCrypt() {
 3283     address ghash_polynomial = __ pc();
 3284     __ emit_int64(0x87);  // The low-order bits of the field
 3285                           // polynomial (i.e. p = z^7+z^2+z+1)
 3286                           // repeated in the low and high parts of a
 3287                           // 128-bit vector
 3288     __ emit_int64(0x87);
 3289 
 3290     __ align(CodeEntryAlignment);
 3291     StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id;
 3292     StubCodeMark mark(this, stub_id);
 3293     address start = __ pc();
 3294     __ enter();
 3295 
 3296     const Register in = c_rarg0;
 3297     const Register len = c_rarg1;
 3298     const Register ct = c_rarg2;
 3299     const Register out = c_rarg3;
 3301 
 3302     const Register key = c_rarg4;
 3303     const Register state = c_rarg5;
 3304 
 3305     const Register subkeyHtbl = c_rarg6;
 3306 
 3307     const Register counter = c_rarg7; // also updated with the incremented counter at the end
 3308 
 3309     const Register keylen = r10;
 3310     // Save state before entering routine
 3311     __ sub(sp, sp, 4 * 16);
 3312     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3313     __ sub(sp, sp, 4 * 16);
 3314     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3315 
 3317     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3318     __ str(len, __ pre(sp, -2 * wordSize));
 3319 
 3320     Label DONE;
 3321     __ cbz(len, DONE);
 3322 
 3323     // Compute #rounds for AES based on the length of the key array
 3324     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3325 
 3326     __ aesenc_loadkeys(key, keylen);
 3327     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3328     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3329 
 3330     // AES/CTR loop
 3331     {
 3332       Label L_CTR_loop;
 3333       __ BIND(L_CTR_loop);
 3334 
 3335       // Setup the counters
 3336       __ movi(v8, __ T4S, 0);
 3337       __ movi(v9, __ T4S, 1);
 3338       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3339 
 3340       assert(v0->encoding() < v8->encoding(), "");
 3341       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3342         FloatRegister f = as_FloatRegister(i);
 3343         __ rev32(f, __ T16B, v16);
 3344         __ addv(v16, __ T4S, v16, v8);
 3345       }
 3346 
 3347       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3348 
 3349       // Encrypt the counters
 3350       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3351 
 3352       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3353 
 3354       // XOR the encrypted counters with the inputs
 3355       for (int i = 0; i < 8; i++) {
 3356         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3357         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3358         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3359       }
 3360       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3361       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3362 
 3363       __ subw(len, len, 16 * 8);
 3364       __ cbnzw(len, L_CTR_loop);
 3365     }
 3366 
 3367     __ rev32(v16, __ T16B, v16);
 3368     __ st1(v16, __ T16B, counter);
 3369 
 3370     __ ldr(len, Address(sp));
 3371     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3372 
 3373     // GHASH/CTR loop
 3374     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3375                                 len, /*unrolls*/4);
 3376 
 3377 #ifdef ASSERT
 3378     { Label L;
 3379       __ cmp(len, (unsigned char)0);
 3380       __ br(Assembler::EQ, L);
 3381       __ stop("stubGenerator: abort");
 3382       __ bind(L);
 3383     }
 3384 #endif
 3385
 3386     __ bind(DONE);
 3387     // Return the number of bytes processed
 3388     __ ldr(r0, __ post(sp, 2 * wordSize));
 3389 
 3390     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3391     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3392 
 3393     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3394     __ ret(lr);
 3395     return start;
 3396   }
 3397 
 3398   class Cached64Bytes {
 3399   private:
 3400     MacroAssembler *_masm;
 3401     Register _regs[8];
 3402 
 3403   public:
 3404     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 3405       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3406       auto it = rs.begin();
 3407       for (auto &r: _regs) {
 3408         r = *it;
 3409         ++it;
 3410       }
 3411     }
 3412 
 3413     void gen_loads(Register base) {
 3414       for (int i = 0; i < 8; i += 2) {
 3415         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3416       }
 3417     }
 3418 
 3419     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3420     void extract_u32(Register dest, int i) {
 3421       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3422     }
 3423   };
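
        // Typical use of Cached64Bytes (as in the MD5 stub below): load one
        // 64-byte block into eight GP registers up front, then pull out the
        // sixteen 32-bit words on demand:
        //
        //   Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);
        //   reg_cache.gen_loads(buf);             // four ldp, 64 bytes from buf
        //   reg_cache.extract_u32(rscratch1, k);  // k-th word, 0 <= k < 16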
 3424 
 3425   // Utility routines for MD5.
 3426   // Clobbers r10 and r11.
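        // For reference, each helper performs one MD5 step of the form
        //
        //   r1 = r2 + rol(r1 + f(r2, r3, r4) + X[k] + t, s)
        //
        // where f is, respectively:
        //   FF: f(x, y, z) = (x & y) | (~x & z)    (computed as ((y ^ z) & x) ^ z)
        //   GG: f(x, y, z) = (x & z) | (y & ~z)
        //   HH: f(x, y, z) = x ^ y ^ z
        //   II: f(x, y, z) = y ^ (x | ~z)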
 3427   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3428               int k, int s, int t) {
 3429     Register rscratch3 = r10;
 3430     Register rscratch4 = r11;
 3431 
 3432     __ eorw(rscratch3, r3, r4);
 3433     __ movw(rscratch2, t);
 3434     __ andw(rscratch3, rscratch3, r2);
 3435     __ addw(rscratch4, r1, rscratch2);
 3436     reg_cache.extract_u32(rscratch1, k);
 3437     __ eorw(rscratch3, rscratch3, r4);
 3438     __ addw(rscratch4, rscratch4, rscratch1);
 3439     __ addw(rscratch3, rscratch3, rscratch4);
 3440     __ rorw(rscratch2, rscratch3, 32 - s);
 3441     __ addw(r1, rscratch2, r2);
 3442   }
 3443 
 3444   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3445               int k, int s, int t) {
 3446     Register rscratch3 = r10;
 3447     Register rscratch4 = r11;
 3448 
 3449     reg_cache.extract_u32(rscratch1, k);
 3450     __ movw(rscratch2, t);
 3451     __ addw(rscratch4, r1, rscratch2);
 3452     __ addw(rscratch4, rscratch4, rscratch1);
 3453     __ bicw(rscratch2, r3, r4);
 3454     __ andw(rscratch3, r2, r4);
 3455     __ addw(rscratch2, rscratch2, rscratch4);
 3456     __ addw(rscratch2, rscratch2, rscratch3);
 3457     __ rorw(rscratch2, rscratch2, 32 - s);
 3458     __ addw(r1, rscratch2, r2);
 3459   }
 3460 
 3461   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3462               int k, int s, int t) {
 3463     Register rscratch3 = r10;
 3464     Register rscratch4 = r11;
 3465 
 3466     __ eorw(rscratch3, r3, r4);
 3467     __ movw(rscratch2, t);
 3468     __ addw(rscratch4, r1, rscratch2);
 3469     reg_cache.extract_u32(rscratch1, k);
 3470     __ eorw(rscratch3, rscratch3, r2);
 3471     __ addw(rscratch4, rscratch4, rscratch1);
 3472     __ addw(rscratch3, rscratch3, rscratch4);
 3473     __ rorw(rscratch2, rscratch3, 32 - s);
 3474     __ addw(r1, rscratch2, r2);
 3475   }
 3476 
 3477   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3478               int k, int s, int t) {
 3479     Register rscratch3 = r10;
 3480     Register rscratch4 = r11;
 3481 
 3482     __ movw(rscratch3, t);
 3483     __ ornw(rscratch2, r2, r4);
 3484     __ addw(rscratch4, r1, rscratch3);
 3485     reg_cache.extract_u32(rscratch1, k);
 3486     __ eorw(rscratch3, rscratch2, r3);
 3487     __ addw(rscratch4, rscratch4, rscratch1);
 3488     __ addw(rscratch3, rscratch3, rscratch4);
 3489     __ rorw(rscratch2, rscratch3, 32 - s);
 3490     __ addw(r1, rscratch2, r2);
 3491   }
 3492 
 3493   // Arguments:
 3494   //
 3495   // Inputs:
 3496   //   c_rarg0   - byte[]  source+offset
 3497   //   c_rarg1   - int[]   MD5.state
 3498   //   c_rarg2   - int     offset
 3499   //   c_rarg3   - int     limit
 3500   //
 3501   address generate_md5_implCompress(StubGenStubId stub_id) {
 3502     bool multi_block;
 3503     switch (stub_id) {
 3504     case md5_implCompress_id:
 3505       multi_block = false;
 3506       break;
 3507     case md5_implCompressMB_id:
 3508       multi_block = true;
 3509       break;
 3510     default:
 3511       ShouldNotReachHere();
 3512     }
 3513     __ align(CodeEntryAlignment);
 3514 
 3515     StubCodeMark mark(this, stub_id);
 3516     address start = __ pc();
 3517 
 3518     Register buf       = c_rarg0;
 3519     Register state     = c_rarg1;
 3520     Register ofs       = c_rarg2;
 3521     Register limit     = c_rarg3;
 3522     Register a         = r4;
 3523     Register b         = r5;
 3524     Register c         = r6;
 3525     Register d         = r7;
 3526     Register rscratch3 = r10;
 3527     Register rscratch4 = r11;
 3528 
 3529     Register state_regs[2] = { r12, r13 };
 3530     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3531     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3532 
 3533     __ push(saved_regs, sp);
 3534 
 3535     __ ldp(state_regs[0], state_regs[1], Address(state));
 3536     __ ubfx(a, state_regs[0],  0, 32);
 3537     __ ubfx(b, state_regs[0], 32, 32);
 3538     __ ubfx(c, state_regs[1],  0, 32);
 3539     __ ubfx(d, state_regs[1], 32, 32);
 3540 
 3541     Label md5_loop;
 3542     __ BIND(md5_loop);
 3543 
 3544     reg_cache.gen_loads(buf);
 3545 
 3546     // Round 1
 3547     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3548     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3549     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3550     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3551     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3552     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3553     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3554     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3555     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3556     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3557     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3558     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3559     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3560     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3561     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3562     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3563 
 3564     // Round 2
 3565     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3566     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3567     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3568     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3569     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3570     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3571     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3572     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3573     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3574     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3575     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3576     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3577     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3578     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3579     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3580     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3581 
 3582     // Round 3
 3583     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3584     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3585     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3586     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3587     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3588     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3589     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3590     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3591     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3592     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3593     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3594     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3595     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3596     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3597     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3598     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3599 
 3600     // Round 4
 3601     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3602     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3603     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3604     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3605     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3606     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3607     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3608     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3609     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3610     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3611     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3612     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3613     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3614     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3615     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3616     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3617 
 3618     __ addw(a, state_regs[0], a);
 3619     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3620     __ addw(b, rscratch2, b);
 3621     __ addw(c, state_regs[1], c);
 3622     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3623     __ addw(d, rscratch4, d);
 3624 
 3625     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3626     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3627 
 3628     if (multi_block) {
 3629       __ add(buf, buf, 64);
 3630       __ add(ofs, ofs, 64);
 3631       __ cmp(ofs, limit);
 3632       __ br(Assembler::LE, md5_loop);
 3633       __ mov(c_rarg0, ofs); // return ofs
 3634     }
 3635 
 3636     // write hash values back in the correct order
 3637     __ stp(state_regs[0], state_regs[1], Address(state));
 3638 
 3639     __ pop(saved_regs, sp);
 3640 
 3641     __ ret(lr);
 3642 
 3643     return start;
 3644   }
 3645 
 3646   // Arguments:
 3647   //
 3648   // Inputs:
 3649   //   c_rarg0   - byte[]  source+offset
 3650   //   c_rarg1   - int[]   SHA.state
 3651   //   c_rarg2   - int     offset
 3652   //   c_rarg3   - int     limit
 3653   //
 3654   address generate_sha1_implCompress(StubGenStubId stub_id) {
 3655     bool multi_block;
 3656     switch (stub_id) {
 3657     case sha1_implCompress_id:
 3658       multi_block = false;
 3659       break;
 3660     case sha1_implCompressMB_id:
 3661       multi_block = true;
 3662       break;
 3663     default:
 3664       ShouldNotReachHere();
 3665     }
 3666 
 3667     __ align(CodeEntryAlignment);
 3668 
 3669     StubCodeMark mark(this, stub_id);
 3670     address start = __ pc();
 3671 
 3672     Register buf   = c_rarg0;
 3673     Register state = c_rarg1;
 3674     Register ofs   = c_rarg2;
 3675     Register limit = c_rarg3;
 3676 
 3677     Label keys;
 3678     Label sha1_loop;
 3679 
 3680     // load the keys into v0..v3
 3681     __ adr(rscratch1, keys);
 3682     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3683     // load 5 words state into v6, v7
 3684     __ ldrq(v6, Address(state, 0));
 3685     __ ldrs(v7, Address(state, 16));
 3686 
 3687 
 3688     __ BIND(sha1_loop);
 3689     // load 64 bytes of data into v16..v19
 3690     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3691     __ rev32(v16, __ T16B, v16);
 3692     __ rev32(v17, __ T16B, v17);
 3693     __ rev32(v18, __ T16B, v18);
 3694     __ rev32(v19, __ T16B, v19);
 3695 
 3696     // do the sha1
 3697     __ addv(v4, __ T4S, v16, v0);
 3698     __ orr(v20, __ T16B, v6, v6);
 3699 
 3700     FloatRegister d0 = v16;
 3701     FloatRegister d1 = v17;
 3702     FloatRegister d2 = v18;
 3703     FloatRegister d3 = v19;
 3704 
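          // Each iteration handles four of SHA-1's 80 rounds. The round
          // function is Ch for rounds 0..19 (sha1c), Parity for 20..39 and
          // 60..79 (sha1p), and Maj for 40..59 (sha1m), which is what the
          // selection on 'round' below implements.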
 3705     for (int round = 0; round < 20; round++) {
 3706       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3707       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3708       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3709       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3710       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3711 
 3712       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3713       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3714       __ sha1h(tmp2, __ T4S, v20);
 3715       if (round < 5)
 3716         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3717       else if (round < 10 || round >= 15)
 3718         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3719       else
 3720         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3721       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3722 
 3723       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3724     }
 3725 
 3726     __ addv(v7, __ T2S, v7, v21);
 3727     __ addv(v6, __ T4S, v6, v20);
 3728 
 3729     if (multi_block) {
 3730       __ add(ofs, ofs, 64);
 3731       __ cmp(ofs, limit);
 3732       __ br(Assembler::LE, sha1_loop);
 3733       __ mov(c_rarg0, ofs); // return ofs
 3734     }
 3735 
 3736     __ strq(v6, Address(state, 0));
 3737     __ strs(v7, Address(state, 16));
 3738 
 3739     __ ret(lr);
 3740 
 3741     __ bind(keys);
 3742     __ emit_int32(0x5a827999);
 3743     __ emit_int32(0x6ed9eba1);
 3744     __ emit_int32(0x8f1bbcdc);
 3745     __ emit_int32(0xca62c1d6);
 3746 
 3747     return start;
 3748   }
 3749 
 3750 
 3751   // Arguments:
 3752   //
 3753   // Inputs:
 3754   //   c_rarg0   - byte[]  source+offset
 3755   //   c_rarg1   - int[]   SHA.state
 3756   //   c_rarg2   - int     offset
 3757   //   c_rarg3   - int     limit
 3758   //
 3759   address generate_sha256_implCompress(StubGenStubId stub_id) {
 3760     bool multi_block;
 3761     switch (stub_id) {
 3762     case sha256_implCompress_id:
 3763       multi_block = false;
 3764       break;
 3765     case sha256_implCompressMB_id:
 3766       multi_block = true;
 3767       break;
 3768     default:
 3769       ShouldNotReachHere();
 3770     }
 3771 
 3772     static const uint32_t round_consts[64] = {
 3773       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3774       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3775       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3776       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3777       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3778       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3779       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3780       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3781       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3782       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3783       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3784       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3785       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3786       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3787       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3788       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3789     };
 3790 
 3791     __ align(CodeEntryAlignment);
 3792 
 3793     StubCodeMark mark(this, stub_id);
 3794     address start = __ pc();
 3795 
 3796     Register buf   = c_rarg0;
 3797     Register state = c_rarg1;
 3798     Register ofs   = c_rarg2;
 3799     Register limit = c_rarg3;
 3800 
 3801     Label sha1_loop;
 3802 
 3803     __ stpd(v8, v9, __ pre(sp, -32));
 3804     __ stpd(v10, v11, Address(sp, 16));
 3805 
 3806     // dga == v0
 3807     // dgb == v1
 3808     // dg0 == v2
 3809     // dg1 == v3
 3810     // dg2 == v4
 3811     // t0 == v6
 3812     // t1 == v7
 3813 
 3814     // load 16 keys to v16..v31
 3815     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3816     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3817     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3818     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3819     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3820 
 3821     // load 8 words (256 bits) state
 3822     __ ldpq(v0, v1, state);
 3823 
 3824     __ BIND(sha1_loop);
 3825     // load 64 bytes of data into v8..v11
 3826     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3827     __ rev32(v8, __ T16B, v8);
 3828     __ rev32(v9, __ T16B, v9);
 3829     __ rev32(v10, __ T16B, v10);
 3830     __ rev32(v11, __ T16B, v11);
 3831 
 3832     __ addv(v6, __ T4S, v8, v16);
 3833     __ orr(v2, __ T16B, v0, v0);
 3834     __ orr(v3, __ T16B, v1, v1);
 3835 
 3836     FloatRegister d0 = v8;
 3837     FloatRegister d1 = v9;
 3838     FloatRegister d2 = v10;
 3839     FloatRegister d3 = v11;
 3840 
 3841 
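          // 16 iterations x 4 rounds each give the 64 SHA-256 rounds:
          // sha256h/sha256h2 update the two halves of the working state, and
          // sha256su0/sha256su1 expand the message schedule during the first
          // 12 iterations (producing the remaining 48 of the 64 W words).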
 3842     for (int round = 0; round < 16; round++) {
 3843       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 3844       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 3845       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 3846       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 3847 
 3848       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 3849       __ orr(v4, __ T16B, v2, v2);
 3850       if (round < 15)
 3851         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 3852       __ sha256h(v2, __ T4S, v3, tmp2);
 3853       __ sha256h2(v3, __ T4S, v4, tmp2);
 3854       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 3855 
 3856       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3857     }
 3858 
 3859     __ addv(v0, __ T4S, v0, v2);
 3860     __ addv(v1, __ T4S, v1, v3);
 3861 
 3862     if (multi_block) {
 3863       __ add(ofs, ofs, 64);
 3864       __ cmp(ofs, limit);
 3865       __ br(Assembler::LE, sha1_loop);
 3866       __ mov(c_rarg0, ofs); // return ofs
 3867     }
 3868 
 3869     __ ldpd(v10, v11, Address(sp, 16));
 3870     __ ldpd(v8, v9, __ post(sp, 32));
 3871 
 3872     __ stpq(v0, v1, state);
 3873 
 3874     __ ret(lr);
 3875 
 3876     return start;
 3877   }
 3878 
 3879   // Double rounds for sha512.
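        // For reference, each call performs two rounds of the standard SHA-512
        // compression,
        //
        //   T1 = h + Sigma1(e) + Ch(e, f, g) + K[t] + W[t]
        //   T2 = Sigma0(a) + Maj(a, b, c)
        //
        // folded into the sha512h/sha512h2 instructions. sha512su0/sha512su1
        // extend the message schedule for double-rounds 0..31, and
        // double-rounds 0..35 prefetch the next pair of round constants into
        // vrc1.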
 3880   void sha512_dround(int dr,
 3881                      FloatRegister vi0, FloatRegister vi1,
 3882                      FloatRegister vi2, FloatRegister vi3,
 3883                      FloatRegister vi4, FloatRegister vrc0,
 3884                      FloatRegister vrc1, FloatRegister vin0,
 3885                      FloatRegister vin1, FloatRegister vin2,
 3886                      FloatRegister vin3, FloatRegister vin4) {
 3887       if (dr < 36) {
 3888         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 3889       }
 3890       __ addv(v5, __ T2D, vrc0, vin0);
 3891       __ ext(v6, __ T16B, vi2, vi3, 8);
 3892       __ ext(v5, __ T16B, v5, v5, 8);
 3893       __ ext(v7, __ T16B, vi1, vi2, 8);
 3894       __ addv(vi3, __ T2D, vi3, v5);
 3895       if (dr < 32) {
 3896         __ ext(v5, __ T16B, vin3, vin4, 8);
 3897         __ sha512su0(vin0, __ T2D, vin1);
 3898       }
 3899       __ sha512h(vi3, __ T2D, v6, v7);
 3900       if (dr < 32) {
 3901         __ sha512su1(vin0, __ T2D, vin2, v5);
 3902       }
 3903       __ addv(vi4, __ T2D, vi1, vi3);
 3904       __ sha512h2(vi3, __ T2D, vi1, vi0);
 3905   }
 3906 
 3907   // Arguments:
 3908   //
 3909   // Inputs:
 3910   //   c_rarg0   - byte[]  source+offset
 3911   //   c_rarg1   - int[]   SHA.state
 3912   //   c_rarg2   - int     offset
 3913   //   c_rarg3   - int     limit
 3914   //
 3915   address generate_sha512_implCompress(StubGenStubId stub_id) {
 3916     bool multi_block;
 3917     switch (stub_id) {
 3918     case sha512_implCompress_id:
 3919       multi_block = false;
 3920       break;
 3921     case sha512_implCompressMB_id:
 3922       multi_block = true;
 3923       break;
 3924     default:
 3925       ShouldNotReachHere();
 3926     }
 3927 
 3928     static const uint64_t round_consts[80] = {
 3929       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 3930       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 3931       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 3932       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 3933       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 3934       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 3935       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 3936       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 3937       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 3938       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 3939       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 3940       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 3941       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 3942       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 3943       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 3944       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 3945       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 3946       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 3947       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 3948       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 3949       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 3950       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 3951       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 3952       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 3953       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 3954       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 3955       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 3956     };
 3957 
 3958     __ align(CodeEntryAlignment);
 3959 
 3960     StubCodeMark mark(this, stub_id);
 3961     address start = __ pc();
 3962 
 3963     Register buf   = c_rarg0;
 3964     Register state = c_rarg1;
 3965     Register ofs   = c_rarg2;
 3966     Register limit = c_rarg3;
 3967 
 3968     __ stpd(v8, v9, __ pre(sp, -64));
 3969     __ stpd(v10, v11, Address(sp, 16));
 3970     __ stpd(v12, v13, Address(sp, 32));
 3971     __ stpd(v14, v15, Address(sp, 48));
 3972 
 3973     Label sha512_loop;
 3974 
 3975     // load state
 3976     __ ld1(v8, v9, v10, v11, __ T2D, state);
 3977 
 3978     // load first 4 round constants
 3979     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3980     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 3981 
 3982     __ BIND(sha512_loop);
 3983     // load 128B of data into v12..v19
 3984     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 3985     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 3986     __ rev64(v12, __ T16B, v12);
 3987     __ rev64(v13, __ T16B, v13);
 3988     __ rev64(v14, __ T16B, v14);
 3989     __ rev64(v15, __ T16B, v15);
 3990     __ rev64(v16, __ T16B, v16);
 3991     __ rev64(v17, __ T16B, v17);
 3992     __ rev64(v18, __ T16B, v18);
 3993     __ rev64(v19, __ T16B, v19);
 3994 
 3995     __ mov(rscratch2, rscratch1);
 3996 
 3997     __ mov(v0, __ T16B, v8);
 3998     __ mov(v1, __ T16B, v9);
 3999     __ mov(v2, __ T16B, v10);
 4000     __ mov(v3, __ T16B, v11);
 4001 
 4002     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4003     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4004     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4005     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4006     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4007     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4008     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4009     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4010     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4011     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4012     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4013     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4014     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4015     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4016     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4017     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4018     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4019     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4020     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4021     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4022     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4023     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4024     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4025     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4026     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4027     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4028     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4029     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4030     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4031     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4032     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4033     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4034     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4035     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4036     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4037     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4038     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4039     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4040     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4041     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4042 
 4043     __ addv(v8, __ T2D, v8, v0);
 4044     __ addv(v9, __ T2D, v9, v1);
 4045     __ addv(v10, __ T2D, v10, v2);
 4046     __ addv(v11, __ T2D, v11, v3);
 4047 
 4048     if (multi_block) {
 4049       __ add(ofs, ofs, 128);
 4050       __ cmp(ofs, limit);
 4051       __ br(Assembler::LE, sha512_loop);
 4052       __ mov(c_rarg0, ofs); // return ofs
 4053     }
 4054 
 4055     __ st1(v8, v9, v10, v11, __ T2D, state);
 4056 
 4057     __ ldpd(v14, v15, Address(sp, 48));
 4058     __ ldpd(v12, v13, Address(sp, 32));
 4059     __ ldpd(v10, v11, Address(sp, 16));
 4060     __ ldpd(v8, v9, __ post(sp, 64));
 4061 
 4062     __ ret(lr);
 4063 
 4064     return start;
 4065   }
 4066 
 4067   // Execute one round of keccak of two computations in parallel.
 4068   // One of the states should be loaded into the lower halves of
 4069   // the vector registers v0-v24, the other should be loaded into
 4070   // the upper halves of those registers. The ld1r instruction loads
 4071   // the round constant into both halves of register v31.
 4072   // Intermediate results c0...c5 and d0...d5 are computed
 4073   // in registers v25...v30.
 4074   // All vector instructions that are used operate on both register
 4075   // halves in parallel.
 4076   // If only a single computation is needed, one can only load the lower halves.
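        // For reference, one Keccak-f[1600] round over the 5x5 lane state
        // a[x,y] (a0..a24 below, indices mod 5) is:
        //
        //   theta: c[x] = a[x,0] ^ a[x,1] ^ a[x,2] ^ a[x,3] ^ a[x,4]
        //          d[x] = c[x-1] ^ rol(c[x+1], 1)
        //          a[x,y] ^= d[x]
        //   rho/pi: lanes are rotated and permuted (the xar instructions)
        //   chi:    a[x,y] ^= ~a[x+1,y] & a[x+2,y]
        //   iota:   a[0,0] ^= round_constant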
 4077   void keccak_round(Register rscratch1) {
 4078   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
 4079   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
 4080   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4081   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4082   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4083   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4084   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4085   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4086   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4087   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4088 
 4089   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4090   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4091   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4092   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4093   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4094 
 4095   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4096   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
 4097   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4098   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4099   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4100   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4101   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4102   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4103   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4104   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4105   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4106   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4107   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4108   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4109   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4110   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4111   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4112   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4113   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4114   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4115   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4116   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4117   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4118   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4119   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4120 
 4121   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4122   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4123   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4124   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4125   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4126 
 4127   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4128 
 4129   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4130   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4131   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4132   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4133   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4134 
 4135   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4136   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4137   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4138   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4139   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4140 
 4141   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4142   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4143   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4144   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4145   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4146 
 4147   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4148   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4149   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4150   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4151   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4152 
 4153   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4154   }
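
        // For reference only (not used by any stub): a minimal scalar sketch of a
        // single Keccak-f[1600] round, showing the theta/rho-pi/chi/iota structure
        // that keccak_round() above spreads across vector registers. Lane i below
        // corresponds to the aN names in the comments above (i = x + 5*y); the
        // helper name and its use are illustrative assumptions, not stub code.
        static void keccak_round_reference(uint64_t a[25], uint64_t rc) {
          // rho rotation amount for each source lane
          static const int rho[25] = {  0,  1, 62, 28, 27,
                                       36, 44,  6, 55, 20,
                                        3, 10, 43, 25, 39,
                                       41, 45, 15, 21,  8,
                                       18,  2, 61, 56, 14 };
          uint64_t c[5], d[5], b[25];
          for (int x = 0; x < 5; x++) {              // theta: column parities
            c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
          }
          for (int x = 0; x < 5; x++) {
            uint64_t t = c[(x + 1) % 5];
            d[x] = c[(x + 4) % 5] ^ ((t << 1) | (t >> 63));
          }
          for (int y = 0; y < 5; y++) {              // rho and pi: rotate and permute
            for (int x = 0; x < 5; x++) {
              int i = x + 5 * y;                     // source lane
              int j = y + 5 * ((2 * x + 3 * y) % 5); // destination lane
              uint64_t t = a[i] ^ d[x];
              b[j] = (rho[i] == 0) ? t : ((t << rho[i]) | (t >> (64 - rho[i])));
            }
          }
          for (int y = 0; y < 5; y++) {              // chi, then iota on lane 0
            for (int x = 0; x < 5; x++) {
              a[x + 5 * y] = b[x + 5 * y]
                             ^ (~b[(x + 1) % 5 + 5 * y] & b[(x + 2) % 5 + 5 * y]);
            }
          }
          a[0] ^= rc;
        }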
 4155 
 4156   // Arguments:
 4157   //
 4158   // Inputs:
 4159   //   c_rarg0   - byte[]  source+offset
 4160   //   c_rarg1   - byte[]  SHA.state
 4161   //   c_rarg2   - int     block_size
 4162   //   c_rarg3   - int     offset
 4163   //   c_rarg4   - int     limit
 4164   //
 4165   address generate_sha3_implCompress(StubGenStubId stub_id) {
 4166     bool multi_block;
 4167     switch (stub_id) {
 4168     case sha3_implCompress_id:
 4169       multi_block = false;
 4170       break;
 4171     case sha3_implCompressMB_id:
 4172       multi_block = true;
 4173       break;
 4174     default:
 4175       ShouldNotReachHere();
 4176     }
 4177 
 4178     static const uint64_t round_consts[24] = {
 4179       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4180       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4181       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4182       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4183       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4184       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4185       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4186       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4187     };
 4188 
 4189     __ align(CodeEntryAlignment);
 4190 
 4191     StubCodeMark mark(this, stub_id);
 4192     address start = __ pc();
 4193 
 4194     Register buf           = c_rarg0;
 4195     Register state         = c_rarg1;
 4196     Register block_size    = c_rarg2;
 4197     Register ofs           = c_rarg3;
 4198     Register limit         = c_rarg4;
 4199 
 4200     Label sha3_loop, rounds24_loop;
 4201     Label sha3_512_or_sha3_384, shake128;
 4202 
 4203     __ stpd(v8, v9, __ pre(sp, -64));
 4204     __ stpd(v10, v11, Address(sp, 16));
 4205     __ stpd(v12, v13, Address(sp, 32));
 4206     __ stpd(v14, v15, Address(sp, 48));
 4207 
 4208     // load state
 4209     __ add(rscratch1, state, 32);
 4210     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4211     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4212     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4213     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4214     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4215     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4216     __ ld1(v24, __ T1D, rscratch1);
 4217 
 4218     __ BIND(sha3_loop);
 4219 
 4220     // 24 keccak rounds
 4221     __ movw(rscratch2, 24);
 4222 
 4223     // load round_constants base
 4224     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4225 
 4226     // load input
 4227     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4228     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4229     __ eor(v0, __ T8B, v0, v25);
 4230     __ eor(v1, __ T8B, v1, v26);
 4231     __ eor(v2, __ T8B, v2, v27);
 4232     __ eor(v3, __ T8B, v3, v28);
 4233     __ eor(v4, __ T8B, v4, v29);
 4234     __ eor(v5, __ T8B, v5, v30);
 4235     __ eor(v6, __ T8B, v6, v31);
 4236 
 4237     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4238     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4239 
 4240     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4241     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4242     __ eor(v7, __ T8B, v7, v25);
 4243     __ eor(v8, __ T8B, v8, v26);
 4244     __ eor(v9, __ T8B, v9, v27);
 4245     __ eor(v10, __ T8B, v10, v28);
 4246     __ eor(v11, __ T8B, v11, v29);
 4247     __ eor(v12, __ T8B, v12, v30);
 4248     __ eor(v13, __ T8B, v13, v31);
 4249 
 4250     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4251     __ eor(v14, __ T8B, v14, v25);
 4252     __ eor(v15, __ T8B, v15, v26);
 4253     __ eor(v16, __ T8B, v16, v27);
 4254 
 4255     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4256     __ andw(c_rarg5, block_size, 48);
 4257     __ cbzw(c_rarg5, rounds24_loop);
 4258 
 4259     __ tbnz(block_size, 5, shake128);
 4260     // block_size == 144, bit5 == 0, SHA3-224
 4261     __ ldrd(v28, __ post(buf, 8));
 4262     __ eor(v17, __ T8B, v17, v28);
 4263     __ b(rounds24_loop);
 4264 
 4265     __ BIND(shake128);
 4266     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4267     __ eor(v17, __ T8B, v17, v28);
 4268     __ eor(v18, __ T8B, v18, v29);
 4269     __ eor(v19, __ T8B, v19, v30);
 4270     __ eor(v20, __ T8B, v20, v31);
 4271     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4272 
 4273     __ BIND(sha3_512_or_sha3_384);
 4274     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4275     __ eor(v7, __ T8B, v7, v25);
 4276     __ eor(v8, __ T8B, v8, v26);
 4277     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4278 
 4279     // SHA3-384
 4280     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4281     __ eor(v9,  __ T8B, v9,  v27);
 4282     __ eor(v10, __ T8B, v10, v28);
 4283     __ eor(v11, __ T8B, v11, v29);
 4284     __ eor(v12, __ T8B, v12, v30);
 4285 
 4286     __ BIND(rounds24_loop);
 4287     __ subw(rscratch2, rscratch2, 1);
 4288 
 4289     keccak_round(rscratch1);
 4290 
 4291     __ cbnzw(rscratch2, rounds24_loop);
 4292 
 4293     if (multi_block) {
 4294       __ add(ofs, ofs, block_size);
 4295       __ cmp(ofs, limit);
 4296       __ br(Assembler::LE, sha3_loop);
 4297       __ mov(c_rarg0, ofs); // return ofs
 4298     }
 4299 
 4300     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4301     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4302     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4303     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4304     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4305     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4306     __ st1(v24, __ T1D, state);
 4307 
 4308     // restore callee-saved registers
 4309     __ ldpd(v14, v15, Address(sp, 48));
 4310     __ ldpd(v12, v13, Address(sp, 32));
 4311     __ ldpd(v10, v11, Address(sp, 16));
 4312     __ ldpd(v8, v9, __ post(sp, 64));
 4313 
 4314     __ ret(lr);
 4315 
 4316     return start;
 4317   }
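
        // For reference only: the rates (block sizes in bytes) that the bit tests in
        // generate_sha3_implCompress() above discriminate between. The rate of a
        // SHA3-d digest is 200 - 2*(d/8) bytes; the enum and its names are
        // illustrative assumptions and are not used elsewhere.
        enum Sha3Rate {
          sha3_512_rate = 72,    // bit 7 clear, bit 5 clear
          sha3_384_rate = 104,   // bit 7 clear, bit 5 set
          sha3_256_rate = 136,   // bit 7 set, bits 4 and 5 clear (same rate as SHAKE256)
          sha3_224_rate = 144,   // bit 7 set, bit 4 set, bit 5 clear
          shake128_rate = 168    // bit 7 set, bit 5 set
        };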
 4318 
 4319   // Inputs:
 4320   //   c_rarg0   - long[]  state0
 4321   //   c_rarg1   - long[]  state1
 4322   address generate_double_keccak() {
 4323     static const uint64_t round_consts[24] = {
 4324       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4325       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4326       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4327       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4328       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4329       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4330       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4331       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4332     };
 4333 
 4334     // Implements the double_keccak() method of the
 4335     // sun.security.provider.SHA3Parallel class
 4336     __ align(CodeEntryAlignment);
 4337     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4338     address start = __ pc();
 4339     __ enter();
 4340 
 4341     Register state0        = c_rarg0;
 4342     Register state1        = c_rarg1;
 4343 
 4344     Label rounds24_loop;
 4345 
 4346     // save callee-saved registers
 4347     __ stpd(v8, v9, __ pre(sp, -64));
 4348     __ stpd(v10, v11, Address(sp, 16));
 4349     __ stpd(v12, v13, Address(sp, 32));
 4350     __ stpd(v14, v15, Address(sp, 48));
 4351 
 4352     // load states
 4353     __ add(rscratch1, state0, 32);
 4354     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4355     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4356     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4357     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4358     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4359     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4360     __ ld1(v24, __ D, 0, rscratch1);
 4361     __ add(rscratch1, state1, 32);
 4362     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4363     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4364     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4365     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4366     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4367     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4368     __ ld1(v24, __ D, 1, rscratch1);
 4369 
 4370     // 24 keccak rounds
 4371     __ movw(rscratch2, 24);
 4372 
 4373     // load round_constants base
 4374     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4375 
 4376     __ BIND(rounds24_loop);
 4377     __ subw(rscratch2, rscratch2, 1);
 4378     keccak_round(rscratch1);
 4379     __ cbnzw(rscratch2, rounds24_loop);
 4380 
 4381     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4382     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4383     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4384     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4385     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4386     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4387     __ st1(v24, __ D, 0, state0);
 4388     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4389     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4390     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4391     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4392     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4393     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4394     __ st1(v24, __ D, 1, state1);
 4395 
 4396     // restore callee-saved vector registers
 4397     __ ldpd(v14, v15, Address(sp, 48));
 4398     __ ldpd(v12, v13, Address(sp, 32));
 4399     __ ldpd(v10, v11, Address(sp, 16));
 4400     __ ldpd(v8, v9, __ post(sp, 64));
 4401 
 4402     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4403     __ mov(r0, zr); // return 0
 4404     __ ret(lr);
 4405 
 4406     return start;
 4407   }
 4408 
 4409   // ChaCha20 block function.  This version parallelizes the 32-bit
 4410   // state elements on each of 16 vectors, producing 4 blocks of
 4411   // keystream at a time.
 4412   //
 4413   // state (int[16]) = c_rarg0
 4414   // keystream (byte[256]) = c_rarg1
 4415   // return - number of bytes of produced keystream (always 256)
 4416   //
 4417   // This implementation takes each 32-bit integer from the state
 4418   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4419   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4420   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4421   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4422   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4423   // operations represented by one QUARTERROUND function, we instead stack all
 4424   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4425   // and then do the same for the second set of 4 quarter rounds.  This removes
 4426   // some latency that would otherwise be incurred by waiting for an add to
 4427   // complete before performing an xor (which depends on the result of the
 4428   // add), etc. An adjustment happens between the first and second groups of 4
 4429   // quarter rounds, but this is done only in the inputs to the macro functions
 4430   // that generate the assembly instructions - these adjustments themselves are
 4431   // not part of the resulting assembly.
 4432   // The 4 registers v0-v3 are used during the quarter round operations as
 4433   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4434   // registers become the vectors involved in adding the start state back onto
 4435   // the post-QR working state.  After the adds are complete, each of the 16
 4436   // vectors write their first lane back to the keystream buffer, followed
 4437   // by the second lane from all vectors and so on.
 4438   address generate_chacha20Block_blockpar() {
 4439     Label L_twoRounds, L_cc20_const;
 4440     // The constant data is broken into two 128-bit segments to be loaded
 4441     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4442     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 4443     // The second 128 bits are a table constant used for 8-bit left rotations.
 4444     __ BIND(L_cc20_const);
 4445     __ emit_int64(0x0000000100000000UL);
 4446     __ emit_int64(0x0000000300000002UL);
 4447     __ emit_int64(0x0605040702010003UL);
 4448     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4449 
 4450     __ align(CodeEntryAlignment);
 4451     StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
 4452     StubCodeMark mark(this, stub_id);
 4453     address start = __ pc();
 4454     __ enter();
 4455 
 4456     int i, j;
 4457     const Register state = c_rarg0;
 4458     const Register keystream = c_rarg1;
 4459     const Register loopCtr = r10;
 4460     const Register tmpAddr = r11;
 4461     const FloatRegister ctrAddOverlay = v28;
 4462     const FloatRegister lrot8Tbl = v29;
 4463 
 4464     // Organize SIMD registers in an array that facilitates
 4465     // putting repetitive opcodes into loop structures.  It is
 4466     // important that each grouping of 4 registers is monotonically
 4467     // increasing to support the requirements of multi-register
 4468     // instructions (e.g. ld4r, st4, etc.)
 4469     const FloatRegister workSt[16] = {
 4470          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4471         v20, v21, v22, v23, v24, v25, v26, v27
 4472     };
 4473 
 4474     // Pull in constant data.  The first 16 bytes are the add overlay
 4475     // which is applied to the vector holding the counter (state[12]).
 4476     // The second 16 bytes is the index register for the 8-bit left
 4477     // rotation tbl instruction.
 4478     __ adr(tmpAddr, L_cc20_const);
 4479     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4480 
 4481     // Load from memory and interlace across 16 SIMD registers,
 4482     // with each word from memory being broadcast to all lanes of
 4483     // each successive SIMD register.
 4484     //      Addr(0) -> All lanes in workSt[i]
 4485     //      Addr(4) -> All lanes in workSt[i + 1], etc.
 4486     __ mov(tmpAddr, state);
 4487     for (i = 0; i < 16; i += 4) {
 4488       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4489           __ post(tmpAddr, 16));
 4490     }
 4491     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4492 
 4493     // Before entering the loop, create 5 4-register arrays.  These
 4494     // will hold the 4 registers that represent the a/b/c/d fields
 4495     // in the quarter round operation.  For instance the "b" field
 4496     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4497     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4498     // since it is part of a diagonal organization.  The aSet and scratch
 4499     // register sets are defined at declaration time because they do not change
 4500     // organization at any point during the 20-round processing.
 4501     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4502     FloatRegister bSet[4];
 4503     FloatRegister cSet[4];
 4504     FloatRegister dSet[4];
 4505     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4506 
 4507     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4508     __ mov(loopCtr, 10);
 4509     __ BIND(L_twoRounds);
 4510 
 4511     // Set to columnar organization and do the following 4 quarter-rounds:
 4512     // QUARTERROUND(0, 4, 8, 12)
 4513     // QUARTERROUND(1, 5, 9, 13)
 4514     // QUARTERROUND(2, 6, 10, 14)
 4515     // QUARTERROUND(3, 7, 11, 15)
 4516     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4517     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4518     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4519 
 4520     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4521     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4522     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4523 
 4524     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4525     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4526     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4527 
 4528     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4529     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4530     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4531 
 4532     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4533     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4534     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4535 
 4536     // Set to diagonal organization and do the next 4 quarter-rounds:
 4537     // QUARTERROUND(0, 5, 10, 15)
 4538     // QUARTERROUND(1, 6, 11, 12)
 4539     // QUARTERROUND(2, 7, 8, 13)
 4540     // QUARTERROUND(3, 4, 9, 14)
 4541     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4542     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4543     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4544 
 4545     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4546     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4547     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4548 
 4549     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4550     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4551     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4552 
 4553     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4554     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4555     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4556 
 4557     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4558     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4559     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4560 
 4561     // Decrement and iterate
 4562     __ sub(loopCtr, loopCtr, 1);
 4563     __ cbnz(loopCtr, L_twoRounds);
 4564 
 4565     __ mov(tmpAddr, state);
 4566 
 4567     // Add the starting state back to the post-loop keystream
 4568     // state.  We read/interlace the state array from memory into
 4569     // 4 registers similar to what we did in the beginning.  Then
 4570     // add the counter overlay onto workSt[12] at the end.
 4571     for (i = 0; i < 16; i += 4) {
 4572       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4573       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4574       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4575       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4576       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4577     }
 4578     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4579 
 4580     // Write working state into the keystream buffer.  This is accomplished
 4581     // by taking the lane "i" from each of the four vectors and writing
 4582     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4583     // repeating with the next 4 vectors until all 16 vectors have been used.
 4584     // Then move to the next lane and repeat the process until all lanes have
 4585     // been written.
 4586     for (i = 0; i < 4; i++) {
 4587       for (j = 0; j < 16; j += 4) {
 4588         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4589             __ post(keystream, 16));
 4590       }
 4591     }
 4592 
 4593     __ mov(r0, 256);             // Return length of output keystream
 4594     __ leave();
 4595     __ ret(lr);
 4596 
 4597     return start;
 4598   }
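
        // For reference only (not used by the stub): the scalar ChaCha20 quarter round
        // from RFC 7539 section 2.1. Each cc20_qr_add4/cc20_qr_xor4/cc20_qr_lrot4
        // bundle in generate_chacha20Block_blockpar() performs one of these four steps
        // for four quarter rounds at once, with every state word replicated across the
        // four lanes of a vector register. The helper name is an illustrative assumption.
        static void chacha20_quarter_round_reference(uint32_t& a, uint32_t& b,
                                                     uint32_t& c, uint32_t& d) {
          a += b;  d ^= a;  d = (d << 16) | (d >> 16);  // add, xor, rotate left 16
          c += d;  b ^= c;  b = (b << 12) | (b >> 20);  // add, xor, rotate left 12
          a += b;  d ^= a;  d = (d << 8)  | (d >> 24);  // add, xor, rotate left 8
          c += d;  b ^= c;  b = (b << 7)  | (b >> 25);  // add, xor, rotate left 7
        }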
 4599 
 4600   // Helpers to schedule parallel operation bundles across vector
 4601   // register sequences of size 2, 4 or 8.
 4602 
 4603   // Implement various primitive computations across vector sequences
 4604 
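        // For illustration (an assumption about typical use, not stub code): a VSeq<N>
        // names N vector registers, e.g. VSeq<4> va(0), vb(4), vc(8) covers v0..v3,
        // v4..v7 and v8..v11, and a call such as
        //   vs_addv(va, __ T8H, vb, vc);   // va[i] = vb[i] + vc[i] for i = 0..3
        // expands to one instruction per register of the sequence.
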
 4605   template<int N>
 4606   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4607                const VSeq<N>& v1, const VSeq<N>& v2) {
 4608     // output must not be constant
 4609     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4610     // output cannot overwrite pending inputs
 4611     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4612     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4613     for (int i = 0; i < N; i++) {
 4614       __ addv(v[i], T, v1[i], v2[i]);
 4615     }
 4616   }
 4617 
 4618   template<int N>
 4619   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4620                const VSeq<N>& v1, const VSeq<N>& v2) {
 4621     // output must not be constant
 4622     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4623     // output cannot overwrite pending inputs
 4624     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4625     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4626     for (int i = 0; i < N; i++) {
 4627       __ subv(v[i], T, v1[i], v2[i]);
 4628     }
 4629   }
 4630 
 4631   template<int N>
 4632   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4633                const VSeq<N>& v1, const VSeq<N>& v2) {
 4634     // output must not be constant
 4635     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4636     // output cannot overwrite pending inputs
 4637     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4638     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4639     for (int i = 0; i < N; i++) {
 4640       __ mulv(v[i], T, v1[i], v2[i]);
 4641     }
 4642   }
 4643 
 4644   template<int N>
 4645   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4646     // output must not be constant
 4647     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4648     // output cannot overwrite pending inputs
 4649     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4650     for (int i = 0; i < N; i++) {
 4651       __ negr(v[i], T, v1[i]);
 4652     }
 4653   }
 4654 
 4655   template<int N>
 4656   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4657                const VSeq<N>& v1, int shift) {
 4658     // output must not be constant
 4659     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4660     // output cannot overwrite pending inputs
 4661     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4662     for (int i = 0; i < N; i++) {
 4663       __ sshr(v[i], T, v1[i], shift);
 4664     }
 4665   }
 4666 
 4667   template<int N>
 4668   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4669     // output must not be constant
 4670     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4671     // output cannot overwrite pending inputs
 4672     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4673     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4674     for (int i = 0; i < N; i++) {
 4675       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4676     }
 4677   }
 4678 
 4679   template<int N>
 4680   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4681     // output must not be constant
 4682     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4683     // output cannot overwrite pending inputs
 4684     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4685     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4686     for (int i = 0; i < N; i++) {
 4687       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4688     }
 4689   }
 4690 
 4691   template<int N>
 4692   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4693     // output must not be constant
 4694     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4695     // output cannot overwrite pending inputs
 4696     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4697     for (int i = 0; i < N; i++) {
 4698       __ notr(v[i], __ T16B, v1[i]);
 4699     }
 4700   }
 4701 
 4702   template<int N>
 4703   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4704     // output must not be constant
 4705     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4706     // output cannot overwrite pending inputs
 4707     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4708     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4709     for (int i = 0; i < N; i++) {
 4710       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4711     }
 4712   }
 4713 
 4714   template<int N>
 4715   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4716     // output must not be constant
 4717     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4718     // output cannot overwrite pending inputs
 4719     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4720     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4721     for (int i = 0; i < N; i++) {
 4722       __ mlsv(v[i], T, v1[i], v2[i]);
 4723     }
 4724   }
 4725 
 4726   // load N/2 successive pairs of quadword values from memory in order
 4727   // into N successive vector registers of the sequence via the
 4728   // address supplied in base.
 4729   template<int N>
 4730   void vs_ldpq(const VSeq<N>& v, Register base) {
 4731     for (int i = 0; i < N; i += 2) {
 4732       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4733     }
 4734   }
 4735 
 4736   // load N/2 successive pairs of quadword values from memory in order
 4737   // into N vector registers of the sequence via the address supplied
 4738   // in base using post-increment addressing
 4739   template<int N>
 4740   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4741     static_assert((N & 1) == 0, "sequence length must be even");
 4742     for (int i = 0; i < N; i += 2) {
 4743       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4744     }
 4745   }
 4746 
 4747   // store N successive vector registers of the sequence into N/2
 4748   // successive pairs of quadword memory locations via the address
 4749   // supplied in base using post-increment addressing
 4750   template<int N>
 4751   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4752     static_assert((N & 1) == 0, "sequence length must be even");
 4753     for (int i = 0; i < N; i += 2) {
 4754       __ stpq(v[i], v[i+1], __ post(base, 32));
 4755     }
 4756   }
 4757 
 4758   // load N/2 pairs of quadword values from memory de-interleaved into
 4759   // N vector registers 2 at a time via the address supplied in base
 4760   // using post-increment addressing.
 4761   template<int N>
 4762   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4763     static_assert((N & 1) == 0, "sequence length must be even");
 4764     for (int i = 0; i < N; i += 2) {
 4765       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4766     }
 4767   }
 4768 
 4769   // store N vector registers interleaved into N/2 pairs of quadword
 4770   // memory locations via the address supplied in base using
 4771   // post-increment addressing.
 4772   template<int N>
 4773   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4774     static_assert((N & 1) == 0, "sequence length must be even");
 4775     for (int i = 0; i < N; i += 2) {
 4776       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4777     }
 4778   }
 4779 
 4780   // load N quadword values from memory de-interleaved into N vector
 4781   // registers 3 elements at a time via the address supplied in base.
 4782   template<int N>
 4783   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4784     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4785     for (int i = 0; i < N; i += 3) {
 4786       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4787     }
 4788   }
 4789 
 4790   // load N quadword values from memory de-interleaved into N vector
 4791   // registers 3 elements at a time via the address supplied in base
 4792   // using post-increment addressing.
 4793   template<int N>
 4794   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4795     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4796     for (int i = 0; i < N; i += 3) {
 4797       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4798     }
 4799   }
 4800 
 4801   // load N/2 pairs of quadword values from memory into N vector
 4802   // registers via the address supplied in base with each pair indexed
 4803   // using the start offset plus the corresponding entry in the
 4804   // offsets array
 4805   template<int N>
 4806   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4807     for (int i = 0; i < N/2; i++) {
 4808       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4809     }
 4810   }
 4811 
 4812   // store N vector registers into N/2 pairs of quadword memory
 4813   // locations via the address supplied in base with each pair indexed
 4814   // using the start offset plus the corresponding entry in the
 4815   // offsets array
 4816   template<int N>
 4817   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 4818     for (int i = 0; i < N/2; i++) {
 4819       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4820     }
 4821   }
 4822 
 4823   // load N single quadword values from memory into N vector registers
 4824   // via the address supplied in base with each value indexed using
 4825   // the start offset plus the corresponding entry in the offsets
 4826   // array
 4827   template<int N>
 4828   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4829                       int start, int (&offsets)[N]) {
 4830     for (int i = 0; i < N; i++) {
 4831       __ ldr(v[i], T, Address(base, start + offsets[i]));
 4832     }
 4833   }
 4834 
 4835   // store N vector registers into N single quadword memory locations
 4836   // via the address supplied in base with each value indexed using
 4837   // the start offset plus the corresponding entry in the offsets
 4838   // array
 4839   template<int N>
 4840   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4841                       int start, int (&offsets)[N]) {
 4842     for (int i = 0; i < N; i++) {
 4843       __ str(v[i], T, Address(base, start + offsets[i]));
 4844     }
 4845   }
 4846 
 4847   // load N/2 pairs of quadword values from memory de-interleaved into
 4848   // N vector registers 2 at a time via the address supplied in base
 4849   // with each pair indexed using the start offset plus the
 4850   // corresponding entry in the offsets array
 4851   template<int N>
 4852   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4853                       Register tmp, int start, int (&offsets)[N/2]) {
 4854     for (int i = 0; i < N/2; i++) {
 4855       __ add(tmp, base, start + offsets[i]);
 4856       __ ld2(v[2*i], v[2*i+1], T, tmp);
 4857     }
 4858   }
 4859 
 4860   // store N vector registers 2 at a time interleaved into N/2 pairs
 4861   // of quadword memory locations via the address supplied in base
 4862   // with each pair indexed using the start offset plus the
 4863   // corresponding entry in the offsets array
 4864   template<int N>
 4865   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4866                       Register tmp, int start, int (&offsets)[N/2]) {
 4867     for (int i = 0; i < N/2; i++) {
 4868       __ add(tmp, base, start + offsets[i]);
 4869       __ st2(v[2*i], v[2*i+1], T, tmp);
 4870     }
 4871   }
 4872 
 4873   // Helper routines for various flavours of Montgomery multiply
 4874 
 4875   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 4876   // multiplications in parallel
 4877   //
 4878 
 4879   // See the montMul() method of the sun.security.provider.ML_DSA
 4880   // class.
 4881   //
 4882   // Computes 4x4S results or 4x8H results
 4883   //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
 4884   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 4885   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 4886   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 4887   // Outputs: va - 4x4S or 4x8H vector register sequences
 4888   // vb, vc, vtmp and vq must all be disjoint
 4889   // va must be disjoint from all other inputs/temps or must equal vc
 4890   // va must have a non-zero delta i.e. it must not be a constant vseq.
 4891   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 4892   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 4893                    Assembler::SIMD_Arrangement T,
 4894                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 4895     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 4896     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 4897     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 4898     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 4899 
 4900     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 4901     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 4902 
 4903     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 4904 
 4905     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 4906     assert(vs_disjoint(va, vb), "va and vb overlap");
 4907     assert(vs_disjoint(va, vq), "va and vq overlap");
 4908     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 4909     assert(!va.is_constant(), "output vector must identify 4 different registers");
 4910 
 4911     // schedule 4 streams of instructions across the vector sequences
 4912     for (int i = 0; i < 4; i++) {
 4913       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 4914       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 4915     }
 4916 
 4917     for (int i = 0; i < 4; i++) {
 4918       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 4919     }
 4920 
 4921     for (int i = 0; i < 4; i++) {
 4922       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 4923     }
 4924 
 4925     for (int i = 0; i < 4; i++) {
 4926       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 4927     }
 4928   }
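
        // For reference only (not used by any stub): a scalar model of the 16-bit
        // Montgomery multiply that the 8H form of vs_montmul4()/vs_montmul2()
        // schedules across vector registers. It ignores sqdmulh saturation (which
        // only differs for the single input pair -2^15 * -2^15) and returns a value
        // congruent to b * c * 2^-16 (mod q). The names q and q_inv
        // (q_inv == q^-1 mod 2^16) are assumptions standing in for the vq constants.
        static int16_t montmul16_reference(int16_t b, int16_t c, int16_t q, int16_t q_inv) {
          int32_t prod = (int32_t)b * (int32_t)c;
          int16_t a_hi = (int16_t)(((int64_t)prod * 2) >> 16);    // sqdmulh: hi16(2 * b * c)
          int16_t a_lo = (int16_t)prod;                           // mulv:    lo16(b * c)
          int16_t m    = (int16_t)(a_lo * q_inv);                 // mulv:    m = lo16(aLow * qinv)
          int16_t n    = (int16_t)(((int64_t)m * q * 2) >> 16);   // sqdmulh: hi16(2 * m * q)
          return (int16_t)(((int32_t)a_hi - (int32_t)n) >> 1);    // shsubv:  (aHigh - n) / 2
        }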
 4929 
 4930   // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
 4931   // multiplications in parallel
 4932   //
 4933 
 4934   // See the montMul() method of the sun.security.provider.ML_DSA
 4935   // class.
 4936   //
 4937   // Computes 2x4S results or 2x8H results
 4938   //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
 4939   // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
 4940   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 4941   // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
 4942   // Outputs: va - 2x4S or 2x8H vector register sequences
 4943   // vb, vc, vtmp and vq must all be disjoint
 4944   // va must be disjoint from all other inputs/temps or must equal vc
 4945   // va must have a non-zero delta i.e. it must not be a constant vseq.
 4946   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 4947   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 4948                    Assembler::SIMD_Arrangement T,
 4949                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 4950     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 4951     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 4952     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 4953     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 4954 
 4955     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 4956     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 4957 
 4958     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 4959 
 4960     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 4961     assert(vs_disjoint(va, vb), "va and vb overlap");
 4962     assert(vs_disjoint(va, vq), "va and vq overlap");
 4963     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 4964     assert(!va.is_constant(), "output vector must identify 2 different registers");
 4965 
 4966     // schedule 2 streams of instructions across the vector sequences
 4967     for (int i = 0; i < 2; i++) {
 4968       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 4969       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 4970     }
 4971 
 4972     for (int i = 0; i < 2; i++) {
 4973       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 4974     }
 4975 
 4976     for (int i = 0; i < 2; i++) {
 4977       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 4978     }
 4979 
 4980     for (int i = 0; i < 2; i++) {
 4981       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 4982     }
 4983   }
 4984 
 4985   // Perform 16 16-bit Montgomery multiplications in parallel.
 4986   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 4987                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 4988     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 4989     // It will assert that the register use is valid
 4990     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 4991   }
 4992 
 4993   // Perform 32 16-bit Montgomery multiplications in parallel.
 4994   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 4995                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 4996     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 4997     // It will assert that the register use is valid
 4998     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 4999   }
 5000 
 5001   // Perform 64 16-bit Montgomery multiplications in parallel.
 5002   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5003                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5004     // Schedule two successive 4x8H multiplies via the montmul helper
 5005     // on the front and back halves of va, vb and vc. The helper will
 5006     // assert that the register use has no overlap conflicts on each
 5007     // individual call but we also need to ensure that the necessary
 5008     // disjoint/equality constraints are met across both calls.
 5009 
 5010     // vb, vc, vtmp and vq must be disjoint. va must either be
 5011     // disjoint from all other registers or equal vc
 5012 
 5013     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5014     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5015     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5016 
 5017     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5018     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5019 
 5020     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5021 
 5022     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5023     assert(vs_disjoint(va, vb), "va and vb overlap");
 5024     assert(vs_disjoint(va, vq), "va and vq overlap");
 5025     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5026 
 5027     // we multiply the front and back halves of each sequence 4 at a
 5028     // time because
 5029     //
 5030     // 1) we are currently only able to get 4-way instruction
 5031     // parallelism at best
 5032     //
 5033     // 2) we need registers for the constants in vq and temporary
 5034     // scratch registers to hold intermediate results so vtmp can only
 5035     // be a VSeq<4> which means we only have 4 scratch slots
 5036 
 5037     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5038     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5039   }
 5040 
 5041   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5042                                const VSeq<4>& vc,
 5043                                const VSeq<4>& vtmp,
 5044                                const VSeq<2>& vq) {
 5045     // compute a = montmul(a1, c)
 5046     kyber_montmul32(vc, va1, vc, vtmp, vq);
 5047     // output a1 = a0 - a
 5048     vs_subv(va1, __ T8H, va0, vc);
 5049     //    and a0 = a0 + a
 5050     vs_addv(va0, __ T8H, va0, vc);
 5051   }
 5052 
 5053   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5054                                const VSeq<4>& vb,
 5055                                const VSeq<4>& vtmp1,
 5056                                const VSeq<4>& vtmp2,
 5057                                const VSeq<2>& vq) {
 5058     // compute c = a0 - a1
 5059     vs_subv(vtmp1, __ T8H, va0, va1);
 5060     // output a0 = a0 + a1
 5061     vs_addv(va0, __ T8H, va0, va1);
 5062     // output a1 = b montmul c
 5063     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5064   }
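
        // For reference only: the scalar butterflies that these two helpers apply to
        // 32 coefficient pairs at a time. montmul16_reference is the hypothetical
        // scalar Montgomery model sketched earlier; zeta stands for the constant
        // loaded from the zetas array.
        static void kyber_ct_butterfly_reference(int16_t& a0, int16_t& a1, int16_t zeta,
                                                 int16_t q, int16_t q_inv) {
          // forward NTT step (Cooley-Tukey), as in kyber_montmul32_sub_add()
          int16_t t = montmul16_reference(zeta, a1, q, q_inv);
          a1 = (int16_t)(a0 - t);
          a0 = (int16_t)(a0 + t);
        }
        static void kyber_gs_butterfly_reference(int16_t& a0, int16_t& a1, int16_t zeta,
                                                 int16_t q, int16_t q_inv) {
          // inverse NTT step (Gentleman-Sande), as in kyber_sub_add_montmul32()
          int16_t t = (int16_t)(a0 - a1);
          a0 = (int16_t)(a0 + a1);
          a1 = montmul16_reference(zeta, t, q, q_inv);
        }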
 5065 
 5066   void load64shorts(const VSeq<8>& v, Register shorts) {
 5067     vs_ldpq_post(v, shorts);
 5068   }
 5069 
 5070   void load32shorts(const VSeq<4>& v, Register shorts) {
 5071     vs_ldpq_post(v, shorts);
 5072   }
 5073 
 5074   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5075     vs_stpq_post(v, tmpAddr);
 5076   }
 5077 
 5078   // Kyber NTT function.
 5079   // Implements
 5080   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5081   //
 5082   // coeffs (short[256]) = c_rarg0
 5083   // ntt_zetas (short[256]) = c_rarg1
 5084   address generate_kyberNtt() {
 5085 
 5086     __ align(CodeEntryAlignment);
 5087     StubGenStubId stub_id = StubGenStubId::kyberNtt_id;
 5088     StubCodeMark mark(this, stub_id);
 5089     address start = __ pc();
 5090     __ enter();
 5091 
 5092     const Register coeffs = c_rarg0;
 5093     const Register zetas = c_rarg1;
 5094 
 5095     const Register kyberConsts = r10;
 5096     const Register tmpAddr = r11;
 5097 
 5098     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5099     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5100     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5101 
 5102     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5103     // load the montmul constants
 5104     vs_ldpq(vq, kyberConsts);
 5105 
 5106     // Each level corresponds to an iteration of the outermost loop of the
 5107     // Java method seilerNTT(int[] coeffs). There are some differences
 5108     // from what is done in the seilerNTT() method, though:
 5109     // 1. The computation uses 16-bit signed values; we do not convert them
 5110     // to ints here.
 5111     // 2. The zetas are delivered in a bigger array: 128 zetas are stored
 5112     // for each level, which makes it easier to fill up the vector
 5113     // registers.
 5114     // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
 5115     // multiplications (so that there is no overflow during the inverse
 5116     // NTT computation); here we use R = 2^16 so that we can use 16-bit
 5117     // arithmetic in the vector unit.
 5118     //
 5119     // On each level, we fill up the vector registers in such a way that the
 5120     // array elements that need to be multiplied by the zetas go into one
 5121     // set of vector registers while the corresponding ones that don't need to
 5122     // be multiplied, go into another set.
 5123     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5124     // registers interleaving the steps of 4 identical computations,
 5125     // each done on 8 16-bit values per register.
 5126 
 5127     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5128     // to the zetas occur in discrete blocks whose size is some multiple
 5129     // of 32.
 5130 
 5131     // level 0
 5132     __ add(tmpAddr, coeffs, 256);
 5133     load64shorts(vs1, tmpAddr);
 5134     load64shorts(vs2, zetas);
 5135     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5136     __ add(tmpAddr, coeffs, 0);
 5137     load64shorts(vs1, tmpAddr);
 5138     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5139     vs_addv(vs1, __ T8H, vs1, vs2);
 5140     __ add(tmpAddr, coeffs, 0);
 5141     vs_stpq_post(vs1, tmpAddr);
 5142     __ add(tmpAddr, coeffs, 256);
 5143     vs_stpq_post(vs3, tmpAddr);
 5144     // restore montmul constants
 5145     vs_ldpq(vq, kyberConsts);
 5146     load64shorts(vs1, tmpAddr);
 5147     load64shorts(vs2, zetas);
 5148     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5149     __ add(tmpAddr, coeffs, 128);
 5150     load64shorts(vs1, tmpAddr);
 5151     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5152     vs_addv(vs1, __ T8H, vs1, vs2);
 5153     __ add(tmpAddr, coeffs, 128);
 5154     store64shorts(vs1, tmpAddr);
 5155     __ add(tmpAddr, coeffs, 384);
 5156     store64shorts(vs3, tmpAddr);
 5157 
 5158     // level 1
 5159     // restore montmul constants
 5160     vs_ldpq(vq, kyberConsts);
 5161     __ add(tmpAddr, coeffs, 128);
 5162     load64shorts(vs1, tmpAddr);
 5163     load64shorts(vs2, zetas);
 5164     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5165     __ add(tmpAddr, coeffs, 0);
 5166     load64shorts(vs1, tmpAddr);
 5167     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5168     vs_addv(vs1, __ T8H, vs1, vs2);
 5169     __ add(tmpAddr, coeffs, 0);
 5170     store64shorts(vs1, tmpAddr);
 5171     store64shorts(vs3, tmpAddr);
 5172     vs_ldpq(vq, kyberConsts);
 5173     __ add(tmpAddr, coeffs, 384);
 5174     load64shorts(vs1, tmpAddr);
 5175     load64shorts(vs2, zetas);
 5176     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5177     __ add(tmpAddr, coeffs, 256);
 5178     load64shorts(vs1, tmpAddr);
 5179     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5180     vs_addv(vs1, __ T8H, vs1, vs2);
 5181     __ add(tmpAddr, coeffs, 256);
 5182     store64shorts(vs1, tmpAddr);
 5183     store64shorts(vs3, tmpAddr);
 5184 
 5185     // level 2
 5186     vs_ldpq(vq, kyberConsts);
 5187     int offsets1[4] = { 0, 32, 128, 160 };
 5188     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5189     load64shorts(vs2, zetas);
 5190     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5191     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5192     // kyber_subv_addv64();
 5193     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5194     vs_addv(vs1, __ T8H, vs1, vs2);
 5195     __ add(tmpAddr, coeffs, 0);
 5196     vs_stpq_post(vs_front(vs1), tmpAddr);
 5197     vs_stpq_post(vs_front(vs3), tmpAddr);
 5198     vs_stpq_post(vs_back(vs1), tmpAddr);
 5199     vs_stpq_post(vs_back(vs3), tmpAddr);
 5200     vs_ldpq(vq, kyberConsts);
 5201     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5202     load64shorts(vs2, zetas);
 5203     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5204     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5205     // kyber_subv_addv64();
 5206     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5207     vs_addv(vs1, __ T8H, vs1, vs2);
 5208     __ add(tmpAddr, coeffs, 256);
 5209     vs_stpq_post(vs_front(vs1), tmpAddr);
 5210     vs_stpq_post(vs_front(vs3), tmpAddr);
 5211     vs_stpq_post(vs_back(vs1), tmpAddr);
 5212     vs_stpq_post(vs_back(vs3), tmpAddr);
 5213 
 5214     // level 3
 5215     vs_ldpq(vq, kyberConsts);
 5216     int offsets2[4] = { 0, 64, 128, 192 };
 5217     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5218     load64shorts(vs2, zetas);
 5219     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5220     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5221     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5222     vs_addv(vs1, __ T8H, vs1, vs2);
 5223     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5224     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5225 
 5226     vs_ldpq(vq, kyberConsts);
 5227     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5228     load64shorts(vs2, zetas);
 5229     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5230     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5231     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5232     vs_addv(vs1, __ T8H, vs1, vs2);
 5233     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5234     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5235 
 5236     // level 4
 5237     // At level 4 coefficients occur in 8 discrete blocks of size 16
 5238     // so they are loaded using an ldr at 8 distinct offsets.
 5239 
 5240     vs_ldpq(vq, kyberConsts);
 5241     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5242     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5243     load64shorts(vs2, zetas);
 5244     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5245     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5246     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5247     vs_addv(vs1, __ T8H, vs1, vs2);
 5248     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5249     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5250 
 5251     vs_ldpq(vq, kyberConsts);
 5252     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5253     load64shorts(vs2, zetas);
 5254     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5255     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5256     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5257     vs_addv(vs1, __ T8H, vs1, vs2);
 5258     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5259     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5260 
 5261     // level 5
 5262     // At level 5 related coefficients occur in discrete blocks of size 8, so
 5263     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5264 
 5265     vs_ldpq(vq, kyberConsts);
 5266     int offsets4[4] = { 0, 32, 64, 96 };
 5267     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5268     load32shorts(vs_front(vs2), zetas);
 5269     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5270     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5271     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5272     load32shorts(vs_front(vs2), zetas);
 5273     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5274     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5275     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5276     load32shorts(vs_front(vs2), zetas);
 5277     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5278     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5279 
 5280     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5281     load32shorts(vs_front(vs2), zetas);
 5282     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5283     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5284 
 5285     // level 6
 5286     // At level 6 related coefficients occur in discrete blocks of size 4, so
 5287     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5288 
 5289     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5290     load32shorts(vs_front(vs2), zetas);
 5291     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5292     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5293     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5294     // __ ldpq(v18, v19, __ post(zetas, 32));
 5295     load32shorts(vs_front(vs2), zetas);
 5296     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5297     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5298 
 5299     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5300     load32shorts(vs_front(vs2), zetas);
 5301     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5302     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5303 
 5304     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5305     load32shorts(vs_front(vs2), zetas);
 5306     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5307     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5308 
 5309     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5310     __ mov(r0, zr); // return 0
 5311     __ ret(lr);
 5312 
 5313     return start;
 5314   }
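
        // For reference only: the scalar loop structure that the level-by-level code in
        // generate_kyberNtt() above unrolls. It reuses the hypothetical reference
        // helpers sketched earlier and simplifies the zeta indexing (the stub consumes
        // an expanded zetas array with 128 entries per level).
        static void kyber_ntt_reference(int16_t coeffs[256], const int16_t zetas[127],
                                        int16_t q, int16_t q_inv) {
          int k = 0;
          for (int len = 128; len >= 2; len >>= 1) {           // one iteration per level 0..6
            for (int start = 0; start < 256; start += 2 * len) {
              int16_t zeta = zetas[k++];
              for (int j = start; j < start + len; j++) {
                kyber_ct_butterfly_reference(coeffs[j], coeffs[j + len], zeta, q, q_inv);
              }
            }
          }
        }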
 5315 
 5316   // Kyber Inverse NTT function
 5317   // Implements
 5318   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5319   //
 5320   // coeffs (short[256]) = c_rarg0
 5321   // ntt_zetas (short[256]) = c_rarg1
 5322   address generate_kyberInverseNtt() {
 5323 
 5324     __ align(CodeEntryAlignment);
 5325     StubGenStubId stub_id = StubGenStubId::kyberInverseNtt_id;
 5326     StubCodeMark mark(this, stub_id);
 5327     address start = __ pc();
 5328     __ enter();
 5329 
 5330     const Register coeffs = c_rarg0;
 5331     const Register zetas = c_rarg1;
 5332 
 5333     const Register kyberConsts = r10;
 5334     const Register tmpAddr = r11;
 5335     const Register tmpAddr2 = c_rarg2;
 5336 
 5337     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5338     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5339     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5340 
 5341     __ lea(kyberConsts,
 5342              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5343 
 5344     // level 0
 5345     // At level 0 related coefficients occur in discrete blocks of size 4, so
 5346     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5347 
 5348     vs_ldpq(vq, kyberConsts);
 5349     int offsets4[4] = { 0, 32, 64, 96 };
 5350     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5351     load32shorts(vs_front(vs2), zetas);
 5352     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5353                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5354     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5355     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5356     load32shorts(vs_front(vs2), zetas);
 5357     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5358                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5359     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5360     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5361     load32shorts(vs_front(vs2), zetas);
 5362     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5363                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5364     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5365     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5366     load32shorts(vs_front(vs2), zetas);
 5367     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5368                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5369     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5370 
 5371     // level 1
 5372     // At level 1 related coefficients occur in discrete blocks of size 8 so
 5373     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5374 
 5375     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5376     load32shorts(vs_front(vs2), zetas);
 5377     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5378                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5379     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5380     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5381     load32shorts(vs_front(vs2), zetas);
 5382     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5383                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5384     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5385 
 5386     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5387     load32shorts(vs_front(vs2), zetas);
 5388     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5389                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5390     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5391     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5392     load32shorts(vs_front(vs2), zetas);
 5393     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5394                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5395     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5396 
 5397     // level 2
 5398     // At level 2 coefficients occur in 8 discrete blocks of size 16
 5399     // so they are loaded using an ldr at 8 distinct offsets.
 5400 
 5401     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5402     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5403     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5404     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5405     vs_subv(vs1, __ T8H, vs1, vs2);
 5406     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5407     load64shorts(vs2, zetas);
 5408     vs_ldpq(vq, kyberConsts);
 5409     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5410     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5411 
 5412     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5413     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5414     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5415     vs_subv(vs1, __ T8H, vs1, vs2);
 5416     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5417     load64shorts(vs2, zetas);
 5418     vs_ldpq(vq, kyberConsts);
 5419     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5420     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5421 
 5422     // Barrett reduction at indexes where overflow may happen
 5423 
 5424     // load q and the multiplier for the Barrett reduction
 5425     __ add(tmpAddr, kyberConsts, 16);
 5426     vs_ldpq(vq, tmpAddr);
 5427 
 5428     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5429     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5430     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
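          // An explanatory sketch of what the sqdmulh/sshr/mlsv sequence below
          // does per 16-bit lane, using the q and multiplier constants loaded
          // above (not extra generated code):
          //
          //   t = (x * barrettMultiplier) >> 26;  // sqdmulh gives (2*x*m) >> 16,
          //                                       // the sshr #11 completes >> 26
          //   x = x - t * q;                      // mlsv
          //
          // so that subsequent additions cannot overflow the 16-bit lanes.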
 5431     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5432     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5433     vs_sshr(vs2, __ T8H, vs2, 11);
 5434     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5435     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5436     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5437     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5438     vs_sshr(vs2, __ T8H, vs2, 11);
 5439     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5440     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5441 
 5442     // level 3
 5443     // From level 3 upwards coefficients occur in discrete blocks whose size is
 5444     // some multiple of 32 so they can be loaded using ldpq and suitable indexes.
 5445 
 5446     int offsets2[4] = { 0, 64, 128, 192 };
 5447     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5448     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5449     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5450     vs_subv(vs1, __ T8H, vs1, vs2);
 5451     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5452     load64shorts(vs2, zetas);
 5453     vs_ldpq(vq, kyberConsts);
 5454     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5455     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5456 
 5457     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5458     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5459     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5460     vs_subv(vs1, __ T8H, vs1, vs2);
 5461     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5462     load64shorts(vs2, zetas);
 5463     vs_ldpq(vq, kyberConsts);
 5464     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5465     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5466 
 5467     // level 4
 5468 
 5469     int offsets1[4] = { 0, 32, 128, 160 };
 5470     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5471     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5472     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5473     vs_subv(vs1, __ T8H, vs1, vs2);
 5474     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5475     load64shorts(vs2, zetas);
 5476     vs_ldpq(vq, kyberConsts);
 5477     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5478     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5479 
 5480     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5481     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5482     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5483     vs_subv(vs1, __ T8H, vs1, vs2);
 5484     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5485     load64shorts(vs2, zetas);
 5486     vs_ldpq(vq, kyberConsts);
 5487     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5488     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5489 
 5490     // level 5
 5491 
 5492     __ add(tmpAddr, coeffs, 0);
 5493     load64shorts(vs1, tmpAddr);
 5494     __ add(tmpAddr, coeffs, 128);
 5495     load64shorts(vs2, tmpAddr);
 5496     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5497     vs_subv(vs1, __ T8H, vs1, vs2);
 5498     __ add(tmpAddr, coeffs, 0);
 5499     store64shorts(vs3, tmpAddr);
 5500     load64shorts(vs2, zetas);
 5501     vs_ldpq(vq, kyberConsts);
 5502     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5503     __ add(tmpAddr, coeffs, 128);
 5504     store64shorts(vs2, tmpAddr);
 5505 
 5506     load64shorts(vs1, tmpAddr);
 5507     __ add(tmpAddr, coeffs, 384);
 5508     load64shorts(vs2, tmpAddr);
 5509     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5510     vs_subv(vs1, __ T8H, vs1, vs2);
 5511     __ add(tmpAddr, coeffs, 256);
 5512     store64shorts(vs3, tmpAddr);
 5513     load64shorts(vs2, zetas);
 5514     vs_ldpq(vq, kyberConsts);
 5515     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5516     __ add(tmpAddr, coeffs, 384);
 5517     store64shorts(vs2, tmpAddr);
 5518 
 5519     // Barrett reduction at indexes where overflow may happen
 5520 
 5521     // load q and the multiplier for the Barrett reduction
 5522     __ add(tmpAddr, kyberConsts, 16);
 5523     vs_ldpq(vq, tmpAddr);
 5524 
 5525     int offsets0[2] = { 0, 256 };
 5526     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5527     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5528     vs_sshr(vs2, __ T8H, vs2, 11);
 5529     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5530     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5531 
 5532     // level 6
 5533 
 5534     __ add(tmpAddr, coeffs, 0);
 5535     load64shorts(vs1, tmpAddr);
 5536     __ add(tmpAddr, coeffs, 256);
 5537     load64shorts(vs2, tmpAddr);
 5538     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5539     vs_subv(vs1, __ T8H, vs1, vs2);
 5540     __ add(tmpAddr, coeffs, 0);
 5541     store64shorts(vs3, tmpAddr);
 5542     load64shorts(vs2, zetas);
 5543     vs_ldpq(vq, kyberConsts);
 5544     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5545     __ add(tmpAddr, coeffs, 256);
 5546     store64shorts(vs2, tmpAddr);
 5547 
 5548     __ add(tmpAddr, coeffs, 128);
 5549     load64shorts(vs1, tmpAddr);
 5550     __ add(tmpAddr, coeffs, 384);
 5551     load64shorts(vs2, tmpAddr);
 5552     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5553     vs_subv(vs1, __ T8H, vs1, vs2);
 5554     __ add(tmpAddr, coeffs, 128);
 5555     store64shorts(vs3, tmpAddr);
 5556     load64shorts(vs2, zetas);
 5557     vs_ldpq(vq, kyberConsts);
 5558     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5559     __ add(tmpAddr, coeffs, 384);
 5560     store64shorts(vs2, tmpAddr);
 5561 
 5562     // multiply by 2^-n
 5563 
 5564     // load toMont(2^-n mod q)
 5565     __ add(tmpAddr, kyberConsts, 48);
 5566     __ ldr(v29, __ Q, tmpAddr);
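          // n.b. montmul(x, toMont(2^-n mod q)) == x * 2^-n * R * R^-1 == x * 2^-n (mod q),
          // i.e. multiplying by the toMont-ed constant applies the 2^-n scaling and
          // at the same time cancels the Montgomery factor that the multiply itself
          // introduces (explanatory note only; R is the Montgomery radix used by
          // kyber_montmul64).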
 5567 
 5568     vs_ldpq(vq, kyberConsts);
 5569     __ add(tmpAddr, coeffs, 0);
 5570     load64shorts(vs1, tmpAddr);
 5571     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5572     __ add(tmpAddr, coeffs, 0);
 5573     store64shorts(vs2, tmpAddr);
 5574 
 5575     // now tmpAddr contains coeffs + 128 because store64shorts advances it past the stored data
 5576     load64shorts(vs1, tmpAddr);
 5577     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5578     __ add(tmpAddr, coeffs, 128);
 5579     store64shorts(vs2, tmpAddr);
 5580 
 5581     // now tmpAddr contains coeffs + 256
 5582     load64shorts(vs1, tmpAddr);
 5583     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5584     __ add(tmpAddr, coeffs, 256);
 5585     store64shorts(vs2, tmpAddr);
 5586 
 5587     // now tmpAddr contains coeffs + 384
 5588     load64shorts(vs1, tmpAddr);
 5589     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5590     __ add(tmpAddr, coeffs, 384);
 5591     store64shorts(vs2, tmpAddr);
 5592 
 5593     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5594     __ mov(r0, zr); // return 0
 5595     __ ret(lr);
 5596 
 5597     return start;
 5598   }
 5599 
 5600   // Kyber multiply polynomials in the NTT domain.
 5601   // Implements
 5602   // static int implKyberNttMult(
 5603   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5604   //
 5605   // result (short[256]) = c_rarg0
 5606   // ntta (short[256]) = c_rarg1
 5607   // nttb (short[256]) = c_rarg2
 5608   // zetas (short[128]) = c_rarg3
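        //
        // For each pair of adjacent coefficients the loop below computes,
        // schematically,
        //
        //   c0 = a0*b0 + a1*b1*zeta
        //   c1 = a0*b1 + a1*b0
        //
        // with every product Montgomery-reduced (see the per-step comments in
        // the loop body). This is an explanatory note only.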
 5609   address generate_kyberNttMult() {
 5610 
 5611     __ align(CodeEntryAlignment);
 5612     StubGenStubId stub_id = StubGenStubId::kyberNttMult_id;
 5613     StubCodeMark mark(this, stub_id);
 5614     address start = __ pc();
 5615     __ enter();
 5616 
 5617     const Register result = c_rarg0;
 5618     const Register ntta = c_rarg1;
 5619     const Register nttb = c_rarg2;
 5620     const Register zetas = c_rarg3;
 5621 
 5622     const Register kyberConsts = r10;
 5623     const Register limit = r11;
 5624 
 5625     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 5626     VSeq<4> vs3(16), vs4(20);
 5627     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5628     VSeq<2> vz(28);          // pair of zetas
 5629     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5630 
 5631     __ lea(kyberConsts,
 5632              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5633 
 5634     Label kyberNttMult_loop;
 5635 
 5636     __ add(limit, result, 512);
 5637 
 5638     // load q and qinv
 5639     vs_ldpq(vq, kyberConsts);
 5640 
 5641     // load R^2 mod q (to convert back from Montgomery representation)
 5642     __ add(kyberConsts, kyberConsts, 64);
 5643     __ ldr(v27, __ Q, kyberConsts);
 5644 
 5645     __ BIND(kyberNttMult_loop);
 5646 
 5647     // load 16 zetas
 5648     vs_ldpq_post(vz, zetas);
 5649 
 5650     // load 2 sets of 32 coefficients from the two input arrays
 5651     // interleaved as shorts. i.e. pairs of shorts adjacent in memory
 5652     // are striped across pairs of vector registers
 5653     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5654     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5655     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5656     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 5657 
 5658     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5659     // i.e. montmul the first and second halves of vs1 in order and
 5660     // then with one sequence reversed storing the two results in vs3
 5661     //
 5662     // vs3[0] <- montmul(a0, b0)
 5663     // vs3[1] <- montmul(a1, b1)
 5664     // vs3[2] <- montmul(a0, b1)
 5665     // vs3[3] <- montmul(a1, b0)
 5666     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5667     kyber_montmul16(vs_back(vs3),
 5668                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5669 
 5670     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5671     // i.e. montmul the first and second halves of vs4 in order and
 5672     // then with one sequence reversed storing the two results in vs1
 5673     //
 5674     // vs1[0] <- montmul(a2, b2)
 5675     // vs1[1] <- montmul(a3, b3)
 5676     // vs1[2] <- montmul(a2, b3)
 5677     // vs1[3] <- montmul(a3, b2)
 5678     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5679     kyber_montmul16(vs_back(vs1),
 5680                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5681 
 5682     // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 5683     // We can schedule two montmuls at a time if we use a suitable vector
 5684     // sequence <vs3[1], vs1[1]>.
 5685     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5686     VSeq<2> vs5(vs3[1], delta);
 5687 
 5688     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5689     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5690     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5691 
 5692     // add results in pairs storing in vs3
 5693     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5694     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5695     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5696 
 5697     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5698     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5699     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 5700 
 5701     // vs1 <- montmul(vs3, montRSquareModQ)
 5702     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5703 
 5704     // store back the two pairs of result vectors de-interleaved as 8H elements
 5705     // i.e. storing each pair of shorts striped across a register pair adjacent
 5706     // in memory
 5707     vs_st2_post(vs1, __ T8H, result);
 5708 
 5709     __ cmp(result, limit);
 5710     __ br(Assembler::NE, kyberNttMult_loop);
 5711 
 5712     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5713     __ mov(r0, zr); // return 0
 5714     __ ret(lr);
 5715 
 5716     return start;
 5717   }
 5718 
 5719   // Kyber add 2 polynomials.
 5720   // Implements
 5721   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5722   //
 5723   // result (short[256]) = c_rarg0
 5724   // a (short[256]) = c_rarg1
 5725   // b (short[256]) = c_rarg2
 5726   address generate_kyberAddPoly_2() {
 5727 
 5728     __ align(CodeEntryAlignment);
 5729     StubGenStubId stub_id = StubGenStubId::kyberAddPoly_2_id;
 5730     StubCodeMark mark(this, stub_id);
 5731     address start = __ pc();
 5732     __ enter();
 5733 
 5734     const Register result = c_rarg0;
 5735     const Register a = c_rarg1;
 5736     const Register b = c_rarg2;
 5737 
 5738     const Register kyberConsts = r11;
 5739 
 5740     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5741     // So, we can load, add and store the data in 3 groups of 11,
 5742     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5743     // registers. A further constraint is that the mapping needs
 5744     // to skip callee saves. So, we allocate the register
 5745     // sequences using two 8 sequences, two 2 sequences and two
 5746     // single registers.
 5747     VSeq<8> vs1_1(0);
 5748     VSeq<2> vs1_2(16);
 5749     FloatRegister vs1_3 = v28;
 5750     VSeq<8> vs2_1(18);
 5751     VSeq<2> vs2_2(26);
 5752     FloatRegister vs2_3 = v29;
 5753 
 5754     // two constant vector sequences
 5755     VSeq<8> vc_1(31, 0);
 5756     VSeq<2> vc_2(31, 0);
 5757 
 5758     FloatRegister vc_3 = v31;
 5759     __ lea(kyberConsts,
 5760              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5761 
 5762     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
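          // Per 16-bit lane the loop below simply computes
          //
          //   result[i] = a[i] + b[i] + c
          //
          // where c is the constant quadword just loaded from kyberConsts + 16
          // (explanatory note only).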
 5763     for (int i = 0; i < 3; i++) {
 5764       // load 80 or 88 values from a into vs1_1/2/3
 5765       vs_ldpq_post(vs1_1, a);
 5766       vs_ldpq_post(vs1_2, a);
 5767       if (i < 2) {
 5768         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5769       }
 5770       // load 80 or 88 values from b into vs2_1/2/3
 5771       vs_ldpq_post(vs2_1, b);
 5772       vs_ldpq_post(vs2_2, b);
 5773       if (i < 2) {
 5774         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5775       }
 5776       // sum 80 or 88 values across vs1 and vs2 into vs1
 5777       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5778       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5779       if (i < 2) {
 5780         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5781       }
 5782       // add constant to all 80 or 88 results
 5783       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5784       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5785       if (i < 2) {
 5786         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5787       }
 5788       // store 80 or 88 values
 5789       vs_stpq_post(vs1_1, result);
 5790       vs_stpq_post(vs1_2, result);
 5791       if (i < 2) {
 5792         __ str(vs1_3, __ Q, __ post(result, 16));
 5793       }
 5794     }
 5795 
 5796     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5797     __ mov(r0, zr); // return 0
 5798     __ ret(lr);
 5799 
 5800     return start;
 5801   }
 5802 
 5803   // Kyber add 3 polynomials.
 5804   // Implements
 5805   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 5806   //
 5807   // result (short[256]) = c_rarg0
 5808   // a (short[256]) = c_rarg1
 5809   // b (short[256]) = c_rarg2
 5810   // c (short[256]) = c_rarg3
 5811   address generate_kyberAddPoly_3() {
 5812 
 5813     __ align(CodeEntryAlignment);
 5814     StubGenStubId stub_id = StubGenStubId::kyberAddPoly_3_id;
 5815     StubCodeMark mark(this, stub_id);
 5816     address start = __ pc();
 5817     __ enter();
 5818 
 5819     const Register result = c_rarg0;
 5820     const Register a = c_rarg1;
 5821     const Register b = c_rarg2;
 5822     const Register c = c_rarg3;
 5823 
 5824     const Register kyberConsts = r11;
 5825 
 5826     // As above we sum 256 sets of values in total i.e. 32 x 8H
 5827     // quadwords.  So, we can load, add and store the data in 3
 5828     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 5829     // of 10 or 11 registers. A further constraint is that the
 5830     // mapping needs to skip callee saves. So, we allocate the
 5831     // register sequences using two 8 sequences, two 2 sequences
 5832     // and two single registers.
 5833     VSeq<8> vs1_1(0);
 5834     VSeq<2> vs1_2(16);
 5835     FloatRegister vs1_3 = v28;
 5836     VSeq<8> vs2_1(18);
 5837     VSeq<2> vs2_2(26);
 5838     FloatRegister vs2_3 = v29;
 5839 
 5840     // two constant vector sequences
 5841     VSeq<8> vc_1(31, 0);
 5842     VSeq<2> vc_2(31, 0);
 5843 
 5844     FloatRegister vc_3 = v31;
 5845 
 5846     __ lea(kyberConsts,
 5847              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5848 
 5849     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5850     for (int i = 0; i < 3; i++) {
 5851       // load 80 or 88 values from a into vs1_1/2/3
 5852       vs_ldpq_post(vs1_1, a);
 5853       vs_ldpq_post(vs1_2, a);
 5854       if (i < 2) {
 5855         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5856       }
 5857       // load 80 or 88 values from b into vs2_1/2/3
 5858       vs_ldpq_post(vs2_1, b);
 5859       vs_ldpq_post(vs2_2, b);
 5860       if (i < 2) {
 5861         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5862       }
 5863       // sum 80 or 88 values across vs1 and vs2 into vs1
 5864       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5865       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5866       if (i < 2) {
 5867         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5868       }
 5869       // load 80 or 88 values from c into vs2_1/2/3
 5870       vs_ldpq_post(vs2_1, c);
 5871       vs_ldpq_post(vs2_2, c);
 5872       if (i < 2) {
 5873         __ ldr(vs2_3, __ Q, __ post(c, 16));
 5874       }
 5875       // sum 80 or 88 values across vs1 and vs2 into vs1
 5876       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5877       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5878       if (i < 2) {
 5879         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5880       }
 5881       // add constant to all 80 or 88 results
 5882       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5883       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5884       if (i < 2) {
 5885         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5886       }
 5887       // store 80 or 88 values
 5888       vs_stpq_post(vs1_1, result);
 5889       vs_stpq_post(vs1_2, result);
 5890       if (i < 2) {
 5891         __ str(vs1_3, __ Q, __ post(result, 16));
 5892       }
 5893     }
 5894 
 5895     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5896     __ mov(r0, zr); // return 0
 5897     __ ret(lr);
 5898 
 5899     return start;
 5900   }
 5901 
 5902   // Kyber parse XOF output to polynomial coefficient candidates
 5903   // or decodePoly(12, ...).
 5904   // Implements
 5905   // static int implKyber12To16(
 5906   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 5907   //
 5908   // (parsedLength or (parsedLength - 48) must be divisible by 64.)
 5909   //
 5910   // condensed (byte[]) = c_rarg0
 5911   // condensedIndex = c_rarg1
 5912   // parsed (short[112 or 256]) = c_rarg2
 5913   // parsedLength (112 or 256) = c_rarg3
 5914   address generate_kyber12To16() {
 5915     Label L_F00, L_loop, L_end;
 5916 
 5917     __ BIND(L_F00);
 5918     __ emit_int64(0x0f000f000f000f00);
 5919     __ emit_int64(0x0f000f000f000f00);
 5920 
 5921     __ align(CodeEntryAlignment);
 5922     StubGenStubId stub_id = StubGenStubId::kyber12To16_id;
 5923     StubCodeMark mark(this, stub_id);
 5924     address start = __ pc();
 5925     __ enter();
 5926 
 5927     const Register condensed = c_rarg0;
 5928     const Register condensedOffs = c_rarg1;
 5929     const Register parsed = c_rarg2;
 5930     const Register parsedLength = c_rarg3;
 5931 
 5932     const Register tmpAddr = r11;
 5933 
 5934     // Data is read in 96 bytes at a time, i.e. in groups of 6 x 16B
 5935     // quadwords, so we need a 6-vector sequence for the inputs.
 5936     // Parsing produces 64 shorts, employing two 8-vector
 5937     // sequences to store and combine the intermediate data.
 5938     VSeq<6> vin(24);
 5939     VSeq<8> va(0), vb(16);
 5940 
 5941     __ adr(tmpAddr, L_F00);
 5942     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 5943     __ add(condensed, condensed, condensedOffs);
 5944 
 5945     __ BIND(L_loop);
 5946     // load 96 (6 x 16B) byte values
 5947     vs_ld3_post(vin, __ T16B, condensed);
 5948 
 5949     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 5950     // holds 48 (16x3) contiguous bytes from memory striped
 5951     // horizontally across each of the 16 byte lanes. Equivalently,
 5952     // that is 16 pairs of 12-bit integers. Likewise the back half
 5953     // holds the next 48 bytes in the same arrangement.
 5954 
 5955     // Each vector in the front half can also be viewed as a vertical
 5956     // strip across the 16 pairs of 12 bit integers. Each byte in
 5957     // vin[0] stores the low 8 bits of the first int in a pair. Each
 5958     // byte in vin[1] stores the high 4 bits of the first int and the
 5959     // low 4 bits of the second int. Each byte in vin[2] stores the
 5960     // high 8 bits of the second int. Likewise for the vectors in the
 5961     // second half.
 5962 
 5963     // Converting the data to 16-bit shorts requires first of all
 5964     // expanding each of the 6 x 16B vectors into 6 corresponding
 5965     // pairs of 8H vectors. Mask, shift and add operations on the
 5966     // resulting vector pairs can be used to combine 4 and 8 bit
 5967     // parts of related 8H vector elements.
 5968     //
 5969     // The middle vectors (vin[2] and vin[5]) are actually expanded
 5970     // twice, one copy manipulated to provide the lower 4 bits
 5971     // belonging to the first short in a pair and another copy
 5972     // manipulated to provide the higher 4 bits belonging to the
 5973     // second short in a pair. This is why the vector sequences va
 5974     // and vb used to hold the expanded 8H elements are of length 8.
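          //
          // Schematically, for each input byte triple (b0, b1, b2) the two 12-bit
          // values recovered are (explanatory sketch only):
          //
          //   first  = b0        | ((b1 & 0x0f) << 8)   // lo 8 bits | hi 4 bits
          //   second = (b1 >> 4) | (b2 << 4)            // lo 4 bits | hi 8 bits
          //
          // which is what the mask, shift and add steps below compute across the
          // vector lanes.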
 5975 
 5976     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 5977     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 5978     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 5979     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 5980     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 5981     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 5982     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 5983     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 5984 
 5985     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 5986     // and vb[4:5]
 5987     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 5988     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 5989     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 5990     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 5991     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 5992     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 5993 
 5994     // shift lo byte of copy 1 of the middle stripe into the high byte
 5995     __ shl(va[2], __ T8H, va[2], 8);
 5996     __ shl(va[3], __ T8H, va[3], 8);
 5997     __ shl(vb[2], __ T8H, vb[2], 8);
 5998     __ shl(vb[3], __ T8H, vb[3], 8);
 5999 
 6000     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6001     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6002     // are in bit positions [4..11].
 6003     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6004     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6005     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6006     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6007 
 6008     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6009     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6010     // copy2
 6011     __ andr(va[2], __ T16B, va[2], v31);
 6012     __ andr(va[3], __ T16B, va[3], v31);
 6013     __ ushr(va[4], __ T8H, va[4], 4);
 6014     __ ushr(va[5], __ T8H, va[5], 4);
 6015     __ andr(vb[2], __ T16B, vb[2], v31);
 6016     __ andr(vb[3], __ T16B, vb[3], v31);
 6017     __ ushr(vb[4], __ T8H, vb[4], 4);
 6018     __ ushr(vb[5], __ T8H, vb[5], 4);
 6019 
 6020     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6021     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6022     // n.b. the ordering ensures: i) inputs are consumed before they
 6023     // are overwritten ii) the order of 16-bit results across successive
 6024     // pairs of vectors in va and then vb reflects the order of the
 6025     // corresponding 12-bit inputs
 6026     __ addv(va[0], __ T8H, va[0], va[2]);
 6027     __ addv(va[2], __ T8H, va[1], va[3]);
 6028     __ addv(va[1], __ T8H, va[4], va[6]);
 6029     __ addv(va[3], __ T8H, va[5], va[7]);
 6030     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6031     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6032     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6033     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6034 
 6035     // store 64 results interleaved as shorts
 6036     vs_st2_post(vs_front(va), __ T8H, parsed);
 6037     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6038 
 6039     __ sub(parsedLength, parsedLength, 64);
 6040     __ cmp(parsedLength, (u1)64);
 6041     __ br(Assembler::GE, L_loop);
 6042     __ cbz(parsedLength, L_end);
 6043 
 6044     // If anything is left it should be a final 72 bytes of input,
 6045     // i.e. a final 48 12-bit values. So we handle this by loading
 6046     // 48 bytes into all 16B lanes of front(vin) and only 24
 6047     // bytes into the lower 8B lanes of back(vin).
 6048     vs_ld3_post(vs_front(vin), __ T16B, condensed);
 6049     vs_ld3(vs_back(vin), __ T8B, condensed);
 6050 
 6051     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6052     // n.b. target elements 2 and 3 of va duplicate elements 4 and
 6053     // 5 and target element 2 of vb duplicates element 4.
 6054     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6055     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6056     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6057     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6058     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6059     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6060 
 6061     // This time expand just the lower 8 lanes
 6062     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6063     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6064     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6065 
 6066     // shift lo byte of copy 1 of the middle stripe into the high byte
 6067     __ shl(va[2], __ T8H, va[2], 8);
 6068     __ shl(va[3], __ T8H, va[3], 8);
 6069     __ shl(vb[2], __ T8H, vb[2], 8);
 6070 
 6071     // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
 6072     // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
 6073     // int are in bit positions [4..11].
 6074     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6075     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6076     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6077 
 6078     // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
 6079     // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
 6080     // copy2
 6081     __ andr(va[2], __ T16B, va[2], v31);
 6082     __ andr(va[3], __ T16B, va[3], v31);
 6083     __ ushr(va[4], __ T8H, va[4], 4);
 6084     __ ushr(va[5], __ T8H, va[5], 4);
 6085     __ andr(vb[2], __ T16B, vb[2], v31);
 6086     __ ushr(vb[4], __ T8H, vb[4], 4);
 6087 
 6088 
 6090     // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
 6091     // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
 6093     // n.b. ordering ensures: i) inputs are consumed before they are
 6094     // overwritten ii) order of 16-bit results across successive
 6095     // pairs of vectors in va and then lower half of vb reflects order
 6096     // of corresponding 12-bit inputs
 6097     __ addv(va[0], __ T8H, va[0], va[2]);
 6098     __ addv(va[2], __ T8H, va[1], va[3]);
 6099     __ addv(va[1], __ T8H, va[4], va[6]);
 6100     __ addv(va[3], __ T8H, va[5], va[7]);
 6101     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6102     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6103 
 6104     // store 48 results interleaved as shorts
 6105     vs_st2_post(vs_front(va), __ T8H, parsed);
 6106     vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
 6107 
 6108     __ BIND(L_end);
 6109 
 6110     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6111     __ mov(r0, zr); // return 0
 6112     __ ret(lr);
 6113 
 6114     return start;
 6115   }
 6116 
 6117   // Kyber Barrett reduce function.
 6118   // Implements
 6119   // static int implKyberBarrettReduce(short[] coeffs) {}
 6120   //
 6121   // coeffs (short[256]) = c_rarg0
 6122   address generate_kyberBarrettReduce() {
 6123 
 6124     __ align(CodeEntryAlignment);
 6125     StubGenStubId stub_id = StubGenStubId::kyberBarrettReduce_id;
 6126     StubCodeMark mark(this, stub_id);
 6127     address start = __ pc();
 6128     __ enter();
 6129 
 6130     const Register coeffs = c_rarg0;
 6131 
 6132     const Register kyberConsts = r10;
 6133     const Register result = r11;
 6134 
 6135     // As above we process 256 sets of values in total i.e. 32 x
 6136     // 8H quadwords. So, we can load, add and store the data in 3
 6137     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6138     // of 10 or 11 registers. A further constraint is that the
 6139     // mapping needs to skip callee saves. So, we allocate the
 6140     // register sequences using two 8 sequences, two 2 sequences
 6141     // and two single registers.
 6142     VSeq<8> vs1_1(0);
 6143     VSeq<2> vs1_2(16);
 6144     FloatRegister vs1_3 = v28;
 6145     VSeq<8> vs2_1(18);
 6146     VSeq<2> vs2_2(26);
 6147     FloatRegister vs2_3 = v29;
 6148 
 6149     // we also need a pair of corresponding constant sequences
 6150 
 6151     VSeq<8> vc1_1(30, 0);
 6152     VSeq<2> vc1_2(30, 0);
 6153     FloatRegister vc1_3 = v30; // for kyber_q
 6154 
 6155     VSeq<8> vc2_1(31, 0);
 6156     VSeq<2> vc2_2(31, 0);
 6157     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6158 
 6159     __ add(result, coeffs, 0);
 6160     __ lea(kyberConsts,
 6161              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6162 
 6163     // load q and the multiplier for the Barrett reduction
 6164     __ add(kyberConsts, kyberConsts, 16);
 6165     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6166 
 6167     for (int i = 0; i < 3; i++) {
 6168       // load 80 or 88 coefficients
 6169       vs_ldpq_post(vs1_1, coeffs);
 6170       vs_ldpq_post(vs1_2, coeffs);
 6171       if (i < 2) {
 6172         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6173       }
 6174 
 6175       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6176       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6177       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6178       if (i < 2) {
 6179         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6180       }
 6181 
 6182       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6183       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6184       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6185       if (i < 2) {
 6186         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6187       }
 6188 
 6189       // vs1 <- vs1 - vs2 * kyber_q
 6190       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6191       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6192       if (i < 2) {
 6193         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6194       }
 6195 
 6196       vs_stpq_post(vs1_1, result);
 6197       vs_stpq_post(vs1_2, result);
 6198       if (i < 2) {
 6199         __ str(vs1_3, __ Q, __ post(result, 16));
 6200       }
 6201     }
 6202 
 6203     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6204     __ mov(r0, zr); // return 0
 6205     __ ret(lr);
 6206 
 6207     return start;
 6208   }
 6209 
 6210 
 6211   // Dilithium-specific montmul helper routines that generate parallel
 6212   // code for, respectively, a single 4x4s vector sequence montmul or
 6213   // two such multiplies in a row.
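        // In the comments below, montmul(a, b) denotes the Montgomery product
        // a * b * R^-1 (mod q), where q is the Dilithium modulus 8380417 and R
        // is the Montgomery radix used by the vs_montmul4 helper for the 32-bit
        // lanes.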
 6214 
 6215   // Perform 16 32-bit Montgomery multiplications in parallel
 6216   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6217                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6218     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6219     // It will assert that the register use is valid
 6220     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6221   }
 6222 
 6223   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6224   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6225                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6226     // Schedule two successive 4x4S multiplies via the montmul helper
 6227     // on the front and back halves of va, vb and vc. The helper will
 6228     // assert that the register use has no overlap conflicts on each
 6229     // individual call but we also need to ensure that the necessary
 6230     // disjoint/equality constraints are met across both calls.
 6231 
 6232     // vb, vc, vtmp and vq must be disjoint. va must either be
 6233     // disjoint from all other registers or equal vc
 6234 
 6235     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6236     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6237     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6238 
 6239     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6240     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6241 
 6242     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6243 
 6244     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6245     assert(vs_disjoint(va, vb), "va and vb overlap");
 6246     assert(vs_disjoint(va, vq), "va and vq overlap");
 6247     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6248 
 6249     // We multiply the front and back halves of each sequence 4 at a
 6250     // time because
 6251     //
 6252     // 1) we are currently only able to get 4-way instruction
 6253     // parallelism at best
 6254     //
 6255     // 2) we need registers for the constants in vq and temporary
 6256     // scratch registers to hold intermediate results so vtmp can only
 6257     // be a VSeq<4> which means we only have 4 scratch slots.
 6258 
 6259     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6260     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6261   }
 6262 
 6263   // Perform combined montmul then add/sub on 4x4S vectors.
 6264   void dilithium_montmul16_sub_add(
 6265           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6266           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6267     // compute a = montmul(a1, c)
 6268     dilithium_montmul16(vc, va1, vc, vtmp, vq);
 6269     // output a1 = a0 - a
 6270     vs_subv(va1, __ T4S, va0, vc);
 6271     //    and a0 = a0 + a
 6272     vs_addv(va0, __ T4S, va0, vc);
 6273   }
 6274 
 6275   // Perform combined add/sub then montmul on 4x4S vectors.
 6276   void dilithium_sub_add_montmul16(
 6277           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6278           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6279     // compute c = a0 - a1
 6280     vs_subv(vtmp1, __ T4S, va0, va1);
 6281     // output a0 = a0 + a1
 6282     vs_addv(va0, __ T4S, va0, va1);
 6283     // output a1 = b montmul c
 6284     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6285   }
 6286 
 6287   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6288   // in the Java implementation come in sequences of at least 8, so we
 6289   // can use ldpq to collect the corresponding data into pairs of vector
 6290   // registers.
 6291   // We collect the coefficients corresponding to the 'j+l' indexes into
 6292   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6293   // then we do the (Montgomery) multiplications by the zetas in parallel
 6294   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6295   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6296   // v0-v7 and finally save the results back to the coeffs array.
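        // Schematically, per related coefficient pair this is the usual forward
        // butterfly of the Java loop (explanatory sketch only):
        //
        //   t             = montmul(zeta, coeffs[j + l]);
        //   coeffs[j + l] = coeffs[j] - t;
        //   coeffs[j]     = coeffs[j] + t;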
 6297   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6298     const Register coeffs, const Register zetas) {
 6299     int c1 = 0;
 6300     int c2 = 512;
 6301     int startIncr;
 6302     // don't use callee save registers v8 - v15
 6303     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6304     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6305     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6306     int offsets[4] = { 0, 32, 64, 96 };
 6307 
 6308     for (int level = 0; level < 5; level++) {
 6309       int c1Start = c1;
 6310       int c2Start = c2;
 6311       if (level == 3) {
 6312         offsets[1] = 32;
 6313         offsets[2] = 128;
 6314         offsets[3] = 160;
 6315       } else if (level == 4) {
 6316         offsets[1] = 64;
 6317         offsets[2] = 128;
 6318         offsets[3] = 192;
 6319       }
 6320 
 6321       // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
 6322       // time at 4 different offsets and multiply them in order by the
 6323       // next set of input values. So we employ indexed load and store
 6324       // pair instructions with arrangement 4S.
 6325       for (int i = 0; i < 4; i++) {
 6326         // reload q and qinv
 6327         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6328         // load 8x4S coefficients via second start pos == c2
 6329         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6330         // load next 8x4S inputs == b
 6331         vs_ldpq_post(vs2, zetas);
 6332         // compute a == c2 * b mod MONT_Q
 6333         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6334         // load 8x4s coefficients via first start pos == c1
 6335         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6336         // compute a1 =  c1 + a
 6337         vs_addv(vs3, __ T4S, vs1, vs2);
 6338         // compute a2 =  c1 - a
 6339         vs_subv(vs1, __ T4S, vs1, vs2);
 6340         // output a1 and a2
 6341         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6342         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6343 
 6344         int k = 4 * level + i;
 6345 
 6346         if (k > 7) {
 6347           startIncr = 256;
 6348         } else if (k == 5) {
 6349           startIncr = 384;
 6350         } else {
 6351           startIncr = 128;
 6352         }
 6353 
 6354         c1Start += startIncr;
 6355         c2Start += startIncr;
 6356       }
 6357 
 6358       c2 /= 2;
 6359     }
 6360   }
 6361 
 6362   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6363   // Implements the method
 6364   // static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
 6365   // of the sun.security.provider.ML_DSA class.
 6366   //
 6367   // coeffs (int[256]) = c_rarg0
 6368   // zetas (int[256]) = c_rarg1
 6369   address generate_dilithiumAlmostNtt() {
 6370 
 6371     __ align(CodeEntryAlignment);
 6372     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
 6373     StubCodeMark mark(this, stub_id);
 6374     address start = __ pc();
 6375     __ enter();
 6376 
 6377     const Register coeffs = c_rarg0;
 6378     const Register zetas = c_rarg1;
 6379 
 6380     const Register tmpAddr = r9;
 6381     const Register dilithiumConsts = r10;
 6382     const Register result = r11;
 6383     // don't use callee save registers v8 - v15
 6384     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6385     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6386     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6387     int offsets[4] = { 0, 32, 64, 96};
 6388     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6389     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6390     __ add(result, coeffs, 0);
 6391     __ lea(dilithiumConsts,
 6392              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6393 
 6394     // Each level represents one iteration of the outer for loop of the Java version.
 6395 
 6396     // level 0-4
 6397     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6398 
 6399     // level 5
 6400 
 6401     // At level 5 the coefficients we need to combine with the zetas
 6402     // are grouped in memory in blocks of size 4. So, for both sets of
 6403     // coefficients we load 4 adjacent values at 8 different offsets
 6404     // using an indexed ldr with register variant Q and multiply them
 6405     // in sequence order by the next set of inputs. Likewise we store
 6406     // the results using an indexed str with register variant Q.
 6407     for (int i = 0; i < 1024; i += 256) {
 6408       // reload constants q, qinv each iteration as they get clobbered later
 6409       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6410       // load 32 (8x4S) coefficients via first offsets = c1
 6411       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6412       // load next 32 (8x4S) inputs = b
 6413       vs_ldpq_post(vs2, zetas);
 6414       // a = b montmul c1
 6415       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6416       // load 32 (8x4S) coefficients via second offsets = c2
 6417       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6418       // add/sub with result of multiply
 6419       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
 6420       vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 6421       // write back new coefficients using same offsets
 6422       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6423       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6424     }
 6425 
 6426     // level 6
 6427     // At level 6 the coefficients we need to combine with the zetas
 6428     // are grouped in memory in pairs, the first two being montmul
 6429     // inputs and the second add/sub inputs. We can still implement
 6430     // the montmul+sub+add using 4-way parallelism but only if we
 6431     // combine the coefficients with the zetas 16 at a time. We load 8
 6432     // adjacent values at 4 different offsets using an ld2 load with
 6433     // arrangement 2D. That interleaves the lower and upper halves of
 6434     // each pair of quadwords into successive vector registers. We
 6435     // then need to montmul the 4 even elements of the coefficients
 6436     // register sequence by the zetas in order and then add/sub the 4
 6437     // odd elements of the coefficients register sequence. We use an
 6438     // equivalent st2 operation to store the results back into memory
 6439     // de-interleaved.
 6440     for (int i = 0; i < 1024; i += 128) {
 6441       // reload constants q, qinv each iteration as they get clobbered later
 6442       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6443       // load interleaved 16 (4x2D) coefficients via offsets
 6444       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6445       // load next 16 (4x4S) inputs
 6446       vs_ldpq_post(vs_front(vs2), zetas);
 6447       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6448       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6449                                   vs_front(vs2), vtmp, vq);
 6450       // store interleaved 16 (4x2D) coefficients via offsets
 6451       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6452     }
 6453 
 6454     // level 7
 6455     // At level 7 the coefficients we need to combine with the zetas
 6456     // occur singly with montmul inputs alternating with add/sub
 6457     // inputs. Once again we can use 4-way parallelism to combine 16
 6458     // zetas at a time. However, we have to load 8 adjacent values at
 6459     // 4 different offsets using an ld2 load with arrangement 4S. That
 6460     // interleaves the odd words of each pair into one
 6461     // coefficients vector register and the even words of the pair
 6462     // into the next register. We then need to montmul the 4 even
 6463     // elements of the coefficients register sequence by the zetas in
 6464     // order and then add/sub the 4 odd elements of the coefficients
 6465     // register sequence. We use an equivalent st2 operation to store
 6466     // the results back into memory de-interleaved.
 6467 
 6468     for (int i = 0; i < 1024; i += 128) {
 6469       // reload constants q, qinv each iteration as they get clobbered later
 6470       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6471       // load interleaved 16 (4x4S) coefficients via offsets
 6472       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6473       // load next 16 (4x4S) inputs
 6474       vs_ldpq_post(vs_front(vs2), zetas);
 6475       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6476       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6477                                   vs_front(vs2), vtmp, vq);
 6478       // store interleaved 16 (4x4S) coefficients via offsets
 6479       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6480     }
 6481     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6482     __ mov(r0, zr); // return 0
 6483     __ ret(lr);
 6484 
 6485     return start;
 6486   }
 6487 
 6488   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6489   // in the Java implementation come in sequences of at least 8, so we
 6490   // can use ldpq to collect the corresponding data into pairs of vector
 6491   // registers.
 6492   // We collect the coefficients that correspond to the 'j's into vs1 and
 6493   // the coefficients that correspond to the 'j+l's into vs2, then
 6494   // do the additions into vs3 and the subtractions into vs1, then
 6495   // save the result of the additions, load the zetas into vs2,
 6496   // do the (Montgomery) multiplications by zeta in parallel into vs2 and
 6497   // finally save the results back to the coeffs array.
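        // Schematically, per related coefficient pair this is the usual inverse
        // butterfly of the Java loop (explanatory sketch only):
        //
        //   t             = coeffs[j];
        //   coeffs[j]     = t + coeffs[j + l];
        //   coeffs[j + l] = montmul(t - coeffs[j + l], zeta);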
 6498   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6499     const Register coeffs, const Register zetas) {
 6500     int c1 = 0;
 6501     int c2 = 32;
 6502     int startIncr;
 6503     int offsets[4];
 6504     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6505     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6506     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6507 
 6508     offsets[0] = 0;
 6509 
 6510     for (int level = 3; level < 8; level++) {
 6511       int c1Start = c1;
 6512       int c2Start = c2;
 6513       if (level == 3) {
 6514         offsets[1] = 64;
 6515         offsets[2] = 128;
 6516         offsets[3] = 192;
 6517       } else if (level == 4) {
 6518         offsets[1] = 32;
 6519         offsets[2] = 128;
 6520         offsets[3] = 160;
 6521       } else {
 6522         offsets[1] = 32;
 6523         offsets[2] = 64;
 6524         offsets[3] = 96;
 6525       }
 6526 
 6527       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6528       // time at 4 different offsets and multiply them in order by the
 6529       // next set of input values. So we employ indexed load and store
 6530       // pair instructions with arrangement 4S.
 6531       for (int i = 0; i < 4; i++) {
 6532         // load v1 32 (8x4S) coefficients relative to first start index
 6533         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6534         // load v2 32 (8x4S) coefficients relative to second start index
 6535         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 6536         // a0 = v1 + v2 -- n.b. clobbers vq
 6537         vs_addv(vs3, __ T4S, vs1, vs2);
 6538         // a1 = v1 - v2
 6539         vs_subv(vs1, __ T4S, vs1, vs2);
 6540         // save a0 relative to first start index
 6541         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6542         // load constants q, qinv each iteration as they get clobbered above
 6543         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6544         // load b next 32 (8x4S) inputs
 6545         vs_ldpq_post(vs2, zetas);
 6546         // a = a1 montmul b
 6547         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6548         // save a relative to second start index
 6549         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6550 
 6551         int k = 4 * level + i;
 6552 
 6553         if (k < 24) {
 6554           startIncr = 256;
 6555         } else if (k == 25) {
 6556           startIncr = 384;
 6557         } else {
 6558           startIncr = 128;
 6559         }
 6560 
 6561         c1Start += startIncr;
 6562         c2Start += startIncr;
 6563       }
 6564 
 6565       c2 *= 2;
 6566     }
 6567   }
 6568 
 6569   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6570   // Implements the method
 6571   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6572   // the sun.security.provider.ML_DSA class.
 6573   //
 6574   // coeffs (int[256]) = c_rarg0
 6575   // zetas (int[256]) = c_rarg1
 6576   address generate_dilithiumAlmostInverseNtt() {
 6577 
 6578     __ align(CodeEntryAlignment);
 6579     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id;
 6580     StubCodeMark mark(this, stub_id);
 6581     address start = __ pc();
 6582     __ enter();
 6583 
 6584     const Register coeffs = c_rarg0;
 6585     const Register zetas = c_rarg1;
 6586 
 6587     const Register tmpAddr = r9;
 6588     const Register dilithiumConsts = r10;
 6589     const Register result = r11;
 6590     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6591     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 6592     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6593     int offsets[4] = { 0, 32, 64, 96 };
 6594     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6595     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6596 
 6597     __ add(result, coeffs, 0);
 6598     __ lea(dilithiumConsts,
 6599              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6600 
 6601     // Each level represents one iteration of the outer for loop of the Java version
 6602 
 6603     // level 0
 6604     // At level 0 we need to interleave adjacent quartets of
 6605     // coefficients before we multiply and add/sub by the next 16
 6606     // zetas just as we did for level 7 in the multiply code. So we
 6607     // load and store the values using an ld2/st2 with arrangement 4S.
 6608     for (int i = 0; i < 1024; i += 128) {
 6609       // load constants q, qinv
 6610       // n.b. this can be moved out of the loop as they do not get
 6611       // clobbered by first two loops
 6612       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6613       // a0/a1 load interleaved 32 (8x4S) coefficients
 6614       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6615       // b load next 32 (8x4S) inputs
 6616       vs_ldpq_post(vs_front(vs2), zetas);
 6617       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6618       // n.b. second half of vs2 provides temporary register storage
 6619       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6620                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6621       // a0/a1 store interleaved 32 (8x4S) coefficients
 6622       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6623     }
 6624 
 6625     // level 1
 6626     // At level 1 we need to interleave pairs of adjacent pairs of
 6627     // coefficients before we multiply by the next 16 zetas just as we
 6628     // did for level 6 in the multiply code. So we load and store the
 6629     // values an ld2/st2 with arrangement 2D.
 6630     for (int i = 0; i < 1024; i += 128) {
 6631       // a0/a1 load interleaved 32 (8x2D) coefficients
 6632       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6633       // b load next 16 (4x4S) inputs
 6634       vs_ldpq_post(vs_front(vs2), zetas);
 6635       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6636       // n.b. second half of vs2 provides temporary register storage
 6637       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6638                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6639       // a0/a1 store interleaved 32 (8x2D) coefficients
 6640       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6641     }
 6642 
 6643     // level 2
 6644     // At level 2 coefficients come in blocks of 4. So, we load 4
 6645     // adjacent coefficients at 8 distinct offsets for both the first
 6646     // and second coefficient sequences, using an ldr with register
 6647     // variant Q then combine them with next set of 32 zetas. Likewise
 6648     // we store the results using an str with register variant Q.
 6649     for (int i = 0; i < 1024; i += 256) {
 6650       // c0 load 32 (8x4S) coefficients via first offsets
 6651       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6652       // c1 load 32 (8x4S) coefficients via second offsets
 6653       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6654       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6655       vs_addv(vs3, __ T4S, vs1, vs2);
 6656       // c = c0 - c1
 6657       vs_subv(vs1, __ T4S, vs1, vs2);
 6658       // store a0 32 (8x4S) coefficients via first offsets
 6659       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6660       // b load 32 (8x4S) next inputs
 6661       vs_ldpq_post(vs2, zetas);
 6662       // reload constants q, qinv -- they were clobbered earlier
 6663       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6664       // compute a1 = b montmul c
 6665       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6666       // store a1 32 (8x4S) coefficients via second offsets
 6667       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6668     }
 6669 
 6670     // level 3-7
 6671     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6672 
 6673     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6674     __ mov(r0, zr); // return 0
 6675     __ ret(lr);
 6676 
 6677     return start;
 6678   }
 6679 
 6680   // Dilithium multiply polynomials in the NTT domain.
 6681   // Straightforward implementation of the method
 6682   // static int implDilithiumNttMult(
 6683   //              int[] result, int[] ntta, int[] nttb) {} of
 6684   // the sun.security.provider.ML_DSA class.
 6685   //
 6686   // result (int[256]) = c_rarg0
 6687   // poly1 (int[256]) = c_rarg1
 6688   // poly2 (int[256]) = c_rarg2
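        // In scalar terms the loop below computes, for each index i (a sketch
        // matching the two montmul steps in the generated code, where
        // montmul(x, y) denotes the Montgomery product x * y * R^-1 mod q):
        //   result[i] = montmul(montmul(poly1[i], poly2[i]), rSquare);
        // the extra montmul by rSquare (loaded into v29 below) compensates for
        // the R^-1 factors introduced by the Montgomery products.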
 6689   address generate_dilithiumNttMult() {
 6690 
 6691     __ align(CodeEntryAlignment);
 6692     StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id;
 6693     StubCodeMark mark(this, stub_id);
 6694     address start = __ pc();
 6695     __ enter();
 6696 
 6697     Label L_loop;
 6698 
 6699     const Register result = c_rarg0;
 6700     const Register poly1 = c_rarg1;
 6701     const Register poly2 = c_rarg2;
 6702 
 6703     const Register dilithiumConsts = r10;
 6704     const Register len = r11;
 6705 
 6706     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6707     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6708     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6709     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6710 
 6711     __ lea(dilithiumConsts,
 6712              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6713 
 6714     // load constants q, qinv
 6715     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6716     // load constant rSquare into v29
 6717     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 6718 
 6719     __ mov(len, zr);
 6720     __ add(len, len, 1024);
 6721 
 6722     __ BIND(L_loop);
 6723 
 6724     // b load 32 (8x4S) next inputs from poly1
 6725     vs_ldpq_post(vs1, poly1);
 6726     // c load 32 (8x4S) next inputs from poly2
 6727     vs_ldpq_post(vs2, poly2);
 6728     // compute a = b montmul c
 6729     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6730     // compute a = rsquare montmul a
 6731     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6732     // save a 32 (8x4S) results
 6733     vs_stpq_post(vs2, result);
 6734 
 6735     __ sub(len, len, 128);
 6736     __ cmp(len, (u1)128);
 6737     __ br(Assembler::GE, L_loop);
 6738 
 6739     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6740     __ mov(r0, zr); // return 0
 6741     __ ret(lr);
 6742 
 6743     return start;
 6744   }
 6745 
 6746   // Dilithium Montgomery multiply an array by a constant.
 6747   // A straightforward implementation of the method
 6748   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
 6749   // of the sun.security.provider.ML_DSA class
 6750   //
 6751   // coeffs (int[256]) = c_rarg0
 6752   // constant (int) = c_rarg1
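        // In scalar terms the loop below computes (a sketch of the per-element
        // operation, with montmul the Montgomery product mod q):
        //   coeffs[i] = montmul(coeffs[i], constant);
        // the constant is broadcast across a vector register so that 32
        // coefficients are processed per loop iteration.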
 6753   address generate_dilithiumMontMulByConstant() {
 6754 
 6755     __ align(CodeEntryAlignment);
 6756     StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id;
 6757     StubCodeMark mark(this, stub_id);
 6758     address start = __ pc();
 6759     __ enter();
 6760 
 6761     Label L_loop;
 6762 
 6763     const Register coeffs = c_rarg0;
 6764     const Register constant = c_rarg1;
 6765 
 6766     const Register dilithiumConsts = r10;
 6767     const Register result = r11;
 6768     const Register len = r12;
 6769 
 6770     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6771     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6772     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6773     VSeq<8> vconst(29, 0);             // for montmul by constant
 6774 
 6775     // results track inputs
 6776     __ add(result, coeffs, 0);
 6777     __ lea(dilithiumConsts,
 6778              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6779 
 6780     // load constants q, qinv -- these are not clobbered by the loop below
 6781     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6782     // copy caller supplied constant across vconst
 6783     __ dup(vconst[0], __ T4S, constant);
 6784     __ mov(len, zr);
 6785     __ add(len, len, 1024);
 6786 
 6787     __ BIND(L_loop);
 6788 
 6789     // load next 32 inputs
 6790     vs_ldpq_post(vs2, coeffs);
 6791     // mont mul by constant
 6792     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6793     // write next 32 results
 6794     vs_stpq_post(vs2, result);
 6795 
 6796     __ sub(len, len, 128);
 6797     __ cmp(len, (u1)128);
 6798     __ br(Assembler::GE, L_loop);
 6799 
 6800     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6801     __ mov(r0, zr); // return 0
 6802     __ ret(lr);
 6803 
 6804     return start;
 6805   }
 6806 
 6807   // Dilithium decompose poly.
 6808   // Implements the method
 6809   // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {}
 6810   // of the sun.security.provider.ML_DSA class
 6811   //
 6812   // input (int[256]) = c_rarg0
 6813   // lowPart (int[256]) = c_rarg1
 6814   // highPart (int[256]) = c_rarg2
 6815   // twoGamma2  (int) = c_rarg3
 6816   // multiplier (int) = c_rarg4
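        // The loop below mirrors the scalar decompose step (see the per-line
        // Java-style comments in the body): each input coefficient rplus is
        // reduced mod q, split as rplus = quotient * twoGamma2 + r0 with r0
        // centred around zero, adjusted for the corner case
        // rplus - r0 == q - 1, and the resulting r0/r1 pair is stored to
        // lowPart/highPart.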
 6817   address generate_dilithiumDecomposePoly() {
 6818 
 6819     __ align(CodeEntryAlignment);
 6820     StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id;
 6821     StubCodeMark mark(this, stub_id);
 6822     address start = __ pc();
 6823     Label L_loop;
 6824 
 6825     const Register input = c_rarg0;
 6826     const Register lowPart = c_rarg1;
 6827     const Register highPart = c_rarg2;
 6828     const Register twoGamma2 = c_rarg3;
 6829     const Register multiplier = c_rarg4;
 6830 
 6831     const Register len = r9;
 6832     const Register dilithiumConsts = r10;
 6833     const Register tmp = r11;
 6834 
 6835     // 6 independent sets of 4x4s values
 6836     VSeq<4> vs1(0), vs2(4), vs3(8);
 6837     VSeq<4> vs4(12), vs5(16), vtmp(20);
 6838 
 6839     // 7 constants for cross-multiplying
 6840     VSeq<4> one(25, 0);
 6841     VSeq<4> qminus1(26, 0);
 6842     VSeq<4> g2(27, 0);
 6843     VSeq<4> twog2(28, 0);
 6844     VSeq<4> mult(29, 0);
 6845     VSeq<4> q(30, 0);
 6846     VSeq<4> qadd(31, 0);
 6847 
 6848     __ enter();
 6849 
 6850     __ lea(dilithiumConsts,
 6851              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6852 
 6853     // save callee-saved registers
 6854     __ stpd(v8, v9, __ pre(sp, -64));
 6855     __ stpd(v10, v11, Address(sp, 16));
 6856     __ stpd(v12, v13, Address(sp, 32));
 6857     __ stpd(v14, v15, Address(sp, 48));
 6858 
 6859     // populate constant registers
 6860     __ mov(tmp, zr);
 6861     __ add(tmp, tmp, 1);
 6862     __ dup(one[0], __ T4S, tmp); // 1
 6863     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 6864     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 6865     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 6866     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma2 reduce
 6867     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 6868     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 6869 
 6870     __ mov(len, zr);
 6871     __ add(len, len, 1024);
 6872 
 6873     __ BIND(L_loop);
 6874 
 6875     // load next 4x4S inputs interleaved: rplus --> vs1
 6876     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 6877 
 6878     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 6879     vs_addv(vtmp, __ T4S, vs1, qadd);
 6880     vs_sshr(vtmp, __ T4S, vtmp, 23);
 6881     vs_mulv(vtmp, __ T4S, vtmp, q);
 6882     vs_subv(vs1, __ T4S, vs1, vtmp);
 6883 
 6884     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 6885     vs_sshr(vtmp, __ T4S, vs1, 31);
 6886     vs_andr(vtmp, vtmp, q);
 6887     vs_addv(vs1, __ T4S, vs1, vtmp);
 6888 
 6889     // quotient --> vs2
 6890     // int quotient = (rplus * multiplier) >> 22;
 6891     vs_mulv(vtmp, __ T4S, vs1, mult);
 6892     vs_sshr(vs2, __ T4S, vtmp, 22);
 6893 
 6894     // r0 --> vs3
 6895     // int r0 = rplus - quotient * twoGamma2;
 6896     vs_mulv(vtmp, __ T4S, vs2, twog2);
 6897     vs_subv(vs3, __ T4S, vs1, vtmp);
 6898 
 6899     // mask --> vs4
 6900     // int mask = (twoGamma2 - r0) >> 22;
 6901     vs_subv(vtmp, __ T4S, twog2, vs3);
 6902     vs_sshr(vs4, __ T4S, vtmp, 22);
 6903 
 6904     // r0 -= (mask & twoGamma2);
 6905     vs_andr(vtmp, vs4, twog2);
 6906     vs_subv(vs3, __ T4S, vs3, vtmp);
 6907 
 6908     //  quotient += (mask & 1);
 6909     vs_andr(vtmp, vs4, one);
 6910     vs_addv(vs2, __ T4S, vs2, vtmp);
 6911 
 6912     // mask = (twoGamma2 / 2 - r0) >> 31;
 6913     vs_subv(vtmp, __ T4S, g2, vs3);
 6914     vs_sshr(vs4, __ T4S, vtmp, 31);
 6915 
 6916     // r0 -= (mask & twoGamma2);
 6917     vs_andr(vtmp, vs4, twog2);
 6918     vs_subv(vs3, __ T4S, vs3, vtmp);
 6919 
 6920     // quotient += (mask & 1);
 6921     vs_andr(vtmp, vs4, one);
 6922     vs_addv(vs2, __ T4S, vs2, vtmp);
 6923 
 6924     // r1 --> vs5
 6925     // int r1 = rplus - r0 - (dilithium_q - 1);
 6926     vs_subv(vtmp, __ T4S, vs1, vs3);
 6927     vs_subv(vs5, __ T4S, vtmp, qminus1);
 6928 
 6929     // r1 --> vs1 (overwriting rplus)
 6930     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 6931     vs_negr(vtmp, __ T4S, vs5);
 6932     vs_orr(vtmp, vs5, vtmp);
 6933     vs_sshr(vs1, __ T4S, vtmp, 31);
 6934 
 6935     // r0 += ~r1;
 6936     vs_notr(vtmp, vs1);
 6937     vs_addv(vs3, __ T4S, vs3, vtmp);
 6938 
 6939     // r1 = r1 & quotient;
 6940     vs_andr(vs1, vs2, vs1);
 6941 
 6942     // store results interleaved
 6943     // lowPart[m] = r0;
 6944     // highPart[m] = r1;
 6945     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 6946     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 6947 
 6948     __ sub(len, len, 64);
 6949     __ cmp(len, (u1)64);
 6950     __ br(Assembler::GE, L_loop);
 6951 
 6952     // restore callee-saved vector registers
 6953     __ ldpd(v14, v15, Address(sp, 48));
 6954     __ ldpd(v12, v13, Address(sp, 32));
 6955     __ ldpd(v10, v11, Address(sp, 16));
 6956     __ ldpd(v8, v9, __ post(sp, 64));
 6957 
 6958     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6959     __ mov(r0, zr); // return 0
 6960     __ ret(lr);
 6961 
 6962     return start;
 6963   }
 6964 
 6965   /**
 6966    *  Arguments:
 6967    *
 6968    * Inputs:
 6969    *   c_rarg0   - int crc
 6970    *   c_rarg1   - byte* buf
 6971    *   c_rarg2   - int length
 6972    *
 6973    * Output:
 6974    *       rax   - int crc result
 6975    */
 6976   address generate_updateBytesCRC32() {
 6977     assert(UseCRC32Intrinsics, "what are we doing here?");
 6978 
 6979     __ align(CodeEntryAlignment);
 6980     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
 6981     StubCodeMark mark(this, stub_id);
 6982 
 6983     address start = __ pc();
 6984 
 6985     const Register crc   = c_rarg0;  // crc
 6986     const Register buf   = c_rarg1;  // source java byte array address
 6987     const Register len   = c_rarg2;  // length
 6988     const Register table0 = c_rarg3; // crc_table address
 6989     const Register table1 = c_rarg4;
 6990     const Register table2 = c_rarg5;
 6991     const Register table3 = c_rarg6;
 6992     const Register tmp3 = c_rarg7;
 6993 
 6994     BLOCK_COMMENT("Entry:");
 6995     __ enter(); // required for proper stackwalking of RuntimeStub frame
 6996 
 6997     __ kernel_crc32(crc, buf, len,
 6998               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 6999 
 7000     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7001     __ ret(lr);
 7002 
 7003     return start;
 7004   }
 7005 
 7006   /**
 7007    *  Arguments:
 7008    *
 7009    * Inputs:
 7010    *   c_rarg0   - int crc
 7011    *   c_rarg1   - byte* buf
 7012    *   c_rarg2   - int length
 7013    *   c_rarg3   - int* table
 7014    *
 7015    * Output:
 7016    *       r0   - int crc result
 7017    */
 7018   address generate_updateBytesCRC32C() {
 7019     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7020 
 7021     __ align(CodeEntryAlignment);
 7022     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id;
 7023     StubCodeMark mark(this, stub_id);
 7024 
 7025     address start = __ pc();
 7026 
 7027     const Register crc   = c_rarg0;  // crc
 7028     const Register buf   = c_rarg1;  // source java byte array address
 7029     const Register len   = c_rarg2;  // length
 7030     const Register table0 = c_rarg3; // crc_table address
 7031     const Register table1 = c_rarg4;
 7032     const Register table2 = c_rarg5;
 7033     const Register table3 = c_rarg6;
 7034     const Register tmp3 = c_rarg7;
 7035 
 7036     BLOCK_COMMENT("Entry:");
 7037     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7038 
 7039     __ kernel_crc32c(crc, buf, len,
 7040               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7041 
 7042     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7043     __ ret(lr);
 7044 
 7045     return start;
 7046   }
 7047 
 7048   /***
 7049    *  Arguments:
 7050    *
 7051    *  Inputs:
 7052    *   c_rarg0   - int   adler
 7053    *   c_rarg1   - byte* buff
 7054    *   c_rarg2   - int   len
 7055    *
 7056    * Output:
 7057    *   c_rarg0   - int adler result
 7058    */
 7059   address generate_updateBytesAdler32() {
 7060     __ align(CodeEntryAlignment);
 7061     StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
 7062     StubCodeMark mark(this, stub_id);
 7063     address start = __ pc();
 7064 
 7065     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7066 
 7067     // Aliases
 7068     Register adler  = c_rarg0;
 7069     Register s1     = c_rarg0;
 7070     Register s2     = c_rarg3;
 7071     Register buff   = c_rarg1;
 7072     Register len    = c_rarg2;
 7073     Register nmax  = r4;
 7074     Register base  = r5;
 7075     Register count = r6;
 7076     Register temp0 = rscratch1;
 7077     Register temp1 = rscratch2;
 7078     FloatRegister vbytes = v0;
 7079     FloatRegister vs1acc = v1;
 7080     FloatRegister vs2acc = v2;
 7081     FloatRegister vtable = v3;
 7082 
 7083     // Max number of bytes we can process before having to take the mod
 7084     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7085     uint64_t BASE = 0xfff1;
 7086     uint64_t NMAX = 0x15B0;
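          // n.b. the mod-BASE reductions below use the identity
          // 2^16 mod BASE == 15: a value x is folded as
          // 15 * (x >> 16) + (x & 0xffff) (the lsl-by-4/sub pairs compute the
          // multiply by 15), and a conditional subtract of BASE completes the
          // reduction.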
 7087 
 7088     __ mov(base, BASE);
 7089     __ mov(nmax, NMAX);
 7090 
 7091     // Load accumulation coefficients for the upper 16 bits
 7092     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7093     __ ld1(vtable, __ T16B, Address(temp0));
 7094 
 7095     // s1 is initialized to the lower 16 bits of adler
 7096     // s2 is initialized to the upper 16 bits of adler
 7097     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7098     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7099 
 7100     // The pipelined loop needs at least 16 elements for 1 iteration.
 7101     // It checks this itself, but it is cheaper to branch straight to the cleanup loop here
 7102     __ cmp(len, (u1)16);
 7103     __ br(Assembler::HS, L_nmax);
 7104     __ cbz(len, L_combine);
 7105 
 7106     __ bind(L_simple_by1_loop);
 7107     __ ldrb(temp0, Address(__ post(buff, 1)));
 7108     __ add(s1, s1, temp0);
 7109     __ add(s2, s2, s1);
 7110     __ subs(len, len, 1);
 7111     __ br(Assembler::HI, L_simple_by1_loop);
 7112 
 7113     // s1 = s1 % BASE
 7114     __ subs(temp0, s1, base);
 7115     __ csel(s1, temp0, s1, Assembler::HS);
 7116 
 7117     // s2 = s2 % BASE
 7118     __ lsr(temp0, s2, 16);
 7119     __ lsl(temp1, temp0, 4);
 7120     __ sub(temp1, temp1, temp0);
 7121     __ add(s2, temp1, s2, ext::uxth);
 7122 
 7123     __ subs(temp0, s2, base);
 7124     __ csel(s2, temp0, s2, Assembler::HS);
 7125 
 7126     __ b(L_combine);
 7127 
 7128     __ bind(L_nmax);
 7129     __ subs(len, len, nmax);
 7130     __ sub(count, nmax, 16);
 7131     __ br(Assembler::LO, L_by16);
 7132 
 7133     __ bind(L_nmax_loop);
 7134 
 7135     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7136                                       vbytes, vs1acc, vs2acc, vtable);
 7137 
 7138     __ subs(count, count, 16);
 7139     __ br(Assembler::HS, L_nmax_loop);
 7140 
 7141     // s1 = s1 % BASE
 7142     __ lsr(temp0, s1, 16);
 7143     __ lsl(temp1, temp0, 4);
 7144     __ sub(temp1, temp1, temp0);
 7145     __ add(temp1, temp1, s1, ext::uxth);
 7146 
 7147     __ lsr(temp0, temp1, 16);
 7148     __ lsl(s1, temp0, 4);
 7149     __ sub(s1, s1, temp0);
 7150     __ add(s1, s1, temp1, ext::uxth);
 7151 
 7152     __ subs(temp0, s1, base);
 7153     __ csel(s1, temp0, s1, Assembler::HS);
 7154 
 7155     // s2 = s2 % BASE
 7156     __ lsr(temp0, s2, 16);
 7157     __ lsl(temp1, temp0, 4);
 7158     __ sub(temp1, temp1, temp0);
 7159     __ add(temp1, temp1, s2, ext::uxth);
 7160 
 7161     __ lsr(temp0, temp1, 16);
 7162     __ lsl(s2, temp0, 4);
 7163     __ sub(s2, s2, temp0);
 7164     __ add(s2, s2, temp1, ext::uxth);
 7165 
 7166     __ subs(temp0, s2, base);
 7167     __ csel(s2, temp0, s2, Assembler::HS);
 7168 
 7169     __ subs(len, len, nmax);
 7170     __ sub(count, nmax, 16);
 7171     __ br(Assembler::HS, L_nmax_loop);
 7172 
 7173     __ bind(L_by16);
 7174     __ adds(len, len, count);
 7175     __ br(Assembler::LO, L_by1);
 7176 
 7177     __ bind(L_by16_loop);
 7178 
 7179     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7180                                       vbytes, vs1acc, vs2acc, vtable);
 7181 
 7182     __ subs(len, len, 16);
 7183     __ br(Assembler::HS, L_by16_loop);
 7184 
 7185     __ bind(L_by1);
 7186     __ adds(len, len, 15);
 7187     __ br(Assembler::LO, L_do_mod);
 7188 
 7189     __ bind(L_by1_loop);
 7190     __ ldrb(temp0, Address(__ post(buff, 1)));
 7191     __ add(s1, temp0, s1);
 7192     __ add(s2, s2, s1);
 7193     __ subs(len, len, 1);
 7194     __ br(Assembler::HS, L_by1_loop);
 7195 
 7196     __ bind(L_do_mod);
 7197     // s1 = s1 % BASE
 7198     __ lsr(temp0, s1, 16);
 7199     __ lsl(temp1, temp0, 4);
 7200     __ sub(temp1, temp1, temp0);
 7201     __ add(temp1, temp1, s1, ext::uxth);
 7202 
 7203     __ lsr(temp0, temp1, 16);
 7204     __ lsl(s1, temp0, 4);
 7205     __ sub(s1, s1, temp0);
 7206     __ add(s1, s1, temp1, ext::uxth);
 7207 
 7208     __ subs(temp0, s1, base);
 7209     __ csel(s1, temp0, s1, Assembler::HS);
 7210 
 7211     // s2 = s2 % BASE
 7212     __ lsr(temp0, s2, 16);
 7213     __ lsl(temp1, temp0, 4);
 7214     __ sub(temp1, temp1, temp0);
 7215     __ add(temp1, temp1, s2, ext::uxth);
 7216 
 7217     __ lsr(temp0, temp1, 16);
 7218     __ lsl(s2, temp0, 4);
 7219     __ sub(s2, s2, temp0);
 7220     __ add(s2, s2, temp1, ext::uxth);
 7221 
 7222     __ subs(temp0, s2, base);
 7223     __ csel(s2, temp0, s2, Assembler::HS);
 7224 
 7225     // Combine lower bits and higher bits
 7226     __ bind(L_combine);
 7227     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7228 
 7229     __ ret(lr);
 7230 
 7231     return start;
 7232   }
 7233 
 7234   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7235           Register temp0, Register temp1, FloatRegister vbytes,
 7236           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7237     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7238     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7239     // In non-vectorized code, we update s1 and s2 as:
 7240     //   s1 <- s1 + b1
 7241     //   s2 <- s2 + s1
 7242     //   s1 <- s1 + b2
 7243     //   s2 <- s2 + s1
 7244     //   ...
 7245     //   s1 <- s1 + b16
 7246     //   s2 <- s2 + s1
 7247     // Putting above assignments together, we have:
 7248     //   s1_new = s1 + b1 + b2 + ... + b16
 7249     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7250     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7251     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 7252     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7253 
 7254     // s2 = s2 + s1 * 16
 7255     __ add(s2, s2, s1, Assembler::LSL, 4);
 7256 
 7257     // vs1acc = b1 + b2 + b3 + ... + b16
 7258     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7259     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7260     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7261     __ uaddlv(vs1acc, __ T16B, vbytes);
 7262     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7263 
 7264     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7265     __ fmovd(temp0, vs1acc);
 7266     __ fmovd(temp1, vs2acc);
 7267     __ add(s1, s1, temp0);
 7268     __ add(s2, s2, temp1);
 7269   }
 7270 
 7271   /**
 7272    *  Arguments:
 7273    *
 7274    *  Input:
 7275    *    c_rarg0   - x address
 7276    *    c_rarg1   - x length
 7277    *    c_rarg2   - y address
 7278    *    c_rarg3   - y length
 7279    *    c_rarg4   - z address
 7280    */
 7281   address generate_multiplyToLen() {
 7282     __ align(CodeEntryAlignment);
 7283     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
 7284     StubCodeMark mark(this, stub_id);
 7285 
 7286     address start = __ pc();
 7287  
 7288     if (AOTCodeCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) {
 7289       return start;
 7290     }
 7291     const Register x     = r0;
 7292     const Register xlen  = r1;
 7293     const Register y     = r2;
 7294     const Register ylen  = r3;
 7295     const Register z     = r4;
 7296 
 7297     const Register tmp0  = r5;
 7298     const Register tmp1  = r10;
 7299     const Register tmp2  = r11;
 7300     const Register tmp3  = r12;
 7301     const Register tmp4  = r13;
 7302     const Register tmp5  = r14;
 7303     const Register tmp6  = r15;
 7304     const Register tmp7  = r16;
 7305 
 7306     BLOCK_COMMENT("Entry:");
 7307     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7308     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7309     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7310     __ ret(lr);
 7311 
 7312     AOTCodeCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start);
 7313     return start;
 7314   }
 7315 
 7316   address generate_squareToLen() {
 7317     // The squareToLen algorithm for sizes 1..127 described in the Java code works
 7318     // faster than multiply_to_len on some CPUs and slower on others, but
 7319     // multiply_to_len shows slightly better results overall
 7320     __ align(CodeEntryAlignment);
 7321     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
 7322     StubCodeMark mark(this, stub_id);
 7323     address start = __ pc();
 7324 
 7325     if (AOTCodeCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) {
 7326       return start;
 7327     }
 7328     const Register x     = r0;
 7329     const Register xlen  = r1;
 7330     const Register z     = r2;
 7331     const Register y     = r4; // == x
 7332     const Register ylen  = r5; // == xlen
 7333 
 7334     const Register tmp0  = r3;
 7335     const Register tmp1  = r10;
 7336     const Register tmp2  = r11;
 7337     const Register tmp3  = r12;
 7338     const Register tmp4  = r13;
 7339     const Register tmp5  = r14;
 7340     const Register tmp6  = r15;
 7341     const Register tmp7  = r16;
 7342 
 7343     RegSet spilled_regs = RegSet::of(y, ylen);
 7344     BLOCK_COMMENT("Entry:");
 7345     __ enter();
 7346     __ push(spilled_regs, sp);
 7347     __ mov(y, x);
 7348     __ mov(ylen, xlen);
 7349     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7350     __ pop(spilled_regs, sp);
 7351     __ leave();
 7352     __ ret(lr);
 7353 
 7354     AOTCodeCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start);
 7355     return start;
 7356   }
 7357 
 7358   address generate_mulAdd() {
 7359     __ align(CodeEntryAlignment);
 7360     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
 7361     StubCodeMark mark(this, stub_id);
 7362 
 7363     address start = __ pc();
 7364 
 7365     if (AOTCodeCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) {
 7366       return start;
 7367     }
 7368     const Register out     = r0;
 7369     const Register in      = r1;
 7370     const Register offset  = r2;
 7371     const Register len     = r3;
 7372     const Register k       = r4;
 7373 
 7374     BLOCK_COMMENT("Entry:");
 7375     __ enter();
 7376     __ mul_add(out, in, offset, len, k);
 7377     __ leave();
 7378     __ ret(lr);
 7379 
 7380     AOTCodeCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start);
 7381     return start;
 7382   }
 7383 
 7384   // Arguments:
 7385   //
 7386   // Input:
 7387   //   c_rarg0   - newArr address
 7388   //   c_rarg1   - oldArr address
 7389   //   c_rarg2   - newIdx
 7390   //   c_rarg3   - shiftCount
 7391   //   c_rarg4   - numIter
 7392   //
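        // A scalar sketch of what this stub computes (our reading of the shift
        // worker, assuming 0 < shiftCount < 32):
        //   for (int i = 0; i < numIter; i++) {
        //     newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
        //                        | (oldArr[i] << (32 - shiftCount));
        //   }
        // The SIMD loop below walks from the highest index downwards four
        // words at a time, with two-word and single-word scalar tails.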
 7393   address generate_bigIntegerRightShift() {
 7394     __ align(CodeEntryAlignment);
 7395     StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
 7396     StubCodeMark mark(this, stub_id);
 7397     address start = __ pc();
 7398 
 7399     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7400 
 7401     Register newArr        = c_rarg0;
 7402     Register oldArr        = c_rarg1;
 7403     Register newIdx        = c_rarg2;
 7404     Register shiftCount    = c_rarg3;
 7405     Register numIter       = c_rarg4;
 7406     Register idx           = numIter;
 7407 
 7408     Register newArrCur     = rscratch1;
 7409     Register shiftRevCount = rscratch2;
 7410     Register oldArrCur     = r13;
 7411     Register oldArrNext    = r14;
 7412 
 7413     FloatRegister oldElem0        = v0;
 7414     FloatRegister oldElem1        = v1;
 7415     FloatRegister newElem         = v2;
 7416     FloatRegister shiftVCount     = v3;
 7417     FloatRegister shiftVRevCount  = v4;
 7418 
 7419     __ cbz(idx, Exit);
 7420 
 7421     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7422 
 7423     // left shift count
 7424     __ movw(shiftRevCount, 32);
 7425     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7426 
 7427     // numIter too small to allow a 4-word SIMD loop; fall back to the scalar tail
 7428     __ cmp(numIter, (u1)4);
 7429     __ br(Assembler::LT, ShiftThree);
 7430 
 7431     __ dup(shiftVCount,    __ T4S, shiftCount);
 7432     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7433     __ negr(shiftVCount,   __ T4S, shiftVCount);
 7434 
 7435     __ BIND(ShiftSIMDLoop);
 7436 
 7437     // Calculate the load addresses
 7438     __ sub(idx, idx, 4);
 7439     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7440     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7441     __ add(oldArrCur,  oldArrNext, 4);
 7442 
 7443     // Load 4 words and process
 7444     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7445     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7446     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7447     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7448     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7449     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7450 
 7451     __ cmp(idx, (u1)4);
 7452     __ br(Assembler::LT, ShiftTwoLoop);
 7453     __ b(ShiftSIMDLoop);
 7454 
 7455     __ BIND(ShiftTwoLoop);
 7456     __ cbz(idx, Exit);
 7457     __ cmp(idx, (u1)1);
 7458     __ br(Assembler::EQ, ShiftOne);
 7459 
 7460     // Calculate the load addresses
 7461     __ sub(idx, idx, 2);
 7462     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7463     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7464     __ add(oldArrCur,  oldArrNext, 4);
 7465 
 7466     // Load 2 words and process
 7467     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 7468     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 7469     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 7470     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 7471     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 7472     __ st1(newElem,   __ T2S, Address(newArrCur));
 7473     __ b(ShiftTwoLoop);
 7474 
 7475     __ BIND(ShiftThree);
 7476     __ tbz(idx, 1, ShiftOne);
 7477     __ tbz(idx, 0, ShiftTwo);
 7478     __ ldrw(r10,  Address(oldArr, 12));
 7479     __ ldrw(r11,  Address(oldArr, 8));
 7480     __ lsrvw(r10, r10, shiftCount);
 7481     __ lslvw(r11, r11, shiftRevCount);
 7482     __ orrw(r12,  r10, r11);
 7483     __ strw(r12,  Address(newArr, 8));
 7484 
 7485     __ BIND(ShiftTwo);
 7486     __ ldrw(r10,  Address(oldArr, 8));
 7487     __ ldrw(r11,  Address(oldArr, 4));
 7488     __ lsrvw(r10, r10, shiftCount);
 7489     __ lslvw(r11, r11, shiftRevCount);
 7490     __ orrw(r12,  r10, r11);
 7491     __ strw(r12,  Address(newArr, 4));
 7492 
 7493     __ BIND(ShiftOne);
 7494     __ ldrw(r10,  Address(oldArr, 4));
 7495     __ ldrw(r11,  Address(oldArr));
 7496     __ lsrvw(r10, r10, shiftCount);
 7497     __ lslvw(r11, r11, shiftRevCount);
 7498     __ orrw(r12,  r10, r11);
 7499     __ strw(r12,  Address(newArr));
 7500 
 7501     __ BIND(Exit);
 7502     __ ret(lr);
 7503 
 7504     return start;
 7505   }
 7506 
 7507   // Arguments:
 7508   //
 7509   // Input:
 7510   //   c_rarg0   - newArr address
 7511   //   c_rarg1   - oldArr address
 7512   //   c_rarg2   - newIdx
 7513   //   c_rarg3   - shiftCount
 7514   //   c_rarg4   - numIter
 7515   //
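        // A scalar sketch of what this stub computes (our reading of the shift
        // worker, assuming 0 < shiftCount < 32):
        //   for (int i = 0; i < numIter; i++) {
        //     newArr[newIdx + i] = (oldArr[i] << shiftCount)
        //                        | (oldArr[i + 1] >>> (32 - shiftCount));
        //   }
        // Here the loop walks upwards from index 0, again four words at a
        // time with two-word and single-word scalar tails.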
 7516   address generate_bigIntegerLeftShift() {
 7517     __ align(CodeEntryAlignment);
 7518     StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
 7519     StubCodeMark mark(this, stub_id);
 7520     address start = __ pc();
 7521 
 7522     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7523 
 7524     Register newArr        = c_rarg0;
 7525     Register oldArr        = c_rarg1;
 7526     Register newIdx        = c_rarg2;
 7527     Register shiftCount    = c_rarg3;
 7528     Register numIter       = c_rarg4;
 7529 
 7530     Register shiftRevCount = rscratch1;
 7531     Register oldArrNext    = rscratch2;
 7532 
 7533     FloatRegister oldElem0        = v0;
 7534     FloatRegister oldElem1        = v1;
 7535     FloatRegister newElem         = v2;
 7536     FloatRegister shiftVCount     = v3;
 7537     FloatRegister shiftVRevCount  = v4;
 7538 
 7539     __ cbz(numIter, Exit);
 7540 
 7541     __ add(oldArrNext, oldArr, 4);
 7542     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7543 
 7544     // right shift count
 7545     __ movw(shiftRevCount, 32);
 7546     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7547 
 7548     // numIter too small to allow a 4-word SIMD loop; fall back to the scalar tail
 7549     __ cmp(numIter, (u1)4);
 7550     __ br(Assembler::LT, ShiftThree);
 7551 
 7552     __ dup(shiftVCount,     __ T4S, shiftCount);
 7553     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 7554     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 7555 
 7556     __ BIND(ShiftSIMDLoop);
 7557 
 7558     // load 4 words and process
 7559     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 7560     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 7561     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7562     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7563     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7564     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 7565     __ sub(numIter,   numIter, 4);
 7566 
 7567     __ cmp(numIter, (u1)4);
 7568     __ br(Assembler::LT, ShiftTwoLoop);
 7569     __ b(ShiftSIMDLoop);
 7570 
 7571     __ BIND(ShiftTwoLoop);
 7572     __ cbz(numIter, Exit);
 7573     __ cmp(numIter, (u1)1);
 7574     __ br(Assembler::EQ, ShiftOne);
 7575 
 7576     // load 2 words and process
 7577     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 7578     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 7579     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 7580     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 7581     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 7582     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 7583     __ sub(numIter,   numIter, 2);
 7584     __ b(ShiftTwoLoop);
 7585 
 7586     __ BIND(ShiftThree);
 7587     __ ldrw(r10,  __ post(oldArr, 4));
 7588     __ ldrw(r11,  __ post(oldArrNext, 4));
 7589     __ lslvw(r10, r10, shiftCount);
 7590     __ lsrvw(r11, r11, shiftRevCount);
 7591     __ orrw(r12,  r10, r11);
 7592     __ strw(r12,  __ post(newArr, 4));
 7593     __ tbz(numIter, 1, Exit);
 7594     __ tbz(numIter, 0, ShiftOne);
 7595 
 7596     __ BIND(ShiftTwo);
 7597     __ ldrw(r10,  __ post(oldArr, 4));
 7598     __ ldrw(r11,  __ post(oldArrNext, 4));
 7599     __ lslvw(r10, r10, shiftCount);
 7600     __ lsrvw(r11, r11, shiftRevCount);
 7601     __ orrw(r12,  r10, r11);
 7602     __ strw(r12,  __ post(newArr, 4));
 7603 
 7604     __ BIND(ShiftOne);
 7605     __ ldrw(r10,  Address(oldArr));
 7606     __ ldrw(r11,  Address(oldArrNext));
 7607     __ lslvw(r10, r10, shiftCount);
 7608     __ lsrvw(r11, r11, shiftRevCount);
 7609     __ orrw(r12,  r10, r11);
 7610     __ strw(r12,  Address(newArr));
 7611 
 7612     __ BIND(Exit);
 7613     __ ret(lr);
 7614 
 7615     return start;
 7616   }
 7617 
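        // count_positives(ary1 = r1, len = r2) returns, in result (r0), a count
        // of leading bytes of ary1 known to be positive (top bit clear). If no
        // negative byte is found the full len is returned; otherwise the count
        // is truncated to the start of the block in which the first negative
        // byte was seen (possibly 0). Callers are expected to tolerate such a
        // conservative count (a sketch of the contract as we read the code
        // below, not a normative statement).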
 7618   address generate_count_positives(address &count_positives_long) {
 7619     const u1 large_loop_size = 64;
 7620     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 7621     int dcache_line = VM_Version::dcache_line_size();
 7622 
 7623     Register ary1 = r1, len = r2, result = r0;
 7624 
 7625     __ align(CodeEntryAlignment);
 7626 
 7627     StubGenStubId stub_id = StubGenStubId::count_positives_id;
 7628     StubCodeMark mark(this, stub_id);
 7629 
 7630     address entry = __ pc();
 7631 
 7632     __ enter();
 7633     // precondition: a copy of len is already in result
 7634     // __ mov(result, len);
 7635 
 7636   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 7637         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 7638 
 7639   __ cmp(len, (u1)15);
 7640   __ br(Assembler::GT, LEN_OVER_15);
 7641   // The only case in which execution falls into this code is when the pointer is
 7642   // near the end of a memory page and we have to avoid reading the next page
 7643   __ add(ary1, ary1, len);
 7644   __ subs(len, len, 8);
 7645   __ br(Assembler::GT, LEN_OVER_8);
 7646   __ ldr(rscratch2, Address(ary1, -8));
 7647   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 7648   __ lsrv(rscratch2, rscratch2, rscratch1);
 7649   __ tst(rscratch2, UPPER_BIT_MASK);
 7650   __ csel(result, zr, result, Assembler::NE);
 7651   __ leave();
 7652   __ ret(lr);
 7653   __ bind(LEN_OVER_8);
 7654   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 7655   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
 7656   __ tst(rscratch2, UPPER_BIT_MASK);
 7657   __ br(Assembler::NE, RET_NO_POP);
 7658   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 7659   __ lsrv(rscratch1, rscratch1, rscratch2);
 7660   __ tst(rscratch1, UPPER_BIT_MASK);
 7661   __ bind(RET_NO_POP);
 7662   __ csel(result, zr, result, Assembler::NE);
 7663   __ leave();
 7664   __ ret(lr);
 7665 
 7666   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 7667   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 7668 
 7669   count_positives_long = __ pc(); // 2nd entry point
 7670 
 7671   __ enter();
 7672 
 7673   __ bind(LEN_OVER_15);
 7674     __ push(spilled_regs, sp);
 7675     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 7676     __ cbz(rscratch2, ALIGNED);
 7677     __ ldp(tmp6, tmp1, Address(ary1));
 7678     __ mov(tmp5, 16);
 7679     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 7680     __ add(ary1, ary1, rscratch1);
 7681     __ orr(tmp6, tmp6, tmp1);
 7682     __ tst(tmp6, UPPER_BIT_MASK);
 7683     __ br(Assembler::NE, RET_ADJUST);
 7684     __ sub(len, len, rscratch1);
 7685 
 7686   __ bind(ALIGNED);
 7687     __ cmp(len, large_loop_size);
 7688     __ br(Assembler::LT, CHECK_16);
 7689     // Perform a 16-byte load as an early return in the pre-loop to handle the
 7690     // situation where an initially aligned large array has negative values in its
 7691     // starting bytes, in which case LARGE_LOOP would do 4 reads instead of 1 (in
 7692     // the worst case), which is slower. Cases with negative bytes further ahead
 7693     // won't be affected much. In fact, they'll be faster due to the early loads and
 7694     // the fewer instructions and branches in LARGE_LOOP.
 7695     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 7696     __ sub(len, len, 16);
 7697     __ orr(tmp6, tmp6, tmp1);
 7698     __ tst(tmp6, UPPER_BIT_MASK);
 7699     __ br(Assembler::NE, RET_ADJUST_16);
 7700     __ cmp(len, large_loop_size);
 7701     __ br(Assembler::LT, CHECK_16);
 7702 
 7703     if (SoftwarePrefetchHintDistance >= 0
 7704         && SoftwarePrefetchHintDistance >= dcache_line) {
 7705       // initial prefetch
 7706       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 7707     }
 7708   __ bind(LARGE_LOOP);
 7709     if (SoftwarePrefetchHintDistance >= 0) {
 7710       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 7711     }
 7712     // Issue the load instructions first, since this can save a few CPU/MEM cycles.
 7713     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp),
 7714     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
 7715     // 3 instructions and has fewer branches; the downside is that this disables the
 7716     // early return, so all 64 bytes are loaded and checked every time.
 7717     __ ldp(tmp2, tmp3, Address(ary1));
 7718     __ ldp(tmp4, tmp5, Address(ary1, 16));
 7719     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 7720     __ ldp(tmp6, tmp1, Address(ary1, 48));
 7721     __ add(ary1, ary1, large_loop_size);
 7722     __ sub(len, len, large_loop_size);
 7723     __ orr(tmp2, tmp2, tmp3);
 7724     __ orr(tmp4, tmp4, tmp5);
 7725     __ orr(rscratch1, rscratch1, rscratch2);
 7726     __ orr(tmp6, tmp6, tmp1);
 7727     __ orr(tmp2, tmp2, tmp4);
 7728     __ orr(rscratch1, rscratch1, tmp6);
 7729     __ orr(tmp2, tmp2, rscratch1);
 7730     __ tst(tmp2, UPPER_BIT_MASK);
 7731     __ br(Assembler::NE, RET_ADJUST_LONG);
 7732     __ cmp(len, large_loop_size);
 7733     __ br(Assembler::GE, LARGE_LOOP);
 7734 
 7735   __ bind(CHECK_16); // small 16-byte load pre-loop
 7736     __ cmp(len, (u1)16);
 7737     __ br(Assembler::LT, POST_LOOP16);
 7738 
 7739   __ bind(LOOP16); // small 16-byte load loop
 7740     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 7741     __ sub(len, len, 16);
 7742     __ orr(tmp2, tmp2, tmp3);
 7743     __ tst(tmp2, UPPER_BIT_MASK);
 7744     __ br(Assembler::NE, RET_ADJUST_16);
 7745     __ cmp(len, (u1)16);
 7746     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 7747 
 7748   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 7749     __ cmp(len, (u1)8);
 7750     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 7751     __ ldr(tmp3, Address(__ post(ary1, 8)));
 7752     __ tst(tmp3, UPPER_BIT_MASK);
 7753     __ br(Assembler::NE, RET_ADJUST);
 7754     __ sub(len, len, 8);
 7755 
 7756   __ bind(POST_LOOP16_LOAD_TAIL);
 7757     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 7758     __ ldr(tmp1, Address(ary1));
 7759     __ mov(tmp2, 64);
 7760     __ sub(tmp4, tmp2, len, __ LSL, 3);
 7761     __ lslv(tmp1, tmp1, tmp4);
 7762     __ tst(tmp1, UPPER_BIT_MASK);
 7763     __ br(Assembler::NE, RET_ADJUST);
 7764     // Fallthrough
 7765 
 7766   __ bind(RET_LEN);
 7767     __ pop(spilled_regs, sp);
 7768     __ leave();
 7769     __ ret(lr);
 7770 
 7771     // the difference result - len is the count of bytes that are
 7772     // guaranteed to be positive
 7773 
 7774   __ bind(RET_ADJUST_LONG);
 7775     __ add(len, len, (u1)(large_loop_size - 16));
 7776   __ bind(RET_ADJUST_16);
 7777     __ add(len, len, 16);
 7778   __ bind(RET_ADJUST);
 7779     __ pop(spilled_regs, sp);
 7780     __ leave();
 7781     __ sub(result, result, len);
 7782     __ ret(lr);
 7783 
 7784     return entry;
 7785   }
 7786 
 7787   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 7788         bool usePrefetch, Label &NOT_EQUAL) {
 7789     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 7790         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 7791         tmp7 = r12, tmp8 = r13;
 7792     Label LOOP;
 7793 
 7794     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 7795     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 7796     __ bind(LOOP);
 7797     if (usePrefetch) {
 7798       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 7799       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 7800     }
 7801     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 7802     __ eor(tmp1, tmp1, tmp2);
 7803     __ eor(tmp3, tmp3, tmp4);
 7804     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 7805     __ orr(tmp1, tmp1, tmp3);
 7806     __ cbnz(tmp1, NOT_EQUAL);
 7807     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 7808     __ eor(tmp5, tmp5, tmp6);
 7809     __ eor(tmp7, tmp7, tmp8);
 7810     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 7811     __ orr(tmp5, tmp5, tmp7);
 7812     __ cbnz(tmp5, NOT_EQUAL);
 7813     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 7814     __ eor(tmp1, tmp1, tmp2);
 7815     __ eor(tmp3, tmp3, tmp4);
 7816     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 7817     __ orr(tmp1, tmp1, tmp3);
 7818     __ cbnz(tmp1, NOT_EQUAL);
 7819     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 7820     __ eor(tmp5, tmp5, tmp6);
 7821     __ sub(cnt1, cnt1, 8 * wordSize);
 7822     __ eor(tmp7, tmp7, tmp8);
 7823     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 7824     // tmp6 is not used. MacroAssembler::subs is used here (rather than
 7825     // cmp) because subs allows an unlimited range of immediate operand.
 7826     __ subs(tmp6, cnt1, loopThreshold);
 7827     __ orr(tmp5, tmp5, tmp7);
 7828     __ cbnz(tmp5, NOT_EQUAL);
 7829     __ br(__ GE, LOOP);
 7830     // post-loop
 7831     __ eor(tmp1, tmp1, tmp2);
 7832     __ eor(tmp3, tmp3, tmp4);
 7833     __ orr(tmp1, tmp1, tmp3);
 7834     __ sub(cnt1, cnt1, 2 * wordSize);
 7835     __ cbnz(tmp1, NOT_EQUAL);
 7836   }
 7837 
 7838   void generate_large_array_equals_loop_simd(int loopThreshold,
 7839         bool usePrefetch, Label &NOT_EQUAL) {
 7840     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 7841         tmp2 = rscratch2;
 7842     Label LOOP;
 7843 
 7844     __ bind(LOOP);
 7845     if (usePrefetch) {
 7846       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 7847       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 7848     }
 7849     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 7850     __ sub(cnt1, cnt1, 8 * wordSize);
 7851     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 7852     __ subs(tmp1, cnt1, loopThreshold);
 7853     __ eor(v0, __ T16B, v0, v4);
 7854     __ eor(v1, __ T16B, v1, v5);
 7855     __ eor(v2, __ T16B, v2, v6);
 7856     __ eor(v3, __ T16B, v3, v7);
 7857     __ orr(v0, __ T16B, v0, v1);
 7858     __ orr(v1, __ T16B, v2, v3);
 7859     __ orr(v0, __ T16B, v0, v1);
 7860     __ umov(tmp1, v0, __ D, 0);
 7861     __ umov(tmp2, v0, __ D, 1);
 7862     __ orr(tmp1, tmp1, tmp2);
 7863     __ cbnz(tmp1, NOT_EQUAL);
 7864     __ br(__ GE, LOOP);
 7865   }
 7866 
 7867   // a1 = r1 - array1 address
 7868   // a2 = r2 - array2 address
 7869   // result = r0 - return value. Already contains "false"
 7870   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 7871   // r3-r5 are reserved temporary registers
 7872   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
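        // The first wordSize bytes are handled outside of this stub (hence the
        // sub(cnt1, cnt1, wordSize) below); the code here compares the
        // remaining bytes 8 at a time (64 at a time in the large loops) and
        // leaves true/false in result.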
 7873   address generate_large_array_equals() {
 7874     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 7875         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 7876         tmp7 = r12, tmp8 = r13;
 7877     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 7878         SMALL_LOOP, POST_LOOP;
 7879     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 7880     // calculate if at least 32 prefetched bytes are used
 7881     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 7882     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 7883     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 7884     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 7885         tmp5, tmp6, tmp7, tmp8);
 7886 
 7887     __ align(CodeEntryAlignment);
 7888 
 7889     StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
 7890     StubCodeMark mark(this, stub_id);
 7891 
 7892     address entry = __ pc();
 7893     __ enter();
 7894     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 7895     // also advance pointers to use post-increment instead of pre-increment
 7896     __ add(a1, a1, wordSize);
 7897     __ add(a2, a2, wordSize);
 7898     if (AvoidUnalignedAccesses) {
 7899       // both implementations (SIMD/nonSIMD) use relatively large load
 7900       // instructions (ld1/ldp), which incur a huge penalty (up to 2x exec time)
 7901       // on some CPUs when the address is not at least 16-byte aligned.
 7902       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
 7903       // load if needed for the 1st address to make it 16-byte aligned.
 7904       Label ALIGNED16;
 7905       __ tbz(a1, 3, ALIGNED16);
 7906       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 7907       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 7908       __ sub(cnt1, cnt1, wordSize);
 7909       __ eor(tmp1, tmp1, tmp2);
 7910       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 7911       __ bind(ALIGNED16);
 7912     }
 7913     if (UseSIMDForArrayEquals) {
 7914       if (SoftwarePrefetchHintDistance >= 0) {
 7915         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 7916         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 7917         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 7918             /* prfm = */ true, NOT_EQUAL);
 7919         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 7920         __ br(__ LT, TAIL);
 7921       }
 7922       __ bind(NO_PREFETCH_LARGE_LOOP);
 7923       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 7924           /* prfm = */ false, NOT_EQUAL);
 7925     } else {
 7926       __ push(spilled_regs, sp);
 7927       if (SoftwarePrefetchHintDistance >= 0) {
 7928         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 7929         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 7930         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 7931             /* prfm = */ true, NOT_EQUAL);
 7932         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 7933         __ br(__ LT, TAIL);
 7934       }
 7935       __ bind(NO_PREFETCH_LARGE_LOOP);
 7936       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 7937           /* prfm = */ false, NOT_EQUAL);
 7938     }
 7939     __ bind(TAIL);
 7940       __ cbz(cnt1, EQUAL);
 7941       __ subs(cnt1, cnt1, wordSize);
 7942       __ br(__ LE, POST_LOOP);
 7943     __ bind(SMALL_LOOP);
 7944       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 7945       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 7946       __ subs(cnt1, cnt1, wordSize);
 7947       __ eor(tmp1, tmp1, tmp2);
 7948       __ cbnz(tmp1, NOT_EQUAL);
 7949       __ br(__ GT, SMALL_LOOP);
 7950     __ bind(POST_LOOP);
 7951       __ ldr(tmp1, Address(a1, cnt1));
 7952       __ ldr(tmp2, Address(a2, cnt1));
 7953       __ eor(tmp1, tmp1, tmp2);
 7954       __ cbnz(tmp1, NOT_EQUAL);
 7955     __ bind(EQUAL);
 7956       __ mov(result, true);
 7957     __ bind(NOT_EQUAL);
 7958       if (!UseSIMDForArrayEquals) {
 7959         __ pop(spilled_regs, sp);
 7960       }
 7961     __ bind(NOT_EQUAL_NO_POP);
 7962     __ leave();
 7963     __ ret(lr);
 7964     return entry;
 7965   }
 7966 
 7967   // result = r0 - return value. Contains initial hashcode value on entry.
 7968   // ary = r1 - array address
 7969   // cnt = r2 - elements count
 7970   // Clobbers: v0-v13, rscratch1, rscratch2
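        // In scalar terms this computes the usual polynomial hash, seeded with
        // the incoming result value (a sketch of the equivalent Java loop):
        //   for (int i = 0; i < cnt; i++) result = 31 * result + ary[i];
        // The vectorized loops keep 4 (small loop) or 16 (large loop) lane
        // accumulators, scale them by the appropriate power of 31 each
        // iteration, and fold them together with vpow = <31^3, 31^2, 31, 1> in
        // the epilogues; the computed branch at TAIL handles the final
        // cnt % vf elements.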
 7971   address generate_large_arrays_hashcode(BasicType eltype) {
 7972     const Register result = r0, ary = r1, cnt = r2;
 7973     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 7974     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 7975     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 7976     const FloatRegister vpowm = v13;
 7977 
 7978     ARRAYS_HASHCODE_REGISTERS;
 7979 
 7980     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 7981 
 7982     unsigned int vf; // vectorization factor
 7983     bool multiply_by_halves;
 7984     Assembler::SIMD_Arrangement load_arrangement;
 7985     switch (eltype) {
 7986     case T_BOOLEAN:
 7987     case T_BYTE:
 7988       load_arrangement = Assembler::T8B;
 7989       multiply_by_halves = true;
 7990       vf = 8;
 7991       break;
 7992     case T_CHAR:
 7993     case T_SHORT:
 7994       load_arrangement = Assembler::T8H;
 7995       multiply_by_halves = true;
 7996       vf = 8;
 7997       break;
 7998     case T_INT:
 7999       load_arrangement = Assembler::T4S;
 8000       multiply_by_halves = false;
 8001       vf = 4;
 8002       break;
 8003     default:
 8004       ShouldNotReachHere();
 8005     }
 8006 
 8007     // Unroll factor
 8008     const unsigned uf = 4;
 8009 
 8010     // Effective vectorization factor
 8011     const unsigned evf = vf * uf;
 8012 
 8013     __ align(CodeEntryAlignment);
 8014 
 8015     StubGenStubId stub_id;
 8016     switch (eltype) {
 8017     case T_BOOLEAN:
 8018       stub_id = StubGenStubId::large_arrays_hashcode_boolean_id;
 8019       break;
 8020     case T_BYTE:
 8021       stub_id = StubGenStubId::large_arrays_hashcode_byte_id;
 8022       break;
 8023     case T_CHAR:
 8024       stub_id = StubGenStubId::large_arrays_hashcode_char_id;
 8025       break;
 8026     case T_SHORT:
 8027       stub_id = StubGenStubId::large_arrays_hashcode_short_id;
 8028       break;
 8029     case T_INT:
 8030       stub_id = StubGenStubId::large_arrays_hashcode_int_id;
 8031       break;
 8032     default:
 8033       stub_id = StubGenStubId::NO_STUBID;
 8034       ShouldNotReachHere();
 8035     };
 8036 
 8037     StubCodeMark mark(this, stub_id);
 8038 
 8039     address entry = __ pc();
 8040     __ enter();
 8041 
 8042     // Put the 0th to 3rd powers of 31 into a single SIMD register. The register will be used in
 8043     // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
 8044     // value shouldn't change throughout both loops.
 8045     __ movw(rscratch1, intpow(31U, 3));
 8046     __ mov(vpow, Assembler::S, 0, rscratch1);
 8047     __ movw(rscratch1, intpow(31U, 2));
 8048     __ mov(vpow, Assembler::S, 1, rscratch1);
 8049     __ movw(rscratch1, intpow(31U, 1));
 8050     __ mov(vpow, Assembler::S, 2, rscratch1);
 8051     __ movw(rscratch1, intpow(31U, 0));
 8052     __ mov(vpow, Assembler::S, 3, rscratch1);
 8053 
 8054     __ mov(vmul0, Assembler::T16B, 0);
 8055     __ mov(vmul0, Assembler::S, 3, result);
 8056 
 8057     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8058     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8059 
 8060     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8061     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8062 
 8063     // SMALL LOOP
 8064     __ bind(SMALL_LOOP);
 8065 
 8066     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8067     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8068     __ subsw(rscratch2, rscratch2, vf);
 8069 
 8070     if (load_arrangement == Assembler::T8B) {
 8071       // Extend 8B to 8H to be able to use vector multiply
 8072       // instructions
 8073       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8074       if (is_signed_subword_type(eltype)) {
 8075         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8076       } else {
 8077         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8078       }
 8079     }
 8080 
 8081     switch (load_arrangement) {
 8082     case Assembler::T4S:
 8083       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8084       break;
 8085     case Assembler::T8B:
 8086     case Assembler::T8H:
 8087       assert(is_subword_type(eltype), "subword type expected");
 8088       if (is_signed_subword_type(eltype)) {
 8089         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8090       } else {
 8091         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8092       }
 8093       break;
 8094     default:
 8095       __ should_not_reach_here();
 8096     }
 8097 
 8098     // Process the upper half of a vector
 8099     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8100       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8101       if (is_signed_subword_type(eltype)) {
 8102         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8103       } else {
 8104         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8105       }
 8106     }
 8107 
 8108     __ br(Assembler::HI, SMALL_LOOP);
 8109 
 8110     // SMALL LOOP'S EPILOGUE
 8111     __ lsr(rscratch2, cnt, exact_log2(evf));
 8112     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8113 
 8114     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8115     __ addv(vmul0, Assembler::T4S, vmul0);
 8116     __ umov(result, vmul0, Assembler::S, 0);
 8117 
 8118     // TAIL
 8119     __ bind(TAIL);
 8120 
 8121     // The andr computes cnt % vf. The subtract, shifted left by 3, offsets the branch target past the
 8122     // first vf - 1 - (cnt % vf) pairs of load + madd insns, i.e. only cnt % vf load + madd pairs execute.
 8123     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
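          // For example, with vf == 8 and cnt % vf == 3 the branch lands
          // 3 * 8 bytes before BR_BASE (each load + madd pair encodes to two
          // 4-byte instructions), so only the last 3 pairs execute.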
 8124     __ andr(rscratch2, cnt, vf - 1);
 8125     __ bind(TAIL_SHORTCUT);
 8126     __ adr(rscratch1, BR_BASE);
 8127     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
 8128     __ movw(rscratch2, 0x1f);
 8129     __ br(rscratch1);
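          // The computed branch above jumps into the middle of the unrolled tail that
          // follows, Duff's-device style: each load + madd pair below is 8 bytes of
          // code, so branching back (cnt % vf) pairs from BR_BASE executes exactly
          // that many pairs. A hedged scalar sketch (reference only, not emitted):
          //
          //   for (size_t i = 0; i < cnt % vf; i++) {
          //     result = 31 * result + ary[i];   // one load + one maddw per element
          //   }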
 8130 
 8131     for (size_t i = 0; i < vf - 1; ++i) {
 8132       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8133                                    eltype);
 8134       __ maddw(result, result, rscratch2, rscratch1);
 8135     }
 8136     __ bind(BR_BASE);
 8137 
 8138     __ leave();
 8139     __ ret(lr);
 8140 
 8141     // LARGE LOOP
 8142     __ bind(LARGE_LOOP_PREHEADER);
 8143 
 8144     __ lsr(rscratch2, cnt, exact_log2(evf));
 8145 
 8146     if (multiply_by_halves) {
 8147       // 31^4 - multiplier between lower and upper parts of a register
 8148       __ movw(rscratch1, intpow(31U, vf / 2));
 8149       __ mov(vpowm, Assembler::S, 1, rscratch1);
 8150       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 8151       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8152       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8153     } else {
 8154       // 31^16
 8155       __ movw(rscratch1, intpow(31U, evf));
 8156       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8157     }
 8158 
 8159     __ mov(vmul3, Assembler::T16B, 0);
 8160     __ mov(vmul2, Assembler::T16B, 0);
 8161     __ mov(vmul1, Assembler::T16B, 0);
 8162 
 8163     __ bind(LARGE_LOOP);
 8164 
 8165     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8166     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8167     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8168     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8169 
 8170     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8171            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8172 
 8173     if (load_arrangement == Assembler::T8B) {
 8174       // Extend 8B to 8H to be able to use vector multiply
 8175       // instructions
 8176       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8177       if (is_signed_subword_type(eltype)) {
 8178         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8179         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8180         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8181         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8182       } else {
 8183         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8184         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8185         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8186         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8187       }
 8188     }
 8189 
 8190     switch (load_arrangement) {
 8191     case Assembler::T4S:
 8192       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8193       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8194       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8195       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8196       break;
 8197     case Assembler::T8B:
 8198     case Assembler::T8H:
 8199       assert(is_subword_type(eltype), "subword type expected");
 8200       if (is_signed_subword_type(eltype)) {
 8201         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8202         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8203         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8204         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8205       } else {
 8206         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8207         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8208         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8209         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8210       }
 8211       break;
 8212     default:
 8213       __ should_not_reach_here();
 8214     }
 8215 
 8216     // Process the upper half of a vector
 8217     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8218       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8219       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8220       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8221       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8222       if (is_signed_subword_type(eltype)) {
 8223         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8224         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8225         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8226         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8227       } else {
 8228         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8229         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8230         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8231         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8232       }
 8233     }
 8234 
 8235     __ subsw(rscratch2, rscratch2, 1);
 8236     __ br(Assembler::HI, LARGE_LOOP);
 8237 
 8238     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8239     __ addv(vmul3, Assembler::T4S, vmul3);
 8240     __ umov(result, vmul3, Assembler::S, 0);
 8241 
 8242     __ mov(rscratch2, intpow(31U, vf));
 8243 
 8244     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8245     __ addv(vmul2, Assembler::T4S, vmul2);
 8246     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8247     __ maddw(result, result, rscratch2, rscratch1);
 8248 
 8249     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8250     __ addv(vmul1, Assembler::T4S, vmul1);
 8251     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8252     __ maddw(result, result, rscratch2, rscratch1);
 8253 
 8254     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8255     __ addv(vmul0, Assembler::T4S, vmul0);
 8256     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8257     __ maddw(result, result, rscratch2, rscratch1);
 8258 
 8259     __ andr(rscratch2, cnt, vf - 1);
 8260     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8261 
 8262     __ leave();
 8263     __ ret(lr);
 8264 
 8265     return entry;
 8266   }
 8267 
 8268   address generate_dsin_dcos(bool isCos) {
 8269     __ align(CodeEntryAlignment);
 8270     StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id);
 8271     StubCodeMark mark(this, stub_id);
 8272     address start = __ pc();
 8273     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8274         (address)StubRoutines::aarch64::_two_over_pi,
 8275         (address)StubRoutines::aarch64::_pio2,
 8276         (address)StubRoutines::aarch64::_dsin_coef,
 8277         (address)StubRoutines::aarch64::_dcos_coef);
 8278     return start;
 8279   }
 8280 
 8281   // code for comparing 16 characters of a Latin1-encoded string with a UTF-16-encoded string
 8282   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8283       Label &DIFF2) {
 8284     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8285     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8286 
 8287     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8288     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8289     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8290     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
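          // The zip1 above and the zip2 below interleave the Latin1 bytes with the
          // zero bytes of vtmpZ, which is exactly Latin1 -> UTF-16 inflation on a
          // little-endian layout. A hedged scalar sketch of that conversion
          // (reference only, not emitted code):
          //
          //   for (int i = 0; i < 16; i++) {
          //     utf16[i] = (jchar)(latin1[i] & 0xff);   // zero-extend each byte
          //   }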
 8291 
 8292     __ fmovd(tmpL, vtmp3);
 8293     __ eor(rscratch2, tmp3, tmpL);
 8294     __ cbnz(rscratch2, DIFF2);
 8295 
 8296     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8297     __ umov(tmpL, vtmp3, __ D, 1);
 8298     __ eor(rscratch2, tmpU, tmpL);
 8299     __ cbnz(rscratch2, DIFF1);
 8300 
 8301     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8302     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8303     __ fmovd(tmpL, vtmp);
 8304     __ eor(rscratch2, tmp3, tmpL);
 8305     __ cbnz(rscratch2, DIFF2);
 8306 
 8307     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8308     __ umov(tmpL, vtmp, __ D, 1);
 8309     __ eor(rscratch2, tmpU, tmpL);
 8310     __ cbnz(rscratch2, DIFF1);
 8311   }
 8312 
 8313   // r0  = result
 8314   // r1  = str1
 8315   // r2  = cnt1
 8316   // r3  = str2
 8317   // r4  = cnt2
 8318   // r10 = tmp1
 8319   // r11 = tmp2
 8320   address generate_compare_long_string_different_encoding(bool isLU) {
 8321     __ align(CodeEntryAlignment);
 8322     StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id);
 8323     StubCodeMark mark(this, stub_id);
 8324     address entry = __ pc();
 8325     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8326         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8327         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8328     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8329         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8330     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8331     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8332 
 8333     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8334 
 8335     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
 8336     // cnt2 == number of characters left to compare
 8337     // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
 8338     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8339     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8340     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8341     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 8342     __ subw(cnt2, cnt2, 8); // The first 4 symbols are already loaded. The last 4 are a special case.
 8343     __ eor(rscratch2, tmp1, tmp2);
 8344     __ mov(rscratch1, tmp2);
 8345     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8346     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8347              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8348     __ push(spilled_regs, sp);
 8349     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8350     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8351 
 8352     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8353 
 8354     if (SoftwarePrefetchHintDistance >= 0) {
 8355       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8356       __ br(__ LT, NO_PREFETCH);
 8357       __ bind(LARGE_LOOP_PREFETCH);
 8358         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8359         __ mov(tmp4, 2);
 8360         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8361         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8362           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8363           __ subs(tmp4, tmp4, 1);
 8364           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8365           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8366           __ mov(tmp4, 2);
 8367         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8368           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8369           __ subs(tmp4, tmp4, 1);
 8370           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8371           __ sub(cnt2, cnt2, 64);
 8372           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8373           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8374     }
 8375     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8376     __ bind(NO_PREFETCH);
 8377     __ subs(cnt2, cnt2, 16);
 8378     __ br(__ LT, TAIL);
 8379     __ align(OptoLoopAlignment);
 8380     __ bind(SMALL_LOOP); // smaller loop
 8381       __ subs(cnt2, cnt2, 16);
 8382       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8383       __ br(__ GE, SMALL_LOOP);
 8384       __ cmn(cnt2, (u1)16);
 8385       __ br(__ EQ, LOAD_LAST);
 8386     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8387       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8388       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8389       __ ldr(tmp3, Address(cnt1, -8));
 8390       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8391       __ b(LOAD_LAST);
 8392     __ bind(DIFF2);
 8393       __ mov(tmpU, tmp3);
 8394     __ bind(DIFF1);
 8395       __ pop(spilled_regs, sp);
 8396       __ b(CALCULATE_DIFFERENCE);
 8397     __ bind(LOAD_LAST);
 8398       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
 8399       // No need to load them again.
 8400       __ mov(tmpU, tmp3);
 8401       __ pop(spilled_regs, sp);
 8402 
 8403       // tmp2 points to the address of the last 4 Latin1 characters right now
 8404       __ ldrs(vtmp, Address(tmp2));
 8405       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8406       __ fmovd(tmpL, vtmp);
 8407 
 8408       __ eor(rscratch2, tmpU, tmpL);
 8409       __ cbz(rscratch2, DONE);
 8410 
 8411     // Find the first different characters in the longwords and
 8412     // compute their difference.
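          // rscratch2 holds the (non-zero) XOR of the two 8-byte chunks. Reversing its
          // bytes and counting leading zeros gives the bit offset of the lowest
          // differing byte; rounding down to a multiple of 16 lands on the differing
          // UTF-16 code unit. A hedged C sketch, with a and b standing for the two
          // inflated 8-byte chunks (reference only, not emitted code):
          //
          //   uint64_t diff = a ^ b;                                   // known non-zero
          //   int bit = __builtin_clzll(__builtin_bswap64(diff)) & ~15;
          //   int result = (int)(uint16_t)(a >> bit) - (int)(uint16_t)(b >> bit);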
 8413     __ bind(CALCULATE_DIFFERENCE);
 8414       __ rev(rscratch2, rscratch2);
 8415       __ clz(rscratch2, rscratch2);
 8416       __ andr(rscratch2, rscratch2, -16);
 8417       __ lsrv(tmp1, tmp1, rscratch2);
 8418       __ uxthw(tmp1, tmp1);
 8419       __ lsrv(rscratch1, rscratch1, rscratch2);
 8420       __ uxthw(rscratch1, rscratch1);
 8421       __ subw(result, tmp1, rscratch1);
 8422     __ bind(DONE);
 8423       __ ret(lr);
 8424     return entry;
 8425   }
 8426 
 8427   // r0 = input (float16)
 8428   // v0 = result (float)
 8429   // v1 = temporary float register
 8430   address generate_float16ToFloat() {
 8431     __ align(CodeEntryAlignment);
 8432     StubGenStubId stub_id = StubGenStubId::hf2f_id;
 8433     StubCodeMark mark(this, stub_id);
 8434     address entry = __ pc();
 8435     BLOCK_COMMENT("Entry:");
 8436     __ flt16_to_flt(v0, r0, v1);
 8437     __ ret(lr);
 8438     return entry;
 8439   }
 8440 
 8441   // v0 = input (float)
 8442   // r0 = result (float16)
 8443   // v1 = temporary float register
 8444   address generate_floatToFloat16() {
 8445     __ align(CodeEntryAlignment);
 8446     StubGenStubId stub_id = StubGenStubId::f2hf_id;
 8447     StubCodeMark mark(this, stub_id);
 8448     address entry = __ pc();
 8449     BLOCK_COMMENT("Entry:");
 8450     __ flt_to_flt16(r0, v0, v1);
 8451     __ ret(lr);
 8452     return entry;
 8453   }
 8454 
 8455   address generate_method_entry_barrier() {
 8456     __ align(CodeEntryAlignment);
 8457     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
 8458     StubCodeMark mark(this, stub_id);
 8459 
 8460     Label deoptimize_label;
 8461 
 8462     address start = __ pc();
 8463 
 8464     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 8465 
 8466     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 8467       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 8468       // We can get here despite the nmethod being good, if we have not
 8469       // yet applied our cross modification fence (or data fence).
 8470       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 8471       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 8472       __ ldrw(rscratch2, rscratch2);
 8473       __ strw(rscratch2, thread_epoch_addr);
 8474       __ isb();
 8475       __ membar(__ LoadLoad);
 8476     }
 8477 
 8478     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 8479 
 8480     __ enter();
 8481     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 8482 
 8483     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 8484 
 8485     __ push_call_clobbered_registers();
 8486 
 8487     __ mov(c_rarg0, rscratch2);
 8488     __ call_VM_leaf
 8489          (CAST_FROM_FN_PTR
 8490           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 8491 
 8492     __ reset_last_Java_frame(true);
 8493 
 8494     __ mov(rscratch1, r0);
 8495 
 8496     __ pop_call_clobbered_registers();
 8497 
 8498     __ cbnz(rscratch1, deoptimize_label);
 8499 
 8500     __ leave();
 8501     __ ret(lr);
 8502 
 8503     __ BIND(deoptimize_label);
 8504 
 8505     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 8506     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 8507 
 8508     __ mov(sp, rscratch1);
 8509     __ br(rscratch2);
 8510 
 8511     return start;
 8512   }
 8513 
 8514   // r0  = result
 8515   // r1  = str1
 8516   // r2  = cnt1
 8517   // r3  = str2
 8518   // r4  = cnt2
 8519   // r10 = tmp1
 8520   // r11 = tmp2
 8521   address generate_compare_long_string_same_encoding(bool isLL) {
 8522     __ align(CodeEntryAlignment);
 8523     StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id);
 8524     StubCodeMark mark(this, stub_id);
 8525     address entry = __ pc();
 8526     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8527         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 8528 
 8529     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 8530 
 8531     // exit from the large loop when fewer than 64 bytes are left to read or when the
 8532     // next prefetch would reach beyond the array border
 8533     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 8534 
 8535     // the caller has already pre-loaded 8 bytes from each string before jumping to the stub, so compare them directly
 8536     __ eor(rscratch2, tmp1, tmp2);
 8537     __ cbnz(rscratch2, CAL_DIFFERENCE);
 8538 
 8539     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 8540     // update pointers, because of previous read
 8541     __ add(str1, str1, wordSize);
 8542     __ add(str2, str2, wordSize);
 8543     if (SoftwarePrefetchHintDistance >= 0) {
 8544       __ align(OptoLoopAlignment);
 8545       __ bind(LARGE_LOOP_PREFETCH);
 8546         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 8547         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 8548 
 8549         for (int i = 0; i < 4; i++) {
 8550           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 8551           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 8552           __ cmp(tmp1, tmp2);
 8553           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 8554           __ br(Assembler::NE, DIFF);
 8555         }
 8556         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 8557         __ add(str1, str1, 64);
 8558         __ add(str2, str2, 64);
 8559         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 8560         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 8561         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 8562     }
 8563 
 8564     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 8565     __ br(Assembler::LE, LESS16);
 8566     __ align(OptoLoopAlignment);
 8567     __ bind(LOOP_COMPARE16);
 8568       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 8569       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 8570       __ cmp(tmp1, tmp2);
 8571       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 8572       __ br(Assembler::NE, DIFF);
 8573       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 8574       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 8575       __ br(Assembler::LT, LESS16);
 8576 
 8577       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 8578       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 8579       __ cmp(tmp1, tmp2);
 8580       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 8581       __ br(Assembler::NE, DIFF);
 8582       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 8583       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 8584       __ br(Assembler::GE, LOOP_COMPARE16);
 8585       __ cbz(cnt2, LENGTH_DIFF);
 8586 
 8587     __ bind(LESS16);
 8588       // compare the next 8 bytes
 8589       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 8590       __ br(Assembler::LE, LESS8);
 8591       __ ldr(tmp1, Address(__ post(str1, 8)));
 8592       __ ldr(tmp2, Address(__ post(str2, 8)));
 8593       __ eor(rscratch2, tmp1, tmp2);
 8594       __ cbnz(rscratch2, CAL_DIFFERENCE);
 8595       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 8596 
 8597     __ bind(LESS8); // directly load last 8 bytes
 8598       if (!isLL) {
 8599         __ add(cnt2, cnt2, cnt2);
 8600       }
 8601       __ ldr(tmp1, Address(str1, cnt2));
 8602       __ ldr(tmp2, Address(str2, cnt2));
 8603       __ eor(rscratch2, tmp1, tmp2);
 8604       __ cbz(rscratch2, LENGTH_DIFF);
 8605       __ b(CAL_DIFFERENCE);
 8606 
 8607     __ bind(DIFF);
 8608       __ cmp(tmp1, tmp2);
 8609       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 8610       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 8611       // reuse rscratch2 register for the result of eor instruction
 8612       __ eor(rscratch2, tmp1, tmp2);
 8613 
 8614     __ bind(CAL_DIFFERENCE);
 8615       __ rev(rscratch2, rscratch2);
 8616       __ clz(rscratch2, rscratch2);
 8617       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 8618       __ lsrv(tmp1, tmp1, rscratch2);
 8619       __ lsrv(tmp2, tmp2, rscratch2);
 8620       if (isLL) {
 8621         __ uxtbw(tmp1, tmp1);
 8622         __ uxtbw(tmp2, tmp2);
 8623       } else {
 8624         __ uxthw(tmp1, tmp1);
 8625         __ uxthw(tmp2, tmp2);
 8626       }
 8627       __ subw(result, tmp1, tmp2);
 8628 
 8629     __ bind(LENGTH_DIFF);
 8630       __ ret(lr);
 8631     return entry;
 8632   }
 8633 
 8634   enum string_compare_mode {
 8635     LL,
 8636     LU,
 8637     UL,
 8638     UU,
 8639   };
 8640 
 8641   // The following registers are declared in aarch64.ad
 8642   // r0  = result
 8643   // r1  = str1
 8644   // r2  = cnt1
 8645   // r3  = str2
 8646   // r4  = cnt2
 8647   // r10 = tmp1
 8648   // r11 = tmp2
 8649   // z0  = ztmp1
 8650   // z1  = ztmp2
 8651   // p0  = pgtmp1
 8652   // p1  = pgtmp2
 8653   address generate_compare_long_string_sve(string_compare_mode mode) {
 8654     StubGenStubId stub_id;
 8655     switch (mode) {
 8656       case LL: stub_id = StubGenStubId::compare_long_string_LL_id;  break;
 8657       case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break;
 8658       case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break;
 8659       case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break;
 8660       default: ShouldNotReachHere();
 8661     }
 8662 
 8663     __ align(CodeEntryAlignment);
 8664     address entry = __ pc();
 8665     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8666              tmp1 = r10, tmp2 = r11;
 8667 
 8668     Label LOOP, DONE, MISMATCH;
 8669     Register vec_len = tmp1;
 8670     Register idx = tmp2;
 8671     // The minimum of the string lengths has been stored in cnt2.
 8672     Register cnt = cnt2;
 8673     FloatRegister ztmp1 = z0, ztmp2 = z1;
 8674     PRegister pgtmp1 = p0, pgtmp2 = p1;
 8675 
 8676 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 8677     switch (mode) {                                                            \
 8678       case LL:                                                                 \
 8679         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 8680         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 8681         break;                                                                 \
 8682       case LU:                                                                 \
 8683         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 8684         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 8685         break;                                                                 \
 8686       case UL:                                                                 \
 8687         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 8688         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 8689         break;                                                                 \
 8690       case UU:                                                                 \
 8691         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 8692         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 8693         break;                                                                 \
 8694       default:                                                                 \
 8695         ShouldNotReachHere();                                                  \
 8696     }
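
          // A hedged sketch of the predicated loop shape generated below (reference
          // only, not emitted code):
          //
          //   pg = whilelt(0, cnt);                       // initial predicate
          //   for (idx = 0; ; ) {
          //     if (any_ne(pg, load1(idx), load2(idx))) goto MISMATCH;   // full vectors
          //     idx += vec_len;
          //     if (idx >= cnt - vec_len) break;
          //   }
          //   pg = whilelt(idx, cnt);                     // predicate for the tail
          //   if (any_ne(pg, load1(idx), load2(idx))) goto MISMATCH;
          //   goto DONE;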
 8697 
 8698     StubCodeMark mark(this, stub_id);
 8699 
 8700     __ mov(idx, 0);
 8701     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 8702 
 8703     if (mode == LL) {
 8704       __ sve_cntb(vec_len);
 8705     } else {
 8706       __ sve_cnth(vec_len);
 8707     }
 8708 
 8709     __ sub(rscratch1, cnt, vec_len);
 8710 
 8711     __ bind(LOOP);
 8712 
 8713       // main loop
 8714       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 8715       __ add(idx, idx, vec_len);
 8716       // Compare strings.
 8717       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 8718       __ br(__ NE, MISMATCH);
 8719       __ cmp(idx, rscratch1);
 8720       __ br(__ LT, LOOP);
 8721 
 8722     // post loop, last iteration
 8723     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 8724 
 8725     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 8726     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 8727     __ br(__ EQ, DONE);
 8728 
 8729     __ bind(MISMATCH);
 8730 
 8731     // Crop the vector to find its location.
 8732     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 8733     // Extract the first different characters of each string.
 8734     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 8735     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 8736 
 8737     // Compute the difference of the first different characters.
 8738     __ sub(result, rscratch1, rscratch2);
 8739 
 8740     __ bind(DONE);
 8741     __ ret(lr);
 8742 #undef LOAD_PAIR
 8743     return entry;
 8744   }
 8745 
 8746   void generate_compare_long_strings() {
 8747     if (UseSVE == 0) {
 8748       StubRoutines::aarch64::_compare_long_string_LL
 8749           = generate_compare_long_string_same_encoding(true);
 8750       StubRoutines::aarch64::_compare_long_string_UU
 8751           = generate_compare_long_string_same_encoding(false);
 8752       StubRoutines::aarch64::_compare_long_string_LU
 8753           = generate_compare_long_string_different_encoding(true);
 8754       StubRoutines::aarch64::_compare_long_string_UL
 8755           = generate_compare_long_string_different_encoding(false);
 8756     } else {
 8757       StubRoutines::aarch64::_compare_long_string_LL
 8758           = generate_compare_long_string_sve(LL);
 8759       StubRoutines::aarch64::_compare_long_string_UU
 8760           = generate_compare_long_string_sve(UU);
 8761       StubRoutines::aarch64::_compare_long_string_LU
 8762           = generate_compare_long_string_sve(LU);
 8763       StubRoutines::aarch64::_compare_long_string_UL
 8764           = generate_compare_long_string_sve(UL);
 8765     }
 8766   }
 8767 
 8768   // R0 = result
 8769   // R1 = str2
 8770   // R2 = cnt1
 8771   // R3 = str1
 8772   // R4 = cnt2
 8773   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 8774   //
 8775   // This generic linear code uses a few additional ideas that make it faster:
 8776   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
 8777   // in order to skip its initial loading (helps on systems with a single load pipeline)
 8778   // 2) we can use the "fast" algorithm for finding a single character to search for the
 8779   // first symbol with fewer branches (1 branch per loaded register instead of a branch
 8780   // per symbol); this is where constants like 0x0101...01, 0x00010001...0001,
 8781   // 0x7f7f...7f, 0x7fff7fff...7fff come from (see the SWAR sketch just below)
 8782   // 3) after loading and analyzing the 1st register of the source string, it can be
 8783   // reused to search for every occurrence of the 1st character, saving a few loads
 8784   // compared with a simpler-but-slower implementation
 8785   // 4) in order to avoid lots of push/pop operations, the code below heavily
 8786   // re-uses/re-initializes/compresses register values, which makes the code
 8787   // larger and a bit less readable; however, most of the extra operations are
 8788   // issued during loads or branches, so the penalty is minimal
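        //
        // A hedged C sketch of the SWAR "find the candidate character" trick the
        // constants above implement (LL case with 8-bit lanes shown; the UU/UL cases
        // use 16-bit lanes and the 0x0001... / 0x7fff... constants). Reference only,
        // not part of the emitted code:
        //
        //   bool chunk_has_char(uint64_t chunk, uint8_t ch) {
        //     uint64_t pattern = ch * 0x0101010101010101ULL;   // 'first' in the code
        //     uint64_t x = chunk ^ pattern;                    // zero byte <=> match
        //     // non-zero iff some byte of x is zero, i.e. some byte of chunk == ch
        //     return ((x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL)) != 0;
        //   }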
 8789   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 8790     StubGenStubId stub_id;
 8791     if (str1_isL) {
 8792       if (str2_isL) {
 8793         stub_id = StubGenStubId::string_indexof_linear_ll_id;
 8794       } else {
 8795         stub_id = StubGenStubId::string_indexof_linear_ul_id;
 8796       }
 8797     } else {
 8798       if (str2_isL) {
 8799         ShouldNotReachHere();
 8800       } else {
 8801         stub_id = StubGenStubId::string_indexof_linear_uu_id;
 8802       }
 8803     }
 8804     __ align(CodeEntryAlignment);
 8805     StubCodeMark mark(this, stub_id);
 8806     address entry = __ pc();
 8807 
 8808     int str1_chr_size = str1_isL ? 1 : 2;
 8809     int str2_chr_size = str2_isL ? 1 : 2;
 8810     int str1_chr_shift = str1_isL ? 0 : 1;
 8811     int str2_chr_shift = str2_isL ? 0 : 1;
 8812     bool isL = str1_isL && str2_isL;
 8813     // parameters
 8814     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 8815     // temporary registers
 8816     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 8817     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 8818     // redefinitions
 8819     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 8820 
 8821     __ push(spilled_regs, sp);
 8822     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 8823         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 8824         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 8825         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 8826         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 8827         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 8828     // Read whole register from str1. It is safe, because length >=8 here
 8829     __ ldr(ch1, Address(str1));
 8830     // Read whole register from str2. It is safe, because length >=8 here
 8831     __ ldr(ch2, Address(str2));
 8832     __ sub(cnt2, cnt2, cnt1);
 8833     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 8834     if (str1_isL != str2_isL) {
 8835       __ eor(v0, __ T16B, v0, v0);
 8836     }
 8837     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 8838     __ mul(first, first, tmp1);
 8839     // check if we have less than one register's worth of characters left to check
 8840     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 8841     if (str1_isL != str2_isL) {
 8842       __ fmovd(v1, ch1);
 8843     }
 8844     __ br(__ LE, L_SMALL);
 8845     __ eor(ch2, first, ch2);
 8846     if (str1_isL != str2_isL) {
 8847       __ zip1(v1, __ T16B, v1, v0);
 8848     }
 8849     __ sub(tmp2, ch2, tmp1);
 8850     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 8851     __ bics(tmp2, tmp2, ch2);
 8852     if (str1_isL != str2_isL) {
 8853       __ fmovd(ch1, v1);
 8854     }
 8855     __ br(__ NE, L_HAS_ZERO);
 8856     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 8857     __ add(result, result, wordSize/str2_chr_size);
 8858     __ add(str2, str2, wordSize);
 8859     __ br(__ LT, L_POST_LOOP);
 8860     __ BIND(L_LOOP);
 8861       __ ldr(ch2, Address(str2));
 8862       __ eor(ch2, first, ch2);
 8863       __ sub(tmp2, ch2, tmp1);
 8864       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 8865       __ bics(tmp2, tmp2, ch2);
 8866       __ br(__ NE, L_HAS_ZERO);
 8867     __ BIND(L_LOOP_PROCEED);
 8868       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 8869       __ add(str2, str2, wordSize);
 8870       __ add(result, result, wordSize/str2_chr_size);
 8871       __ br(__ GE, L_LOOP);
 8872     __ BIND(L_POST_LOOP);
 8873       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 8874       __ br(__ LE, NOMATCH);
 8875       __ ldr(ch2, Address(str2));
 8876       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 8877       __ eor(ch2, first, ch2);
 8878       __ sub(tmp2, ch2, tmp1);
 8879       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 8880       __ mov(tmp4, -1); // all bits set
 8881       __ b(L_SMALL_PROCEED);
 8882     __ align(OptoLoopAlignment);
 8883     __ BIND(L_SMALL);
 8884       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 8885       __ eor(ch2, first, ch2);
 8886       if (str1_isL != str2_isL) {
 8887         __ zip1(v1, __ T16B, v1, v0);
 8888       }
 8889       __ sub(tmp2, ch2, tmp1);
 8890       __ mov(tmp4, -1); // all bits set
 8891       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 8892       if (str1_isL != str2_isL) {
 8893         __ fmovd(ch1, v1); // move converted 4 symbols
 8894       }
 8895     __ BIND(L_SMALL_PROCEED);
 8896       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 8897       __ bic(tmp2, tmp2, ch2);
 8898       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 8899       __ rbit(tmp2, tmp2);
 8900       __ br(__ EQ, NOMATCH);
 8901     __ BIND(L_SMALL_HAS_ZERO_LOOP);
 8902       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 8903       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 8904       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 8905       if (str2_isL) { // LL
 8906         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 8907         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 8908         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 8909         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 8910         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 8911       } else {
 8912         __ mov(ch2, 0xE); // all bits in byte set except last one
 8913         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 8914         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 8915         __ lslv(tmp2, tmp2, tmp4);
 8916         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8917         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8918         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 8919         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8920       }
 8921       __ cmp(ch1, ch2);
 8922       __ mov(tmp4, wordSize/str2_chr_size);
 8923       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 8924     __ BIND(L_SMALL_CMP_LOOP);
 8925       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 8926                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 8927       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 8928                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 8929       __ add(tmp4, tmp4, 1);
 8930       __ cmp(tmp4, cnt1);
 8931       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 8932       __ cmp(first, ch2);
 8933       __ br(__ EQ, L_SMALL_CMP_LOOP);
 8934     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 8935       __ cbz(tmp2, NOMATCH); // no more matches. exit
 8936       __ clz(tmp4, tmp2);
 8937       __ add(result, result, 1); // advance index
 8938       __ add(str2, str2, str2_chr_size); // advance pointer
 8939       __ b(L_SMALL_HAS_ZERO_LOOP);
 8940     __ align(OptoLoopAlignment);
 8941     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 8942       __ cmp(first, ch2);
 8943       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 8944       __ b(DONE);
 8945     __ align(OptoLoopAlignment);
 8946     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 8947       if (str2_isL) { // LL
 8948         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 8949         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 8950         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 8951         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 8952         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 8953       } else {
 8954         __ mov(ch2, 0xE); // all bits in byte set except last one
 8955         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 8956         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 8957         __ lslv(tmp2, tmp2, tmp4);
 8958         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8959         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8960         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 8961         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8962       }
 8963       __ cmp(ch1, ch2);
 8964       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 8965       __ b(DONE);
 8966     __ align(OptoLoopAlignment);
 8967     __ BIND(L_HAS_ZERO);
 8968       __ rbit(tmp2, tmp2);
 8969       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 8970       // Now compress the counters (cnt2 and cnt1) into one register.
 8971       // This is fine because both counters are 32-bit and are not changed in this
 8972       // loop; just restore them on exit. So, cnt1 can be re-used in this loop.
 8973       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 8974       __ sub(result, result, 1);
 8975     __ BIND(L_HAS_ZERO_LOOP);
 8976       __ mov(cnt1, wordSize/str2_chr_size);
 8977       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 8978       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 8979       if (str2_isL) {
 8980         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 8981         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 8982         __ lslv(tmp2, tmp2, tmp4);
 8983         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8984         __ add(tmp4, tmp4, 1);
 8985         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8986         __ lsl(tmp2, tmp2, 1);
 8987         __ mov(tmp4, wordSize/str2_chr_size);
 8988       } else {
 8989         __ mov(ch2, 0xE);
 8990         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 8991         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 8992         __ lslv(tmp2, tmp2, tmp4);
 8993         __ add(tmp4, tmp4, 1);
 8994         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 8995         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 8996         __ lsl(tmp2, tmp2, 1);
 8997         __ mov(tmp4, wordSize/str2_chr_size);
 8998         __ sub(str2, str2, str2_chr_size);
 8999       }
 9000       __ cmp(ch1, ch2);
 9001       __ mov(tmp4, wordSize/str2_chr_size);
 9002       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9003     __ BIND(L_CMP_LOOP);
 9004       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9005                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9006       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9007                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9008       __ add(tmp4, tmp4, 1);
 9009       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9010       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9011       __ cmp(cnt1, ch2);
 9012       __ br(__ EQ, L_CMP_LOOP);
 9013     __ BIND(L_CMP_LOOP_NOMATCH);
 9014       // no match at this position
 9015       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9016       __ clz(tmp4, tmp2);
 9017       __ add(str2, str2, str2_chr_size); // advance pointer
 9018       __ b(L_HAS_ZERO_LOOP);
 9019     __ align(OptoLoopAlignment);
 9020     __ BIND(L_CMP_LOOP_LAST_CMP);
 9021       __ cmp(cnt1, ch2);
 9022       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9023       __ b(DONE);
 9024     __ align(OptoLoopAlignment);
 9025     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9026       if (str2_isL) {
 9027         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9028         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9029         __ lslv(tmp2, tmp2, tmp4);
 9030         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9031         __ add(tmp4, tmp4, 1);
 9032         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9033         __ lsl(tmp2, tmp2, 1);
 9034       } else {
 9035         __ mov(ch2, 0xE);
 9036         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9037         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9038         __ lslv(tmp2, tmp2, tmp4);
 9039         __ add(tmp4, tmp4, 1);
 9040         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9041         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9042         __ lsl(tmp2, tmp2, 1);
 9043         __ sub(str2, str2, str2_chr_size);
 9044       }
 9045       __ cmp(ch1, ch2);
 9046       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9047       __ b(DONE);
 9048     __ align(OptoLoopAlignment);
 9049     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
 9050       // 1) Restore the "result" index. The index was a multiple of wordSize/str2_chr_size
 9051       // until the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
 9052       // so result was increased by at most wordSize/str2_chr_size - 1 and the
 9053       // respective high bits weren't changed. L_LOOP_PROCEED will increase
 9054       // result by the number of analyzed characters, so we can just reset the lower
 9055       // bits of result here: clear the 2 lower bits for UU/UL and 3 bits for LL.
 9056       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
 9057       // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the index of
 9058       // the last analyzed substring inside the current octet, so str2 is at the
 9059       // respective start address. We need to advance it to the next octet.
 9060       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9061       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9062       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9063       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9064       __ movw(cnt2, cnt2);
 9065       __ b(L_LOOP_PROCEED);
 9066     __ align(OptoLoopAlignment);
 9067     __ BIND(NOMATCH);
 9068       __ mov(result, -1);
 9069     __ BIND(DONE);
 9070       __ pop(spilled_regs, sp);
 9071       __ ret(lr);
 9072     return entry;
 9073   }
 9074 
 9075   void generate_string_indexof_stubs() {
 9076     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9077     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9078     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9079   }
 9080 
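        // Inflate 32 Latin1 bytes held in src1:src2 into 64 bytes of UTF-16 by
        // interleaving them with the zero bytes of v0 (zip1/zip2), then store the
        // result with a single st1. A hedged scalar equivalent (reference only,
        // not emitted code):
        //
        //   for (int i = 0; i < 32; i++) {
        //     dst[i] = (jchar)(src[i] & 0xff);
        //   }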
 9081   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9082       FloatRegister src1, FloatRegister src2) {
 9083     Register dst = r1;
 9084     __ zip1(v1, __ T16B, src1, v0);
 9085     __ zip2(v2, __ T16B, src1, v0);
 9086     if (generatePrfm) {
 9087       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9088     }
 9089     __ zip1(v3, __ T16B, src2, v0);
 9090     __ zip2(v4, __ T16B, src2, v0);
 9091     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9092   }
 9093 
 9094   // R0 = src
 9095   // R1 = dst
 9096   // R2 = len
 9097   // R3 = len >> 3
 9098   // V0 = 0
 9099   // v1 = loaded 8 bytes
 9100   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 9101   address generate_large_byte_array_inflate() {
 9102     __ align(CodeEntryAlignment);
 9103     StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id;
 9104     StubCodeMark mark(this, stub_id);
 9105     address entry = __ pc();
 9106     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9107     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9108     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9109 
 9110     // do one more 8-byte read so that the address is 16-byte aligned in most cases;
 9111     // this also lets us use a single store instruction
 9112     __ ldrd(v2, __ post(src, 8));
 9113     __ sub(octetCounter, octetCounter, 2);
 9114     __ zip1(v1, __ T16B, v1, v0);
 9115     __ zip1(v2, __ T16B, v2, v0);
 9116     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9117     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9118     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9119     __ br(__ LE, LOOP_START);
 9120     __ b(LOOP_PRFM_START);
 9121     __ bind(LOOP_PRFM);
 9122       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9123     __ bind(LOOP_PRFM_START);
 9124       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9125       __ sub(octetCounter, octetCounter, 8);
 9126       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9127       inflate_and_store_2_fp_registers(true, v3, v4);
 9128       inflate_and_store_2_fp_registers(true, v5, v6);
 9129       __ br(__ GT, LOOP_PRFM);
 9130       __ cmp(octetCounter, (u1)8);
 9131       __ br(__ LT, DONE);
 9132     __ bind(LOOP);
 9133       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9134       __ bind(LOOP_START);
 9135       __ sub(octetCounter, octetCounter, 8);
 9136       __ cmp(octetCounter, (u1)8);
 9137       inflate_and_store_2_fp_registers(false, v3, v4);
 9138       inflate_and_store_2_fp_registers(false, v5, v6);
 9139       __ br(__ GE, LOOP);
 9140     __ bind(DONE);
 9141       __ ret(lr);
 9142     return entry;
 9143   }
 9144 
 9145   /**
 9146    *  Arguments:
 9147    *
 9148    *  Input:
 9149    *  c_rarg0   - current state address
 9150    *  c_rarg1   - H key address
 9151    *  c_rarg2   - data address
 9152    *  c_rarg3   - number of blocks
 9153    *
 9154    *  Output:
 9155    *  Updated state at c_rarg0
 9156    */
 9157   address generate_ghash_processBlocks() {
 9158     // Bafflingly, GCM uses little-endian for the byte order, but
 9159     // big-endian for the bit order.  For example, the polynomial 1 is
 9160     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9161     //
 9162     // So, we must either reverse the bytes in each word and do
 9163     // everything big-endian or reverse the bits in each byte and do
 9164     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9165     // the bits in each byte (we have an instruction, RBIT, to do
 9166     // that) and keep the data in little-endian bit order through the
 9167     // calculation, bit-reversing the inputs and outputs.
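          //
          // Per block, the loop below computes the standard GHASH recurrence over
          // GF(2^128) (a hedged summary of the helpers it calls):
          //
          //   state <- (state ^ block) * H   (mod x^128 + x^7 + x^2 + x + 1)
          //
          // ghash_multiply forms the 256-bit carry-less product (using Karatsuba,
          // hence the precomputed A1+A0 term in v4) and ghash_reduce folds it back
          // to 128 bits using the field polynomial loaded from 'p'.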
 9168 
 9169     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
 9170     StubCodeMark mark(this, stub_id);
 9171     __ align(wordSize * 2);
 9172     address p = __ pc();
 9173     __ emit_int64(0x87);  // The low-order bits of the field
 9174                           // polynomial (i.e. p = z^7+z^2+z+1)
 9175                           // repeated in the low and high parts of a
 9176                           // 128-bit vector
 9177     __ emit_int64(0x87);
 9178 
 9179     __ align(CodeEntryAlignment);
 9180     address start = __ pc();
 9181 
 9182     Register state   = c_rarg0;
 9183     Register subkeyH = c_rarg1;
 9184     Register data    = c_rarg2;
 9185     Register blocks  = c_rarg3;
 9186 
 9187     FloatRegister vzr = v30;
 9188     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9189 
 9190     __ ldrq(v24, p);    // The field polynomial
 9191 
 9192     __ ldrq(v0, Address(state));
 9193     __ ldrq(v1, Address(subkeyH));
 9194 
 9195     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9196     __ rbit(v0, __ T16B, v0);
 9197     __ rev64(v1, __ T16B, v1);
 9198     __ rbit(v1, __ T16B, v1);
 9199 
 9200     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 9201     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 9202 
 9203     {
 9204       Label L_ghash_loop;
 9205       __ bind(L_ghash_loop);
 9206 
 9207       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9208                                                  // reversing each byte
 9209       __ rbit(v2, __ T16B, v2);
 9210       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9211 
 9212       // Multiply state in v2 by subkey in v1
 9213       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9214                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9215                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9216       // Reduce v7:v5 by the field polynomial
 9217       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9218 
 9219       __ sub(blocks, blocks, 1);
 9220       __ cbnz(blocks, L_ghash_loop);
 9221     }
 9222 
 9223     // The bit-reversed result is at this point in v0
 9224     __ rev64(v0, __ T16B, v0);
 9225     __ rbit(v0, __ T16B, v0);
 9226 
 9227     __ st1(v0, __ T16B, state);
 9228     __ ret(lr);
 9229 
 9230     return start;
 9231   }
 9232 
 9233   address generate_ghash_processBlocks_wide() {
 9234     address small = generate_ghash_processBlocks();
 9235 
 9236     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id;
 9237     StubCodeMark mark(this, stub_id);
 9238     __ align(wordSize * 2);
 9239     address p = __ pc();
 9240     __ emit_int64(0x87);  // The low-order bits of the field
 9241                           // polynomial (i.e. p = z^7+z^2+z+1)
 9242                           // repeated in the low and high parts of a
 9243                           // 128-bit vector
 9244     __ emit_int64(0x87);
 9245 
 9246     __ align(CodeEntryAlignment);
 9247     address start = __ pc();
 9248 
 9249     Register state   = c_rarg0;
 9250     Register subkeyH = c_rarg1;
 9251     Register data    = c_rarg2;
 9252     Register blocks  = c_rarg3;
 9253 
 9254     const int unroll = 4;
 9255 
 9256     __ cmp(blocks, (unsigned char)(unroll * 2));
 9257     __ br(__ LT, small);
 9258 
 9259     if (unroll > 1) {
 9260       // Save the callee-saved SIMD registers (v8..v15) before entering the routine
 9261       __ sub(sp, sp, 4 * 16);
 9262       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9263       __ sub(sp, sp, 4 * 16);
 9264       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9265     }
 9266 
 9267     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
 9268 
 9269     if (unroll > 1) {
 9270       // And restore state
 9271       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9272       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9273     }
 9274 
 9275     __ cmp(blocks, (unsigned char)0);
 9276     __ br(__ GT, small);
 9277 
 9278     __ ret(lr);
 9279 
 9280     return start;
 9281   }
 9282 
 9283   void generate_base64_encode_simdround(Register src, Register dst,
 9284         FloatRegister codec, u8 size) {
 9285 
 9286     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9287     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9288     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 9289 
 9290     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9291 
 9292     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
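          // The shift/or sequence below splits each 3-byte group (de-interleaved by
          // the ld3 above into in0/in1/in2) into four 6-bit codec indices. A hedged
          // scalar sketch of the same split (reference only, not emitted code):
          //
          //   ind0 =   b0 >> 2;
          //   ind1 = ((b0 & 0x03) << 4) | (b1 >> 4);
          //   ind2 = ((b1 & 0x0f) << 2) | (b2 >> 6);
          //   ind3 =   b2 & 0x3f;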
 9293 
 9294     __ ushr(ind0, arrangement, in0,  2);
 9295 
 9296     __ ushr(ind1, arrangement, in1,  2);
 9297     __ shl(in0,   arrangement, in0,  6);
 9298     __ orr(ind1,  arrangement, ind1, in0);
 9299     __ ushr(ind1, arrangement, ind1, 2);
 9300 
 9301     __ ushr(ind2, arrangement, in2,  4);
 9302     __ shl(in1,   arrangement, in1,  4);
 9303     __ orr(ind2,  arrangement, in1,  ind2);
 9304     __ ushr(ind2, arrangement, ind2, 2);
 9305 
 9306     __ shl(ind3,  arrangement, in2,  2);
 9307     __ ushr(ind3, arrangement, ind3, 2);
 9308 
 9309     __ tbl(out0,  arrangement, codec,  4, ind0);
 9310     __ tbl(out1,  arrangement, codec,  4, ind1);
 9311     __ tbl(out2,  arrangement, codec,  4, ind2);
 9312     __ tbl(out3,  arrangement, codec,  4, ind3);
 9313 
 9314     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9315   }
 9316 
 9317   /**
 9318    *  Arguments:
 9319    *
 9320    *  Input:
 9321    *  c_rarg0   - src_start
 9322    *  c_rarg1   - src_offset
 9323    *  c_rarg2   - src_length
 9324    *  c_rarg3   - dest_start
 9325    *  c_rarg4   - dest_offset
 9326    *  c_rarg5   - isURL
 9327    *
 9328    */
 9329   address generate_base64_encodeBlock() {
 9330 
 9331     static const char toBase64[64] = {
 9332       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9333       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9334       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9335       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9336       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9337     };
 9338 
 9339     static const char toBase64URL[64] = {
 9340       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9341       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9342       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9343       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9344       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9345     };
 9346 
 9347     __ align(CodeEntryAlignment);
 9348     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
 9349     StubCodeMark mark(this, stub_id);
 9350     address start = __ pc();
 9351 
 9352     Register src   = c_rarg0;  // source array
 9353     Register soff  = c_rarg1;  // source start offset
 9354     Register send  = c_rarg2;  // source end offset
 9355     Register dst   = c_rarg3;  // dest array
 9356     Register doff  = c_rarg4;  // position for writing to dest array
 9357     Register isURL = c_rarg5;  // Base64 or URL character set
 9358 
 9359     // c_rarg6 and c_rarg7 are free to use as temps
 9360     Register codec  = c_rarg6;
 9361     Register length = c_rarg7;
 9362 
 9363     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9364 
 9365     __ add(src, src, soff);
 9366     __ add(dst, dst, doff);
 9367     __ sub(length, send, soff);
 9368 
 9369     // load the codec base address
 9370     __ lea(codec, ExternalAddress((address) toBase64));
 9371     __ cbz(isURL, ProcessData);
 9372     __ lea(codec, ExternalAddress((address) toBase64URL));
 9373 
 9374     __ BIND(ProcessData);
 9375 
 9376     // too short to form a SIMD loop, fall back to byte-by-byte processing
 9377     __ cmp(length, (u1)24);
 9378     __ br(Assembler::LT, Process3B);
 9379 
 9380     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9381 
 9382     __ BIND(Process48B);
 9383     __ cmp(length, (u1)48);
 9384     __ br(Assembler::LT, Process24B);
 9385     generate_base64_encode_simdround(src, dst, v0, 16);
 9386     __ sub(length, length, 48);
 9387     __ b(Process48B);
 9388 
 9389     __ BIND(Process24B);
 9390     __ cmp(length, (u1)24);
 9391     __ br(Assembler::LT, SIMDExit);
 9392     generate_base64_encode_simdround(src, dst, v0, 8);
 9393     __ sub(length, length, 24);
 9394 
 9395     __ BIND(SIMDExit);
 9396     __ cbz(length, Exit);
 9397 
 9398     __ BIND(Process3B);
 9399     //  3 src bytes, 24 bits
 9400     __ ldrb(r10, __ post(src, 1));
 9401     __ ldrb(r11, __ post(src, 1));
 9402     __ ldrb(r12, __ post(src, 1));
 9403     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9404     __ orrw(r12, r12, r11, Assembler::LSL, 8);
 9405     // codec index
 9406     __ ubfmw(r15, r12, 18, 23);
 9407     __ ubfmw(r14, r12, 12, 17);
 9408     __ ubfmw(r13, r12, 6,  11);
 9409     __ andw(r12,  r12, 63);
 9410     // get the code based on the codec
 9411     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9412     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9413     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9414     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9415     __ strb(r15, __ post(dst, 1));
 9416     __ strb(r14, __ post(dst, 1));
 9417     __ strb(r13, __ post(dst, 1));
 9418     __ strb(r12, __ post(dst, 1));
 9419     __ sub(length, length, 3);
 9420     __ cbnz(length, Process3B);
 9421 
 9422     __ BIND(Exit);
 9423     __ ret(lr);
 9424 
 9425     return start;
 9426   }
 9427 
 9428   void generate_base64_decode_simdround(Register src, Register dst,
 9429         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9430 
 9431     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9432     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9433 
 9434     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9435     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9436 
 9437     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9438 
 9439     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9440 
 9441     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9442 
 9443     // We need an unsigned saturating subtract so that every input value in
 9444     // the range [0, 63] yields 0 in the higher-half lookup.
 9445     __ uqsubv(decH0, __ T16B, in0, v27);
 9446     __ uqsubv(decH1, __ T16B, in1, v27);
 9447     __ uqsubv(decH2, __ T16B, in2, v27);
 9448     __ uqsubv(decH3, __ T16B, in3, v27);
 9449 
 9450     // lower half lookup
 9451     __ tbl(decL0, arrangement, codecL, 4, in0);
 9452     __ tbl(decL1, arrangement, codecL, 4, in1);
 9453     __ tbl(decL2, arrangement, codecL, 4, in2);
 9454     __ tbl(decL3, arrangement, codecL, 4, in3);
 9455 
 9456     // higher half lookup
 9457     __ tbx(decH0, arrangement, codecH, 4, decH0);
 9458     __ tbx(decH1, arrangement, codecH, 4, decH1);
 9459     __ tbx(decH2, arrangement, codecH, 4, decH2);
 9460     __ tbx(decH3, arrangement, codecH, 4, decH3);
 9461 
 9462     // combine lower and higher
 9463     __ orr(decL0, arrangement, decL0, decH0);
 9464     __ orr(decL1, arrangement, decL1, decH1);
 9465     __ orr(decL2, arrangement, decL2, decH2);
 9466     __ orr(decL3, arrangement, decL3, decH3);
 9467 
 9468     // Check for illegal input: any combined value larger than 63 (the 6-bit maximum).
 9469     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 9470     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 9471     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 9472     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 9473     __ orr(in0, arrangement, decH0, decH1);
 9474     __ orr(in1, arrangement, decH2, decH3);
 9475     __ orr(in2, arrangement, in0,   in1);
 9476     __ umaxv(in3, arrangement, in2);
 9477     __ umov(rscratch2, in3, __ B, 0);
 9478 
 9479     // get the data to output
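          // Per byte lane, the four 6-bit values d0..d3 are repacked into three bytes:
          //   out0 = d0 << 2 | d1 >> 4
          //   out1 = (d1 & 0xf) << 4 | d2 >> 2
          //   out2 = (d2 & 0x3) << 6 | d3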
 9480     __ shl(out0,  arrangement, decL0, 2);
 9481     __ ushr(out1, arrangement, decL1, 4);
 9482     __ orr(out0,  arrangement, out0,  out1);
 9483     __ shl(out1,  arrangement, decL1, 4);
 9484     __ ushr(out2, arrangement, decL2, 2);
 9485     __ orr(out1,  arrangement, out1,  out2);
 9486     __ shl(out2,  arrangement, decL2, 6);
 9487     __ orr(out2,  arrangement, out2,  decL3);
 9488 
 9489     __ cbz(rscratch2, NoIllegalData);
 9490 
 9491     // handle illegal input
 9492     __ umov(r10, in2, __ D, 0);
 9493     if (size == 16) {
 9494       __ cbnz(r10, ErrorInLowerHalf);
 9495 
 9496       // The illegal input is in the higher half; store the lower half now.
 9497       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 9498 
 9499       __ umov(r10, in2,  __ D, 1);
 9500       __ umov(r11, out0, __ D, 1);
 9501       __ umov(r12, out1, __ D, 1);
 9502       __ umov(r13, out2, __ D, 1);
 9503       __ b(StoreLegalData);
 9504 
 9505       __ BIND(ErrorInLowerHalf);
 9506     }
 9507     __ umov(r11, out0, __ D, 0);
 9508     __ umov(r12, out1, __ D, 0);
 9509     __ umov(r13, out2, __ D, 0);
 9510 
 9511     __ BIND(StoreLegalData);
 9512     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
 9513     __ strb(r11, __ post(dst, 1));
 9514     __ strb(r12, __ post(dst, 1));
 9515     __ strb(r13, __ post(dst, 1));
 9516     __ lsr(r10, r10, 8);
 9517     __ lsr(r11, r11, 8);
 9518     __ lsr(r12, r12, 8);
 9519     __ lsr(r13, r13, 8);
 9520     __ b(StoreLegalData);
 9521 
 9522     __ BIND(NoIllegalData);
 9523     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
 9524   }
 9525 
 9526 
 9527    /**
 9528    *  Arguments:
 9529    *
 9530    *  Input:
 9531    *  c_rarg0   - src_start
 9532    *  c_rarg1   - src_offset
 9533    *  c_rarg2   - src_length
 9534    *  c_rarg3   - dest_start
 9535    *  c_rarg4   - dest_offset
 9536    *  c_rarg5   - isURL
 9537    *  c_rarg6   - isMIME
 9538    *
 9539    */
 9540   address generate_base64_decodeBlock() {
 9541 
 9542     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
 9543     // at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the section
 9544     // titled "Base64 decoding".
 9545 
 9546     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
 9547     // except that the trailing character '=' is also treated as an illegal value in this intrinsic.
 9548     // That is, java.util.Base64.fromBase64['='] == -2, while fromBase(URL)64ForNoSIMD['='] == 255 here.
 9549     static const uint8_t fromBase64ForNoSIMD[256] = {
 9550       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9551       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9552       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 9553        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9554       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 9555        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
 9556       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 9557        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 9558       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9559       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9560       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9561       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9562       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9563       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9564       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9565       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9566     };
 9567 
 9568     static const uint8_t fromBase64URLForNoSIMD[256] = {
 9569       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9570       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9571       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 9572        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9573       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 9574        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
 9575       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 9576        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 9577       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9578       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9579       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9580       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9581       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9582       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9583       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9584       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9585     };
 9586 
 9587     // A legal Base64 code value is in the range [0, 127].  We need two table
 9588     // lookups, with tbl and tbx, and combine the results to get the decoded data.
 9589     // The first lookup uses tbl: out-of-range indices are set to 0 in the
 9590     // destination. The second lookup uses tbx: out-of-range indices leave the
 9591     // destination unchanged. Input [64..126] is mapped to index [65, 127] in the
 9592     // second lookup. The value at index 64 is set to 0, so that we know the first
 9593     // lookup has already produced the decoded data.
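          // In the tables below, entries [0, 63] are used as the tbl codec (loaded
          // into v0..v3) and entries [64, 127] as the tbx codec (loaded into v4..v7);
          // see SIMDEnter below.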
 9594     static const uint8_t fromBase64ForSIMD[128] = {
 9595       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9596       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9597       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 9598        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9599         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 9600        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 9601       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 9602        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 9603     };
 9604 
 9605     static const uint8_t fromBase64URLForSIMD[128] = {
 9606       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9607       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9608       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 9609        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9610         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 9611        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 9612        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 9613        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 9614     };
 9615 
 9616     __ align(CodeEntryAlignment);
 9617     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
 9618     StubCodeMark mark(this, stub_id);
 9619     address start = __ pc();
 9620 
 9621     Register src    = c_rarg0;  // source array
 9622     Register soff   = c_rarg1;  // source start offset
 9623     Register send   = c_rarg2;  // source end offset
 9624     Register dst    = c_rarg3;  // dest array
 9625     Register doff   = c_rarg4;  // position for writing to dest array
 9626     Register isURL  = c_rarg5;  // Base64 or URL character set
 9627     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
 9628 
 9629     Register length = send;    // reuse send as length of source data to process
 9630 
 9631     Register simd_codec   = c_rarg6;
 9632     Register nosimd_codec = c_rarg7;
 9633 
 9634     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
 9635 
 9636     __ enter();
 9637 
 9638     __ add(src, src, soff);
 9639     __ add(dst, dst, doff);
 9640 
 9641     __ mov(doff, dst);
 9642 
 9643     __ sub(length, send, soff);
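          // Clear the low two bits of length, rounding it down to a multiple of 4:
          // the scalar loop below consumes 4 input characters per pass.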
 9644     __ bfm(length, zr, 0, 1);
 9645 
 9646     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
 9647     __ cbz(isURL, ProcessData);
 9648     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
 9649 
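          // Strategy: inputs shorter than 144 bytes are decoded entirely by the
          // scalar loop below, 4 characters -> 3 bytes per pass. For longer inputs
          // the scalar loop first consumes 80 bytes, SIMD rounds of 64 and 32
          // characters then handle the bulk, and any remainder returns to the
          // scalar loop.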
 9650     __ BIND(ProcessData);
 9651     __ mov(rscratch1, length);
 9652     __ cmp(length, (u1)144); // 144 = 80 + 64
 9653     __ br(Assembler::LT, Process4B);
 9654 
 9655     // In the MIME case, the line length cannot be more than 76
 9656     // bytes (see RFC 2045). This is too short a block for SIMD
 9657     // to be worthwhile, so we use non-SIMD here.
 9658     __ movw(rscratch1, 79); // pre-process only the first 80 bytes (20 passes of 4) before the SIMD loop
 9659 
 9660     __ BIND(Process4B);
 9661     __ ldrw(r14, __ post(src, 4));
 9662     __ ubfxw(r10, r14, 0,  8);
 9663     __ ubfxw(r11, r14, 8,  8);
 9664     __ ubfxw(r12, r14, 16, 8);
 9665     __ ubfxw(r13, r14, 24, 8);
 9666     // look up the decoded 6-bit values
 9667     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
 9668     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
 9669     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
 9670     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
 9671     // error detection, 255u indicates an illegal input
 9672     __ orrw(r14, r10, r11);
 9673     __ orrw(r15, r12, r13);
 9674     __ orrw(r14, r14, r15);
 9675     __ tbnz(r14, 7, Exit);
 9676     // recover the data
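          // r10..r13 hold the four decoded 6-bit values d0..d3, repacked into three
          // output bytes:
          //   b0 = d0 << 2 | d1 >> 4
          //   b1 = (d1 & 0xf) << 4 | d2 >> 2
          //   b2 = (d2 & 0x3) << 6 | d3
          // b0 and b1 are assembled in r14 (byte-swapped for the little-endian strh
          // below); b2 is assembled in r13.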
 9677     __ lslw(r14, r10, 10);
 9678     __ bfiw(r14, r11, 4, 6);
 9679     __ bfmw(r14, r12, 2, 5);
 9680     __ rev16w(r14, r14);
 9681     __ bfiw(r13, r12, 6, 2);
 9682     __ strh(r14, __ post(dst, 2));
 9683     __ strb(r13, __ post(dst, 1));
 9684     // non-simd loop
 9685     __ subsw(rscratch1, rscratch1, 4);
 9686     __ br(Assembler::GT, Process4B);
 9687 
 9688     // If we came here from the 80-byte pre-processing path above, rscratch1 == -1;
 9689     // otherwise the whole input has been decoded and rscratch1 == 0.
 9690     __ cbzw(rscratch1, Exit);
 9691     __ sub(length, length, 80);
 9692 
 9693     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
 9694     __ cbz(isURL, SIMDEnter);
 9695     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
 9696 
 9697     __ BIND(SIMDEnter);
 9698     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
 9699     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
 9700     __ mov(rscratch1, 63);
 9701     __ dup(v27, __ T16B, rscratch1);
 9702 
 9703     __ BIND(Process64B);
 9704     __ cmp(length, (u1)64);
 9705     __ br(Assembler::LT, Process32B);
 9706     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
 9707     __ sub(length, length, 64);
 9708     __ b(Process64B);
 9709 
 9710     __ BIND(Process32B);
 9711     __ cmp(length, (u1)32);
 9712     __ br(Assembler::LT, SIMDExit);
 9713     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
 9714     __ sub(length, length, 32);
 9715     __ b(Process32B);
 9716 
 9717     __ BIND(SIMDExit);
 9718     __ cbz(length, Exit);
 9719     __ movw(rscratch1, length);
 9720     __ b(Process4B);
 9721 
 9722     __ BIND(Exit);
 9723     __ sub(c_rarg0, dst, doff);
 9724 
 9725     __ leave();
 9726     __ ret(lr);
 9727 
 9728     return start;
 9729   }
 9730 
 9731   // Support for spin waits.
 9732   address generate_spin_wait() {
 9733     __ align(CodeEntryAlignment);
 9734     StubGenStubId stub_id = StubGenStubId::spin_wait_id;
 9735     StubCodeMark mark(this, stub_id);
 9736     address start = __ pc();
 9737 
 9738     __ spin_wait();
 9739     __ ret(lr);
 9740 
 9741     return start;
 9742   }
 9743 
 9744   void generate_lookup_secondary_supers_table_stub() {
 9745     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
 9746     StubCodeMark mark(this, stub_id);
 9747 
 9748     const Register
 9749       r_super_klass  = r0,
 9750       r_array_base   = r1,
 9751       r_array_length = r2,
 9752       r_array_index  = r3,
 9753       r_sub_klass    = r4,
 9754       r_bitmap       = rscratch2,
 9755       result         = r5;
 9756     const FloatRegister
 9757       vtemp          = v0;
 9758 
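          // One stub is generated per table slot; each entry point is recorded in
          // StubRoutines::_lookup_secondary_supers_table_stubs[slot].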
 9759     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
 9760       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
 9761       Label L_success;
 9762       __ enter();
 9763       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
 9764                                              r_array_base, r_array_length, r_array_index,
 9765                                              vtemp, result, slot,
 9766                                              /*stub_is_near*/true);
 9767       __ leave();
 9768       __ ret(lr);
 9769     }
 9770   }
 9771 
 9772   // Slow path implementation for UseSecondarySupersTable.
 9773   address generate_lookup_secondary_supers_table_slow_path_stub() {
 9774     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
 9775     StubCodeMark mark(this, stub_id);
 9776 
 9777     address start = __ pc();
 9778     const Register
 9779       r_super_klass  = r0,        // argument
 9780       r_array_base   = r1,        // argument
 9781       temp1          = r2,        // temp
 9782       r_array_index  = r3,        // argument
 9783       r_bitmap       = rscratch2, // argument
 9784       result         = r5;        // argument
 9785 
 9786     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
 9787     __ ret(lr);
 9788 
 9789     return start;
 9790   }
 9791 
 9792 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 9793 
 9794   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
 9795   //
 9796   // If LSE is in use, generate LSE versions of all the stubs. The
 9797   // non-LSE versions are in atomic_aarch64.S.
 9798 
 9799   // class AtomicStubMark records the entry point of a stub and the
 9800   // stub pointer which will point to it. The stub pointer is set to
 9801   // the entry point when ~AtomicStubMark() is called, which must be
 9802   // after ICache::invalidate_range. This ensures safe publication of
 9803   // the generated code.
 9804   class AtomicStubMark {
 9805     address _entry_point;
 9806     aarch64_atomic_stub_t *_stub;
 9807     MacroAssembler *_masm;
 9808   public:
 9809     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
 9810       _masm = masm;
 9811       __ align(32);
 9812       _entry_point = __ pc();
 9813       _stub = stub;
 9814     }
 9815     ~AtomicStubMark() {
 9816       *_stub = (aarch64_atomic_stub_t)_entry_point;
 9817     }
 9818   };
 9819 
 9820   // NB: For memory_order_conservative we need a trailing membar after
 9821   // LSE atomic operations but not a leading membar.
 9822   //
 9823   // We don't need a leading membar because a clause in the Arm ARM
 9824   // says:
 9825   //
 9826   //   Barrier-ordered-before
 9827   //
 9828   //   Barrier instructions order prior Memory effects before subsequent
 9829   //   Memory effects generated by the same Observer. A read or a write
 9830   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
 9831   //   Observer if and only if RW1 appears in program order before RW2
 9832   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
 9833   //   instruction with both Acquire and Release semantics.
 9834   //
 9835   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
 9836   // and Release semantics, therefore we don't need a leading
 9837   // barrier. However, there is no corresponding Barrier-ordered-after
 9838   // relationship, therefore we need a trailing membar to prevent a
 9839   // later store or load from being reordered with the store in an
 9840   // atomic instruction.
 9841   //
 9842   // This was checked by using the herd7 consistency model simulator
 9843   // (http://diy.inria.fr/) with this test case:
 9844   //
 9845   // AArch64 LseCas
 9846   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
 9847   // P0 | P1;
 9848   // LDR W4, [X2] | MOV W3, #0;
 9849   // DMB LD       | MOV W4, #1;
 9850   // LDR W3, [X1] | CASAL W3, W4, [X1];
 9851   //              | DMB ISH;
 9852   //              | STR W4, [X2];
 9853   // exists
 9854   // (0:X3=0 /\ 0:X4=1)
 9855   //
 9856   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
 9857   // with the store to x in P1. Without the DMB in P1 this may happen.
 9858   //
 9859   // At the time of writing we don't know of any AArch64 hardware that
 9860   // reorders stores in this way, but the Reference Manual permits it.
 9861 
 9862   void gen_cas_entry(Assembler::operand_size size,
 9863                      atomic_memory_order order) {
 9864     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
 9865       exchange_val = c_rarg2;
 9866     bool acquire, release;
 9867     switch (order) {
 9868       case memory_order_relaxed:
 9869         acquire = false;
 9870         release = false;
 9871         break;
 9872       case memory_order_release:
 9873         acquire = false;
 9874         release = true;
 9875         break;
 9876       default:
 9877         acquire = true;
 9878         release = true;
 9879         break;
 9880     }
 9881     __ mov(prev, compare_val);
 9882     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
 9883     if (order == memory_order_conservative) {
 9884       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 9885     }
 9886     if (size == Assembler::xword) {
 9887       __ mov(r0, prev);
 9888     } else {
 9889       __ movw(r0, prev);
 9890     }
 9891     __ ret(lr);
 9892   }
 9893 
 9894   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
 9895     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 9896     // If not relaxed, then default to conservative.  Relaxed is the only
 9897     // case we use enough to be worth specializing.
 9898     if (order == memory_order_relaxed) {
 9899       __ ldadd(size, incr, prev, addr);
 9900     } else {
 9901       __ ldaddal(size, incr, prev, addr);
 9902       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 9903     }
 9904     if (size == Assembler::xword) {
 9905       __ mov(r0, prev);
 9906     } else {
 9907       __ movw(r0, prev);
 9908     }
 9909     __ ret(lr);
 9910   }
 9911 
 9912   void gen_swpal_entry(Assembler::operand_size size) {
 9913     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 9914     __ swpal(size, incr, prev, addr);
 9915     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 9916     if (size == Assembler::xword) {
 9917       __ mov(r0, prev);
 9918     } else {
 9919       __ movw(r0, prev);
 9920     }
 9921     __ ret(lr);
 9922   }
 9923 
 9924   void generate_atomic_entry_points() {
 9925     if (! UseLSE) {
 9926       return;
 9927     }
 9928     __ align(CodeEntryAlignment);
 9929     StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id;
 9930     StubCodeMark mark(this, stub_id);
 9931     address first_entry = __ pc();
 9932 
 9933     // ADD, memory_order_conservative
 9934     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
 9935     gen_ldadd_entry(Assembler::word, memory_order_conservative);
 9936     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
 9937     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
 9938 
 9939     // ADD, memory_order_relaxed
 9940     AtomicStubMark mark_fetch_add_4_relaxed
 9941       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
 9942     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
 9943     AtomicStubMark mark_fetch_add_8_relaxed
 9944       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
 9945     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
 9946 
 9947     // XCHG, memory_order_conservative
 9948     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
 9949     gen_swpal_entry(Assembler::word);
 9950     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
 9951     gen_swpal_entry(Assembler::xword);
 9952 
 9953     // CAS, memory_order_conservative
 9954     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
 9955     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
 9956     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
 9957     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
 9958     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
 9959     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
 9960 
 9961     // CAS, memory_order_relaxed
 9962     AtomicStubMark mark_cmpxchg_1_relaxed
 9963       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
 9964     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
 9965     AtomicStubMark mark_cmpxchg_4_relaxed
 9966       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
 9967     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
 9968     AtomicStubMark mark_cmpxchg_8_relaxed
 9969       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
 9970     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
 9971 
 9972     AtomicStubMark mark_cmpxchg_4_release
 9973       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
 9974     gen_cas_entry(MacroAssembler::word, memory_order_release);
 9975     AtomicStubMark mark_cmpxchg_8_release
 9976       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
 9977     gen_cas_entry(MacroAssembler::xword, memory_order_release);
 9978 
 9979     AtomicStubMark mark_cmpxchg_4_seq_cst
 9980       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
 9981     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
 9982     AtomicStubMark mark_cmpxchg_8_seq_cst
 9983       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
 9984     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
 9985 
 9986     ICache::invalidate_range(first_entry, __ pc() - first_entry);
 9987   }
 9988 #endif // LINUX
 9989 
 9990   address generate_cont_thaw(Continuation::thaw_kind kind) {
 9991     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
 9992     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
 9993 
 9994     address start = __ pc();
 9995 
 9996     if (return_barrier) {
 9997       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
 9998       __ mov(sp, rscratch1);
 9999     }
10000     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10001 
10002     if (return_barrier) {
10003       // preserve possible return value from a method returning to the return barrier
10004       __ fmovd(rscratch1, v0);
10005       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10006     }
10007 
10008     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10009     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10010     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10011 
10012     if (return_barrier) {
10013       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10014       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10015       __ fmovd(v0, rscratch1);
10016     }
10017     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10018 
10019 
10020     Label thaw_success;
10021     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10022     __ cbnz(rscratch2, thaw_success);
10023     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10024     __ br(rscratch1);
10025     __ bind(thaw_success);
10026 
10027     // make room for the thawed frames
10028     __ sub(rscratch1, sp, rscratch2);
10029     __ andr(rscratch1, rscratch1, -16); // align
10030     __ mov(sp, rscratch1);
10031 
10032     if (return_barrier) {
10033       // save original return value -- again
10034       __ fmovd(rscratch1, v0);
10035       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10036     }
10037 
10038     // If we want, we can templatize thaw by kind, and have three different entries
10039     __ movw(c_rarg1, (uint32_t)kind);
10040 
10041     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10042     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10043 
10044     if (return_barrier) {
10045       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10046       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10047       __ fmovd(v0, rscratch1);
10048     } else {
10049       __ mov(r0, zr); // return 0 (success) from doYield
10050     }
10051 
10052     // we're now on the yield frame (which is at a higher address because sp has been pushed down)
10053     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10054     __ mov(rfp, sp);
10055 
10056     if (return_barrier_exception) {
10057       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10058       __ authenticate_return_address(c_rarg1);
10059       __ verify_oop(r0);
10060       // save return value containing the exception oop in callee-saved R19
10061       __ mov(r19, r0);
10062 
10063       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10064 
10065       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10066       // __ reinitialize_ptrue();
10067 
10068       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10069 
10070       __ mov(r1, r0); // the exception handler
10071       __ mov(r0, r19); // restore return value containing the exception oop
10072       __ verify_oop(r0);
10073 
10074       __ leave();
10075       __ mov(r3, lr);
10076       __ br(r1); // the exception handler
10077     } else {
10078       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10079       __ leave();
10080       __ ret(lr);
10081     }
10082 
10083     return start;
10084   }
10085 
10086   address generate_cont_thaw() {
10087     if (!Continuations::enabled()) return nullptr;
10088 
10089     StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
10090     StubCodeMark mark(this, stub_id);
10091     address start = __ pc();
10092     generate_cont_thaw(Continuation::thaw_top);
10093     return start;
10094   }
10095 
10096   address generate_cont_returnBarrier() {
10097     if (!Continuations::enabled()) return nullptr;
10098 
10099     // TODO: will probably need multiple return barriers depending on return type
10100     StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
10101     StubCodeMark mark(this, stub_id);
10102     address start = __ pc();
10103 
10104     generate_cont_thaw(Continuation::thaw_return_barrier);
10105 
10106     return start;
10107   }
10108 
10109   address generate_cont_returnBarrier_exception() {
10110     if (!Continuations::enabled()) return nullptr;
10111 
10112     StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
10113     StubCodeMark mark(this, stub_id);
10114     address start = __ pc();
10115 
10116     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10117 
10118     return start;
10119   }
10120 
10121   address generate_cont_preempt_stub() {
10122     if (!Continuations::enabled()) return nullptr;
10123     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
10124     StubCodeMark mark(this, stub_id);
10125     address start = __ pc();
10126 
10127     __ reset_last_Java_frame(true);
10128 
10129     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10130     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10131     __ mov(sp, rscratch2);
10132 
10133     Label preemption_cancelled;
10134     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10135     __ cbnz(rscratch1, preemption_cancelled);
10136 
10137     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10138     SharedRuntime::continuation_enter_cleanup(_masm);
10139     __ leave();
10140     __ ret(lr);
10141 
10142     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10143     __ bind(preemption_cancelled);
10144     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10145     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10146     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10147     __ ldr(rscratch1, Address(rscratch1));
10148     __ br(rscratch1);
10149 
10150     return start;
10151   }
10152 
10153   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10154   // are represented as long[5], with BITS_PER_LIMB = 26.
10155   // Pack five 26-bit limbs into three 64-bit registers.
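        // Resulting layout, with limb i denoting src[i] (26 significant bits each):
        //   dest0 = limb0       | limb1 << 26 | limb2 << 52   (low 12 bits of limb2)
        //   dest1 = limb2 >> 12 | limb3 << 14 | limb4 << 40   (low 24 bits of limb4)
        //   dest2 = limb4 >> 24                               (remaining high bits)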
10156   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10157     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10158     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10159     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10160     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10161 
10162     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10163     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10164     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10165     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10166 
10167     if (dest2->is_valid()) {
10168       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10169     } else {
10170 #ifdef ASSERT
10171       Label OK;
10172       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10173       __ br(__ EQ, OK);
10174       __ stop("high bits of Poly1305 integer should be zero");
10175       __ should_not_reach_here();
10176       __ bind(OK);
10177 #endif
10178     }
10179   }
10180 
10181   // As above, but return only a 128-bit integer, packed into two
10182   // 64-bit registers.
10183   void pack_26(Register dest0, Register dest1, Register src) {
10184     pack_26(dest0, dest1, noreg, src);
10185   }
10186 
10187   // Multiply and multiply-accumulate unsigned 64-bit registers.
10188   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10189     __ mul(prod_lo, n, m);
10190     __ umulh(prod_hi, n, m);
10191   }
10192   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10193     wide_mul(rscratch1, rscratch2, n, m);
10194     __ adds(sum_lo, sum_lo, rscratch1);
10195     __ adc(sum_hi, sum_hi, rscratch2);
10196   }
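        // Note that wide_madd clobbers rscratch1 and rscratch2.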
10197 
10198   // Poly1305, RFC 7539
10199 
10200   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10201   // description of the tricks used to simplify and accelerate this
10202   // computation.
10203 
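        // For each 16-byte block the accumulator U is updated as
        //   U = ((U + block + 2^128) * R) mod (2^130 - 5)
        // where the 2^128 term is the "1" bit Poly1305 appends to every full block.
        // The main loop below keeps U only partially reduced; the final reduction
        // and repacking into 26-bit limbs happen after the loop.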
10204   address generate_poly1305_processBlocks() {
10205     __ align(CodeEntryAlignment);
10206     StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
10207     StubCodeMark mark(this, stub_id);
10208     address start = __ pc();
10209     Label here;
10210     __ enter();
10211     RegSet callee_saved = RegSet::range(r19, r28);
10212     __ push(callee_saved, sp);
10213 
10214     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10215 
10216     // Arguments
10217     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10218 
10219     // R_n is the 128-bit randomly-generated key, packed into two
10220     // registers.  The caller passes this key to us as long[5], with
10221     // BITS_PER_LIMB = 26.
10222     const Register R_0 = *++regs, R_1 = *++regs;
10223     pack_26(R_0, R_1, r_start);
10224 
10225     // RR_n is (R_n >> 2) * 5
10226     const Register RR_0 = *++regs, RR_1 = *++regs;
10227     __ lsr(RR_0, R_0, 2);
10228     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10229     __ lsr(RR_1, R_1, 2);
10230     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10231 
10232     // U_n is the current checksum
10233     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10234     pack_26(U_0, U_1, U_2, acc_start);
10235 
10236     static constexpr int BLOCK_LENGTH = 16;
10237     Label DONE, LOOP;
10238 
10239     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10240     __ br(Assembler::LT, DONE); {
10241       __ bind(LOOP);
10242 
10243       // S_n is to be the sum of U_n and the next block of data
10244       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10245       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10246       __ adds(S_0, U_0, S_0);
10247       __ adcs(S_1, U_1, S_1);
10248       __ adc(S_2, U_2, zr);
10249       __ add(S_2, S_2, 1); // add the 2^128 bit that Poly1305 appends to every full block
10250 
10251       const Register U_0HI = *++regs, U_1HI = *++regs;
10252 
10253       // NB: this logic depends on some of the special properties of
10254       // Poly1305 keys. In particular, because we know that the top
10255       // four bits of R_0 and R_1 are zero, we can add together
10256       // partial products without any risk of needing to propagate a
10257       // carry out.
10258       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10259       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10260       __ andr(U_2, R_0, 3);
10261       __ mul(U_2, S_2, U_2);
10262 
10263       // Recycle registers S_0, S_1, S_2
10264       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10265 
10266       // Partial reduction mod 2**130 - 5
10267       __ adds(U_1, U_0HI, U_1);
10268       __ adc(U_2, U_1HI, U_2);
10269       // Sum now in U_2:U_1:U_0.
10270       // Dead: U_0HI, U_1HI.
10271       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10272 
10273       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10274 
10275       // First, U_2:U_1:U_0 += (U_2 >> 2)
10276       __ lsr(rscratch1, U_2, 2);
10277       __ andr(U_2, U_2, (u8)3);
10278       __ adds(U_0, U_0, rscratch1);
10279       __ adcs(U_1, U_1, zr);
10280       __ adc(U_2, U_2, zr);
10281       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10282       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10283       __ adcs(U_1, U_1, zr);
10284       __ adc(U_2, U_2, zr);
10285 
10286       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10287       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10288       __ br(~ Assembler::LT, LOOP);
10289     }
10290 
10291     // Further reduce modulo 2^130 - 5
10292     __ lsr(rscratch1, U_2, 2);
10293     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10294     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10295     __ adcs(U_1, U_1, zr);
10296     __ andr(U_2, U_2, (u1)3);
10297     __ adc(U_2, U_2, zr);
10298 
10299     // Unpack the sum into five 26-bit limbs and write to memory.
10300     __ ubfiz(rscratch1, U_0, 0, 26);
10301     __ ubfx(rscratch2, U_0, 26, 26);
10302     __ stp(rscratch1, rscratch2, Address(acc_start));
10303     __ ubfx(rscratch1, U_0, 52, 12);
10304     __ bfi(rscratch1, U_1, 12, 14);
10305     __ ubfx(rscratch2, U_1, 14, 26);
10306     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10307     __ ubfx(rscratch1, U_1, 40, 24);
10308     __ bfi(rscratch1, U_2, 24, 3);
10309     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10310 
10311     __ bind(DONE);
10312     __ pop(callee_saved, sp);
10313     __ leave();
10314     __ ret(lr);
10315 
10316     return start;
10317   }
10318 
10319   // exception handler for upcall stubs
10320   address generate_upcall_stub_exception_handler() {
10321     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
10322     StubCodeMark mark(this, stub_id);
10323     address start = __ pc();
10324 
10325     // Native caller has no idea how to handle exceptions,
10326     // so we just crash here. It is up to the callee to catch exceptions.
10327     __ verify_oop(r0);
10328     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10329     __ blr(rscratch1);
10330     __ should_not_reach_here();
10331 
10332     return start;
10333   }
10334 
10335   // load Method* target of MethodHandle
10336   // j_rarg0 = jobject receiver
10337   // rmethod = result
10338   address generate_upcall_stub_load_target() {
10339     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
10340     StubCodeMark mark(this, stub_id);
10341     address start = __ pc();
10342 
10343     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10344     // Load target method from receiver
10345     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10346     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10347     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10348     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10349                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10350                       noreg, noreg);
10351     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10352 
10353     __ ret(lr);
10354 
10355     return start;
10356   }
10357 
10358 #undef __
10359 #define __ masm->
10360 
10361   class MontgomeryMultiplyGenerator : public MacroAssembler {
10362 
10363     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10364       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10365 
10366     RegSet _toSave;
10367     bool _squaring;
10368 
10369   public:
10370     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10371       : MacroAssembler(as->code()), _squaring(squaring) {
10372 
10373       // Register allocation
10374 
10375       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10376       Pa_base = *regs;       // Argument registers
10377       if (squaring)
10378         Pb_base = Pa_base;
10379       else
10380         Pb_base = *++regs;
10381       Pn_base = *++regs;
10382       Rlen= *++regs;
10383       inv = *++regs;
10384       Pm_base = *++regs;
10385 
10386                           // Working registers:
10387       Ra =  *++regs;        // The current digit of a, b, n, and m.
10388       Rb =  *++regs;
10389       Rm =  *++regs;
10390       Rn =  *++regs;
10391 
10392       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10393       Pb =  *++regs;
10394       Pm =  *++regs;
10395       Pn =  *++regs;
10396 
10397       t0 =  *++regs;        // Three registers which form a
10398       t1 =  *++regs;        // triple-precision accumulator.
10399       t2 =  *++regs;
10400 
10401       Ri =  *++regs;        // Inner and outer loop indexes.
10402       Rj =  *++regs;
10403 
10404       Rhi_ab = *++regs;     // Product registers: low and high parts
10405       Rlo_ab = *++regs;     // of a*b and m*n.
10406       Rhi_mn = *++regs;
10407       Rlo_mn = *++regs;
10408 
10409       // r19 and up are callee-saved.
10410       _toSave = RegSet::range(r19, *regs) + Pm_base;
10411     }
10412 
10413   private:
10414     void save_regs() {
10415       push(_toSave, sp);
10416     }
10417 
10418     void restore_regs() {
10419       pop(_toSave, sp);
10420     }
10421 
10422     template <typename T>
10423     void unroll_2(Register count, T block) {
10424       Label loop, end, odd;
10425       tbnz(count, 0, odd);
10426       cbz(count, end);
10427       align(16);
10428       bind(loop);
10429       (this->*block)();
10430       bind(odd);
10431       (this->*block)();
10432       subs(count, count, 2);
10433       br(Assembler::GT, loop);
10434       bind(end);
10435     }
10436 
10437     template <typename T>
10438     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10439       Label loop, end, odd;
10440       tbnz(count, 0, odd);
10441       cbz(count, end);
10442       align(16);
10443       bind(loop);
10444       (this->*block)(d, s, tmp);
10445       bind(odd);
10446       (this->*block)(d, s, tmp);
10447       subs(count, count, 2);
10448       br(Assembler::GT, loop);
10449       bind(end);
10450     }
10451 
10452     void pre1(RegisterOrConstant i) {
10453       block_comment("pre1");
10454       // Pa = Pa_base;
10455       // Pb = Pb_base + i;
10456       // Pm = Pm_base;
10457       // Pn = Pn_base + i;
10458       // Ra = *Pa;
10459       // Rb = *Pb;
10460       // Rm = *Pm;
10461       // Rn = *Pn;
10462       ldr(Ra, Address(Pa_base));
10463       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10464       ldr(Rm, Address(Pm_base));
10465       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10466       lea(Pa, Address(Pa_base));
10467       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10468       lea(Pm, Address(Pm_base));
10469       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10470 
10471       // Zero the m*n result.
10472       mov(Rhi_mn, zr);
10473       mov(Rlo_mn, zr);
10474     }
10475 
10476     // The core multiply-accumulate step of a Montgomery
10477     // multiplication.  The idea is to schedule operations as a
10478     // pipeline so that instructions with long latencies (loads and
10479     // multiplies) have time to complete before their results are
10480     // used.  This benefits in-order implementations of the architecture
10481     // the most, but out-of-order ones benefit as well.
10482     void step() {
10483       block_comment("step");
10484       // MACC(Ra, Rb, t0, t1, t2);
10485       // Ra = *++Pa;
10486       // Rb = *--Pb;
10487       umulh(Rhi_ab, Ra, Rb);
10488       mul(Rlo_ab, Ra, Rb);
10489       ldr(Ra, pre(Pa, wordSize));
10490       ldr(Rb, pre(Pb, -wordSize));
10491       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
10492                                        // previous iteration.
10493       // MACC(Rm, Rn, t0, t1, t2);
10494       // Rm = *++Pm;
10495       // Rn = *--Pn;
10496       umulh(Rhi_mn, Rm, Rn);
10497       mul(Rlo_mn, Rm, Rn);
10498       ldr(Rm, pre(Pm, wordSize));
10499       ldr(Rn, pre(Pn, -wordSize));
10500       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10501     }
10502 
10503     void post1() {
10504       block_comment("post1");
10505 
10506       // MACC(Ra, Rb, t0, t1, t2);
10507       // Ra = *++Pa;
10508       // Rb = *--Pb;
10509       umulh(Rhi_ab, Ra, Rb);
10510       mul(Rlo_ab, Ra, Rb);
10511       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10512       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10513 
10514       // *Pm = Rm = t0 * inv;
10515       mul(Rm, t0, inv);
10516       str(Rm, Address(Pm));
10517 
10518       // MACC(Rm, Rn, t0, t1, t2);
10519       // t0 = t1; t1 = t2; t2 = 0;
10520       umulh(Rhi_mn, Rm, Rn);
10521 
10522 #ifndef PRODUCT
10523       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
10524       {
10525         mul(Rlo_mn, Rm, Rn);
10526         add(Rlo_mn, t0, Rlo_mn);
10527         Label ok;
10528         cbz(Rlo_mn, ok); {
10529           stop("broken Montgomery multiply");
10530         } bind(ok);
10531       }
10532 #endif
10533       // We have very carefully set things up so that
10534       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
10535       // the lower half of Rm * Rn because we know the result already:
10536       // it must be -t0.  t0 + (-t0) must generate a carry iff
10537       // t0 != 0.  So, rather than do a mul and an adds we just set
10538       // the carry flag iff t0 is nonzero.
10539       //
10540       // mul(Rlo_mn, Rm, Rn);
10541       // adds(zr, t0, Rlo_mn);
10542       subs(zr, t0, 1); // Set carry iff t0 is nonzero
10543       adcs(t0, t1, Rhi_mn);
10544       adc(t1, t2, zr);
10545       mov(t2, zr);
10546     }
10547 
10548     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
10549       block_comment("pre2");
10550       // Pa = Pa_base + i-len;
10551       // Pb = Pb_base + len;
10552       // Pm = Pm_base + i-len;
10553       // Pn = Pn_base + len;
10554 
10555       if (i.is_register()) {
10556         sub(Rj, i.as_register(), len);
10557       } else {
10558         mov(Rj, i.as_constant());
10559         sub(Rj, Rj, len);
10560       }
10561       // Rj == i-len
10562 
10563       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
10564       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
10565       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
10566       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
10567 
10568       // Ra = *++Pa;
10569       // Rb = *--Pb;
10570       // Rm = *++Pm;
10571       // Rn = *--Pn;
10572       ldr(Ra, pre(Pa, wordSize));
10573       ldr(Rb, pre(Pb, -wordSize));
10574       ldr(Rm, pre(Pm, wordSize));
10575       ldr(Rn, pre(Pn, -wordSize));
10576 
10577       mov(Rhi_mn, zr);
10578       mov(Rlo_mn, zr);
10579     }
10580 
10581     void post2(RegisterOrConstant i, RegisterOrConstant len) {
10582       block_comment("post2");
10583       if (i.is_constant()) {
10584         mov(Rj, i.as_constant()-len.as_constant());
10585       } else {
10586         sub(Rj, i.as_register(), len);
10587       }
10588 
10589       adds(t0, t0, Rlo_mn); // The pending m*n, low part
10590 
10591       // As soon as we know the least significant digit of our result,
10592       // store it.
10593       // Pm_base[i-len] = t0;
10594       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
10595 
10596       // t0 = t1; t1 = t2; t2 = 0;
10597       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
10598       adc(t1, t2, zr);
10599       mov(t2, zr);
10600     }
10601 
10602     // A carry in t0 after Montgomery multiplication means that we
10603     // should subtract multiples of n from our result in m.  We'll
10604     // keep doing that until there is no carry.
10605     void normalize(RegisterOrConstant len) {
10606       block_comment("normalize");
10607       // while (t0)
10608       //   t0 = sub(Pm_base, Pn_base, t0, len);
10609       Label loop, post, again;
10610       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
10611       cbz(t0, post); {
10612         bind(again); {
10613           mov(i, zr);
10614           mov(cnt, len);
10615           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
10616           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10617           subs(zr, zr, zr); // set carry flag, i.e. no borrow
10618           align(16);
10619           bind(loop); {
10620             sbcs(Rm, Rm, Rn);
10621             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
10622             add(i, i, 1);
10623             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
10624             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10625             sub(cnt, cnt, 1);
10626           } cbnz(cnt, loop);
10627           sbc(t0, t0, zr);
10628         } cbnz(t0, again);
10629       } bind(post);
10630     }
10631 
10632     // Move memory at s to d, reversing words.
10633     //    Increments d to end of copied memory
10634     //    Destroys tmp1, tmp2
10635     //    Preserves len
10636     //    Leaves s pointing to the address which was in d at start
10637     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
10638       assert(tmp1->encoding() < r19->encoding(), "register corruption");
10639       assert(tmp2->encoding() < r19->encoding(), "register corruption");
10640 
10641       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
10642       mov(tmp1, len);
10643       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
10644       sub(s, d, len, ext::uxtw, LogBytesPerWord);
10645     }
10646     // Helper for reverse(): load one word, swap its two 32-bit halves, store it.
10647     void reverse1(Register d, Register s, Register tmp) {
10648       ldr(tmp, pre(s, -wordSize));
10649       ror(tmp, tmp, 32);
10650       str(tmp, post(d, wordSize));
10651     }
10652 
10653     void step_squaring() {
10654       // An extra ACC
10655       step();
10656       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10657     }
10658 
10659     void last_squaring(RegisterOrConstant i) {
10660       Label dont;
10661       // if ((i & 1) == 0) {
10662       tbnz(i.as_register(), 0, dont); {
10663         // MACC(Ra, Rb, t0, t1, t2);
10664         // Ra = *++Pa;
10665         // Rb = *--Pb;
10666         umulh(Rhi_ab, Ra, Rb);
10667         mul(Rlo_ab, Ra, Rb);
10668         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10669       } bind(dont);
10670     }
10671 
10672     void extra_step_squaring() {
10673       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10674 
10675       // MACC(Rm, Rn, t0, t1, t2);
10676       // Rm = *++Pm;
10677       // Rn = *--Pn;
10678       umulh(Rhi_mn, Rm, Rn);
10679       mul(Rlo_mn, Rm, Rn);
10680       ldr(Rm, pre(Pm, wordSize));
10681       ldr(Rn, pre(Pn, -wordSize));
10682     }
10683 
10684     void post1_squaring() {
10685       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10686 
10687       // *Pm = Rm = t0 * inv;
10688       mul(Rm, t0, inv);
10689       str(Rm, Address(Pm));
10690 
10691       // MACC(Rm, Rn, t0, t1, t2);
10692       // t0 = t1; t1 = t2; t2 = 0;
10693       umulh(Rhi_mn, Rm, Rn);
10694 
10695 #ifndef PRODUCT
10696       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
10697       {
10698         mul(Rlo_mn, Rm, Rn);
10699         add(Rlo_mn, t0, Rlo_mn);
10700         Label ok;
10701         cbz(Rlo_mn, ok); {
10702           stop("broken Montgomery multiply");
10703         } bind(ok);
10704       }
10705 #endif
10706       // We have very carefully set things up so that
10707       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
10708       // the lower half of Rm * Rn because we know the result already:
10709       // it must be -t0.  t0 + (-t0) must generate a carry iff
10710       // t0 != 0.  So, rather than do a mul and an adds we just set
10711       // the carry flag iff t0 is nonzero.
10712       //
10713       // mul(Rlo_mn, Rm, Rn);
10714       // adds(zr, t0, Rlo_mn);
10715       subs(zr, t0, 1); // Set carry iff t0 is nonzero
10716       adcs(t0, t1, Rhi_mn);
10717       adc(t1, t2, zr);
10718       mov(t2, zr);
10719     }
10720 
10721     void acc(Register Rhi, Register Rlo,
10722              Register t0, Register t1, Register t2) {
10723       adds(t0, t0, Rlo);
10724       adcs(t1, t1, Rhi);
10725       adc(t2, t2, zr);
10726     }
10727 
10728   public:
10729     /**
10730      * Fast Montgomery multiplication.  The derivation of the
10731      * algorithm is in A Cryptographic Library for the Motorola
10732      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
10733      *
10734      * Arguments:
10735      *
10736      * Inputs for multiplication:
10737      *   c_rarg0   - int array elements a
10738      *   c_rarg1   - int array elements b
10739      *   c_rarg2   - int array elements n (the modulus)
10740      *   c_rarg3   - int length
10741      *   c_rarg4   - int inv
10742      *   c_rarg5   - int array elements m (the result)
10743      *
10744      * Inputs for squaring:
10745      *   c_rarg0   - int array elements a
10746      *   c_rarg1   - int array elements n (the modulus)
10747      *   c_rarg2   - int length
10748      *   c_rarg3   - int inv
10749      *   c_rarg4   - int array elements m (the result)
10750      *
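           * Note: the int arrays are repacked (reversed, two ints per 64-bit
           * word) into freshly allocated stack space before the main loops,
           * and Rlen is halved to the length in longwords accordingly.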
10751      */
10752     address generate_multiply() {
10753       Label argh, nothing;
10754       bind(argh);
10755       stop("MontgomeryMultiply total_allocation must be <= 8192");
10756 
10757       align(CodeEntryAlignment);
10758       address entry = pc();
10759 
10760       cbzw(Rlen, nothing);
10761 
10762       enter();
10763 
10764       // Make room.
10765       cmpw(Rlen, 512);
10766       br(Assembler::HI, argh);
10767       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
10768       andr(sp, Ra, -2 * wordSize);
10769 
10770       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
10771 
10772       {
10773         // Copy input args, reversing as we go.  We use Ra as a
10774         // temporary variable.
10775         reverse(Ra, Pa_base, Rlen, t0, t1);
10776         if (!_squaring)
10777           reverse(Ra, Pb_base, Rlen, t0, t1);
10778         reverse(Ra, Pn_base, Rlen, t0, t1);
10779       }
10780 
10781       // Push all callee-saved registers, and also Pm_base, which we'll
10782       // need at the end.
10783       save_regs();
10784 
10785 #ifndef PRODUCT
10786       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
10787       {
10788         ldr(Rn, Address(Pn_base, 0));
10789         mul(Rlo_mn, Rn, inv);
10790         subs(zr, Rlo_mn, -1);
10791         Label ok;
10792         br(EQ, ok); {
10793           stop("broken inverse in Montgomery multiply");
10794         } bind(ok);
10795       }
10796 #endif
10797 
10798       mov(Pm_base, Ra);
10799 
10800       mov(t0, zr);
10801       mov(t1, zr);
10802       mov(t2, zr);
10803 
10804       block_comment("for (int i = 0; i < len; i++) {");
10805       mov(Ri, zr); {
10806         Label loop, end;
10807         cmpw(Ri, Rlen);
10808         br(Assembler::GE, end);
10809 
10810         bind(loop);
10811         pre1(Ri);
10812 
10813         block_comment("  for (j = i; j; j--) {"); {
10814           movw(Rj, Ri);
10815           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
10816         } block_comment("  } // j");
10817 
10818         post1();
10819         addw(Ri, Ri, 1);
10820         cmpw(Ri, Rlen);
10821         br(Assembler::LT, loop);
10822         bind(end);
10823         block_comment("} // i");
10824       }
10825 
10826       block_comment("for (int i = len; i < 2*len; i++) {");
10827       mov(Ri, Rlen); {
10828         Label loop, end;
10829         cmpw(Ri, Rlen, Assembler::LSL, 1);
10830         br(Assembler::GE, end);
10831 
10832         bind(loop);
10833         pre2(Ri, Rlen);
10834 
10835         block_comment("  for (j = len*2-i-1; j; j--) {"); {
10836           lslw(Rj, Rlen, 1);
10837           subw(Rj, Rj, Ri);
10838           subw(Rj, Rj, 1);
10839           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
10840         } block_comment("  } // j");
10841 
10842         post2(Ri, Rlen);
10843         addw(Ri, Ri, 1);
10844         cmpw(Ri, Rlen, Assembler::LSL, 1);
10845         br(Assembler::LT, loop);
10846         bind(end);
10847       }
10848       block_comment("} // i");
10849 
10850       normalize(Rlen);
10851 
10852       mov(Ra, Pm_base);  // Save Pm_base in Ra
10853       restore_regs();  // Restore caller's Pm_base
10854 
10855       // Copy our result into caller's Pm_base
10856       reverse(Pm_base, Ra, Rlen, t0, t1);
10857 
10858       leave();
10859       bind(nothing);
10860       ret(lr);
10861 
10862       return entry;
10863     }
10864     // In C, approximately:
10865 
10866     // void
10867     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
10868     //                     julong Pn_base[], julong Pm_base[],
10869     //                     julong inv, int len) {
10870     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
10871     //   julong *Pa, *Pb, *Pn, *Pm;
10872     //   julong Ra, Rb, Rn, Rm;
10873 
10874     //   int i;
10875 
10876     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
10877 
10878     //   for (i = 0; i < len; i++) {
10879     //     int j;
10880 
10881     //     Pa = Pa_base;
10882     //     Pb = Pb_base + i;
10883     //     Pm = Pm_base;
10884     //     Pn = Pn_base + i;
10885 
10886     //     Ra = *Pa;
10887     //     Rb = *Pb;
10888     //     Rm = *Pm;
10889     //     Rn = *Pn;
10890 
10891     //     int iters = i;
10892     //     for (j = 0; iters--; j++) {
10893     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
10894     //       MACC(Ra, Rb, t0, t1, t2);
10895     //       Ra = *++Pa;
10896     //       Rb = *--Pb;
10897     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
10898     //       MACC(Rm, Rn, t0, t1, t2);
10899     //       Rm = *++Pm;
10900     //       Rn = *--Pn;
10901     //     }
10902 
10903     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
10904     //     MACC(Ra, Rb, t0, t1, t2);
10905     //     *Pm = Rm = t0 * inv;
10906     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
10907     //     MACC(Rm, Rn, t0, t1, t2);
10908 
10909     //     assert(t0 == 0, "broken Montgomery multiply");
10910 
10911     //     t0 = t1; t1 = t2; t2 = 0;
10912     //   }
10913 
10914     //   for (i = len; i < 2*len; i++) {
10915     //     int j;
10916 
10917     //     Pa = Pa_base + i-len;
10918     //     Pb = Pb_base + len;
10919     //     Pm = Pm_base + i-len;
10920     //     Pn = Pn_base + len;
10921 
10922     //     Ra = *++Pa;
10923     //     Rb = *--Pb;
10924     //     Rm = *++Pm;
10925     //     Rn = *--Pn;
10926 
10927     //     int iters = len*2-i-1;
10928     //     for (j = i-len+1; iters--; j++) {
10929     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
10930     //       MACC(Ra, Rb, t0, t1, t2);
10931     //       Ra = *++Pa;
10932     //       Rb = *--Pb;
10933     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
10934     //       MACC(Rm, Rn, t0, t1, t2);
10935     //       Rm = *++Pm;
10936     //       Rn = *--Pn;
10937     //     }
10938 
10939     //     Pm_base[i-len] = t0;
10940     //     t0 = t1; t1 = t2; t2 = 0;
10941     //   }
10942 
10943     //   while (t0)
10944     //     t0 = sub(Pm_base, Pn_base, t0, len);
10945     // }
10946 
10947     /**
10948      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
10949      * multiplies than Montgomery multiplication so it should be up to
10950      * 25% faster.  However, its loop control is more complex and it
10951      * may actually run slower on some machines.
10952      *
10953      * Arguments:
10954      *
10955      * Inputs:
10956      *   c_rarg0   - int array elements a
10957      *   c_rarg1   - int array elements n (the modulus)
10958      *   c_rarg2   - int length
10959      *   c_rarg3   - int inv
10960      *   c_rarg4   - int array elements m (the result)
10961      *
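           * The saving comes from computing each cross product a[j]*a[i-j]
           * only once and accumulating it twice (MACC2 in the C sketch
           * below); only the diagonal terms a[j]*a[j] are accumulated once.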
10962      */
10963     address generate_square() {
10964       Label argh;
10965       bind(argh);
10966       stop("MontgomeryMultiply total_allocation must be <= 8192");
10967 
10968       align(CodeEntryAlignment);
10969       address entry = pc();
10970 
10971       enter();
10972 
10973       // Make room.
10974       cmpw(Rlen, 512);
10975       br(Assembler::HI, argh);
10976       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
10977       andr(sp, Ra, -2 * wordSize);
10978 
10979       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
10980 
10981       {
10982         // Copy input args, reversing as we go.  We use Ra as a
10983         // temporary variable.
10984         reverse(Ra, Pa_base, Rlen, t0, t1);
10985         reverse(Ra, Pn_base, Rlen, t0, t1);
10986       }
10987 
10988       // Push all callee-saved registers, and also Pm_base, which we'll
10989       // need at the end.
10990       save_regs();
10991 
10992       mov(Pm_base, Ra);
10993 
10994       mov(t0, zr);
10995       mov(t1, zr);
10996       mov(t2, zr);
10997 
10998       block_comment("for (int i = 0; i < len; i++) {");
10999       mov(Ri, zr); {
11000         Label loop, end;
11001         bind(loop);
11002         cmp(Ri, Rlen);
11003         br(Assembler::GE, end);
11004 
11005         pre1(Ri);
11006 
11007         block_comment("for (j = (i+1)/2; j; j--) {"); {
11008           add(Rj, Ri, 1);
11009           lsr(Rj, Rj, 1);
11010           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11011         } block_comment("  } // j");
11012 
11013         last_squaring(Ri);
11014 
11015         block_comment("  for (j = i/2; j; j--) {"); {
11016           lsr(Rj, Ri, 1);
11017           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11018         } block_comment("  } // j");
11019 
11020         post1_squaring();
11021         add(Ri, Ri, 1);
11022         cmp(Ri, Rlen);
11023         br(Assembler::LT, loop);
11024 
11025         bind(end);
11026         block_comment("} // i");
11027       }
11028 
11029       block_comment("for (int i = len; i < 2*len; i++) {");
11030       mov(Ri, Rlen); {
11031         Label loop, end;
11032         bind(loop);
11033         cmp(Ri, Rlen, Assembler::LSL, 1);
11034         br(Assembler::GE, end);
11035 
11036         pre2(Ri, Rlen);
11037 
11038         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11039           lsl(Rj, Rlen, 1);
11040           sub(Rj, Rj, Ri);
11041           sub(Rj, Rj, 1);
11042           lsr(Rj, Rj, 1);
11043           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11044         } block_comment("  } // j");
11045 
11046         last_squaring(Ri);
11047 
11048         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11049           lsl(Rj, Rlen, 1);
11050           sub(Rj, Rj, Ri);
11051           lsr(Rj, Rj, 1);
11052           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11053         } block_comment("  } // j");
11054 
11055         post2(Ri, Rlen);
11056         add(Ri, Ri, 1);
11057         cmp(Ri, Rlen, Assembler::LSL, 1);
11058 
11059         br(Assembler::LT, loop);
11060         bind(end);
11061         block_comment("} // i");
11062       }
11063 
11064       normalize(Rlen);
11065 
11066       mov(Ra, Pm_base);  // Save Pm_base in Ra
11067       restore_regs();  // Restore caller's Pm_base
11068 
11069       // Copy our result into caller's Pm_base
11070       reverse(Pm_base, Ra, Rlen, t0, t1);
11071 
11072       leave();
11073       ret(lr);
11074 
11075       return entry;
11076     }
11077     // In C, approximately:
11078 
11079     // void
11080     // montgomery_square(julong Pa_base[], julong Pn_base[],
11081     //                   julong Pm_base[], julong inv, int len) {
11082     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11083     //   julong *Pa, *Pb, *Pn, *Pm;
11084     //   julong Ra, Rb, Rn, Rm;
11085 
11086     //   int i;
11087 
11088     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11089 
11090     //   for (i = 0; i < len; i++) {
11091     //     int j;
11092 
11093     //     Pa = Pa_base;
11094     //     Pb = Pa_base + i;
11095     //     Pm = Pm_base;
11096     //     Pn = Pn_base + i;
11097 
11098     //     Ra = *Pa;
11099     //     Rb = *Pb;
11100     //     Rm = *Pm;
11101     //     Rn = *Pn;
11102 
11103     //     int iters = (i+1)/2;
11104     //     for (j = 0; iters--; j++) {
11105     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11106     //       MACC2(Ra, Rb, t0, t1, t2);
11107     //       Ra = *++Pa;
11108     //       Rb = *--Pb;
11109     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11110     //       MACC(Rm, Rn, t0, t1, t2);
11111     //       Rm = *++Pm;
11112     //       Rn = *--Pn;
11113     //     }
11114     //     if ((i & 1) == 0) {
11115     //       assert(Ra == Pa_base[j], "must be");
11116     //       MACC(Ra, Ra, t0, t1, t2);
11117     //     }
11118     //     iters = i/2;
11119     //     assert(iters == i-j, "must be");
11120     //     for (; iters--; j++) {
11121     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11122     //       MACC(Rm, Rn, t0, t1, t2);
11123     //       Rm = *++Pm;
11124     //       Rn = *--Pn;
11125     //     }
11126 
11127     //     *Pm = Rm = t0 * inv;
11128     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11129     //     MACC(Rm, Rn, t0, t1, t2);
11130 
11131     //     assert(t0 == 0, "broken Montgomery multiply");
11132 
11133     //     t0 = t1; t1 = t2; t2 = 0;
11134     //   }
11135 
11136     //   for (i = len; i < 2*len; i++) {
11137     //     int start = i-len+1;
11138     //     int end = start + (len - start)/2;
11139     //     int j;
11140 
11141     //     Pa = Pa_base + i-len;
11142     //     Pb = Pa_base + len;
11143     //     Pm = Pm_base + i-len;
11144     //     Pn = Pn_base + len;
11145 
11146     //     Ra = *++Pa;
11147     //     Rb = *--Pb;
11148     //     Rm = *++Pm;
11149     //     Rn = *--Pn;
11150 
11151     //     int iters = (2*len-i-1)/2;
11152     //     assert(iters == end-start, "must be");
11153     //     for (j = start; iters--; j++) {
11154     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11155     //       MACC2(Ra, Rb, t0, t1, t2);
11156     //       Ra = *++Pa;
11157     //       Rb = *--Pb;
11158     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11159     //       MACC(Rm, Rn, t0, t1, t2);
11160     //       Rm = *++Pm;
11161     //       Rn = *--Pn;
11162     //     }
11163     //     if ((i & 1) == 0) {
11164     //       assert(Ra == Pa_base[j], "must be");
11165     //       MACC(Ra, Ra, t0, t1, t2);
11166     //     }
11167     //     iters =  (2*len-i)/2;
11168     //     assert(iters == len-j, "must be");
11169     //     for (; iters--; j++) {
11170     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11171     //       MACC(Rm, Rn, t0, t1, t2);
11172     //       Rm = *++Pm;
11173     //       Rn = *--Pn;
11174     //     }
11175     //     Pm_base[i-len] = t0;
11176     //     t0 = t1; t1 = t2; t2 = 0;
11177     //   }
11178 
11179     //   while (t0)
11180     //     t0 = sub(Pm_base, Pn_base, t0, len);
11181     // }
11182   };
11183 
11184   // Initialization
11185   void generate_initial_stubs() {
11186     // Generate the initial stubs and initialize the entry points
11187 
11188     // Entry points that exist on all platforms. Note: this is code
11189     // that could be shared among different platforms; however, the
11190     // benefit seems to be smaller than the disadvantage of having a
11191     // much more complicated generator structure. See also the comment
11192     // in stubRoutines.hpp.
11193 
11194     StubRoutines::_forward_exception_entry = generate_forward_exception();
11195 
11196     StubRoutines::_call_stub_entry =
11197       generate_call_stub(StubRoutines::_call_stub_return_address);
11198 
11199     // This entry is referenced by megamorphic calls.
11200     StubRoutines::_catch_exception_entry = generate_catch_exception();
11201 
11202     // Initialize table for copy memory (arraycopy) check.
11203     if (UnsafeMemoryAccess::_table == nullptr) {
11204       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11205     }
11206 
11207     if (UseCRC32Intrinsics) {
11208       // Set the table address before generating the stubs that use it
11209       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
11210       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11211     }
11212 
11213     if (UseCRC32CIntrinsics) {
11214       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11215     }
11216 
11217     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11218       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11219     }
11220 
11221     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11222       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11223     }
11224 
11225     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11226         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11227       StubRoutines::_hf2f = generate_float16ToFloat();
11228       StubRoutines::_f2hf = generate_floatToFloat16();
11229     }
11230   }
11231 
11232   void generate_continuation_stubs() {
11233     // Continuation stubs:
11234     StubRoutines::_cont_thaw          = generate_cont_thaw();
11235     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11236     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11237     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11238   }
11239 
11240   void generate_final_stubs() {
11241     // support for verify_oop (must happen after universe_init)
11242     if (VerifyOops) {
11243       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11244     }
11245 
11246     // arraycopy stubs used by compilers
11247     generate_arraycopy_stubs();
11248 
11249     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11250 
11251     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11252 
11253     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11254     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11255 
11256 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11257 
11258     generate_atomic_entry_points();
11259 
11260 #endif // LINUX && !__ARM_FEATURE_ATOMICS
11261 
11262 #ifdef COMPILER2
11263     if (UseSecondarySupersTable) {
11264       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11265       if (! InlineSecondarySupersTest) {
11266         generate_lookup_secondary_supers_table_stub();
11267       }
11268     }
11269 #endif
11270 
11271     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11272   }
11273 
11274   void generate_compiler_stubs() {
11275 #if COMPILER2_OR_JVMCI
11276 
11277     if (UseSVE == 0) {
11278       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
11279     }
11280 
11281     // array equals stub for large arrays.
11282     if (!UseSimpleArrayEquals) {
11283       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11284     }
11285 
11286     // arrays_hashcode stub for large arrays.
11287     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11288     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11289     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11290     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11291     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11292 
11293     // byte_array_inflate stub for large arrays.
11294     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11295 
11296     // countPositives stub for large arrays.
11297     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11298 
11299     generate_compare_long_strings();
11300 
11301     generate_string_indexof_stubs();
11302 
11303 #ifdef COMPILER2
11304     if (UseMultiplyToLenIntrinsic) {
11305       StubRoutines::_multiplyToLen = generate_multiplyToLen();
11306     }
11307 
11308     if (UseSquareToLenIntrinsic) {
11309       StubRoutines::_squareToLen = generate_squareToLen();
11310     }
11311 
11312     if (UseMulAddIntrinsic) {
11313       StubRoutines::_mulAdd = generate_mulAdd();
11314     }
11315 
11316     if (UseSIMDForBigIntegerShiftIntrinsics) {
11317       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11318       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
11319     }
11320 
11321     if (UseMontgomeryMultiplyIntrinsic) {
11322       StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
11323       StubCodeMark mark(this, stub_id);
11324       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11325       StubRoutines::_montgomeryMultiply = g.generate_multiply();
11326     }
11327 
11328     if (UseMontgomerySquareIntrinsic) {
11329       StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
11330       StubCodeMark mark(this, stub_id);
11331       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11332       // We use generate_multiply() rather than generate_square()
11333       // because it's faster for the sizes of modulus we care about.
11334       StubRoutines::_montgomerySquare = g.generate_multiply();
11335     }
11336 
11337 #endif // COMPILER2
11338 
11339     if (UseChaCha20Intrinsics) {
11340       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11341     }
11342 
11343     if (UseKyberIntrinsics) {
11344       StubRoutines::_kyberNtt = generate_kyberNtt();
11345       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11346       StubRoutines::_kyberNttMult = generate_kyberNttMult();
11347       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11348       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11349       StubRoutines::_kyber12To16 = generate_kyber12To16();
11350       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11351     }
11352 
11353     if (UseDilithiumIntrinsics) {
11354       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11355       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11356       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11357       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11358       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11359     }
11360 
11361     if (UseBASE64Intrinsics) {
11362         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11363         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11364     }
11365 
11366     // data cache line writeback
11367     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11368     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11369 
11370     if (UseAESIntrinsics) {
11371       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11372       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11373       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11374       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11375       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11376     }
11377     if (UseGHASHIntrinsics) {
11378       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11379       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11380     }
11381     if (UseAESIntrinsics && UseGHASHIntrinsics) {
11382       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11383     }
11384 
11385     if (UseMD5Intrinsics) {
11386       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
11387       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
11388     }
11389     if (UseSHA1Intrinsics) {
11390       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
11391       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
11392     }
11393     if (UseSHA256Intrinsics) {
11394       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
11395       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
11396     }
11397     if (UseSHA512Intrinsics) {
11398       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
11399       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
11400     }
11401     if (UseSHA3Intrinsics) {
11402       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
11403       StubRoutines::_double_keccak         = generate_double_keccak();
11404       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
11405     }
11406 
11407     if (UsePoly1305Intrinsics) {
11408       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11409     }
11410 
11411     // generate Adler32 intrinsics code
11412     if (UseAdler32Intrinsics) {
11413       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11414     }
11415 
11416 #endif // COMPILER2_OR_JVMCI
11417   }
11418 
11419  public:
11420   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
11421     switch(blob_id) {
11422     case initial_id:
11423       generate_initial_stubs();
11424       break;
11425     case continuation_id:
11426       generate_continuation_stubs();
11427       break;
11428     case compiler_id:
11429       generate_compiler_stubs();
11430       break;
11431     case final_id:
11432       generate_final_stubs();
11433       break;
11434     default:
11435       fatal("unexpected blob id: %d", blob_id);
11436       break;
11437     };
11438   }
11439 }; // end class declaration
11440 
11441 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
11442   StubGenerator g(code, blob_id);
11443 }
11444 
11445 
11446 #if defined (LINUX)
11447 
11448 // Define pointers to atomic stubs and initialize them to point to the
11449 // code in atomic_aarch64.S.
11450 
11451 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
11452   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
11453     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
11454   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
11455     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
11456 
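      // For illustration, the first instantiation below,
      // DEFAULT_ATOMIC_OP(fetch_add, 4, ), expands to approximately:
      //
      //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
      //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
      //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
      //     = aarch64_atomic_fetch_add_4_default_impl;
      //
      // i.e. each use declares the default implementation from
      // atomic_aarch64.S and initializes the corresponding function pointer,
      // which generate_atomic_entry_points() (called above) may later
      // repoint to a generated stub.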
11457 DEFAULT_ATOMIC_OP(fetch_add, 4, )
11458 DEFAULT_ATOMIC_OP(fetch_add, 8, )
11459 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
11460 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
11461 DEFAULT_ATOMIC_OP(xchg, 4, )
11462 DEFAULT_ATOMIC_OP(xchg, 8, )
11463 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
11464 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
11465 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
11466 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
11467 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
11468 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
11469 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
11470 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
11471 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
11472 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
11473 
11474 #undef DEFAULT_ATOMIC_OP
11475 
11476 #endif // LINUX