1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomic.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
   87 #ifdef PRODUCT
   88 #define inc_counter_np(counter) ((void)0)
   89 #else
   90   void inc_counter_np_(uint& counter) {
   91     __ incrementw(ExternalAddress((address)&counter));
   92   }
   93 #define inc_counter_np(counter) \
   94   BLOCK_COMMENT("inc_counter " #counter); \
   95   inc_counter_np_(counter);
   96 #endif
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
  112   //
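         // From the VM side this stub is reached through StubRoutines::call_stub(),
         // whose function-pointer type (the CallStub typedef in stubRoutines.hpp)
         // looks roughly like this, with one argument per c_rarg listed above:
         //
         //   typedef void (*CallStub)(address   link,          // call wrapper
         //                            intptr_t* result,
         //                            int       result_type,   // BasicType
         //                            Method*   method,
         //                            address   entry_point,
         //                            intptr_t* parameters,
         //                            int       size_of_parameters,
         //                            TRAPS);                  // current thread
         //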
   113   // we save r30 (lr) as the return PC at the base of the frame and
   114   // link r29 (fp) below it as the frame pointer, then install sp
   115   // (r31) into fp.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  124   // we don't need to save r8 which C uses as an indirect result location
  125   // return register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
  130   // we don't need to save r16-18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
   144   // -28 [ saved Floating-point Control Register ] <--- sp_after_call
   145   // -26 [ saved v15            ]
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
  173 
  174   // Call stub stack layout word offsets from fp
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
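         // Each *_off value above is a signed word offset from the saved frame
         // pointer, so a slot in the diagram is addressed as, for example,
         //
         //   Address(rfp, thread_off * wordSize)    // i.e. rfp - 8
         //
         // which is exactly how the named Address constants in
         // generate_call_stub() below are built.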
  199 
  200   address generate_call_stub(address& return_address) {
  201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  203            "adjust this code");
  204 
  205     StubId stub_id = StubId::stubgen_call_stub_id;
  206     StubCodeMark mark(this, stub_id);
  207     address start = __ pc();
  208 
  209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  210 
  211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  213     const Address result        (rfp, result_off         * wordSize);
  214     const Address result_type   (rfp, result_type_off    * wordSize);
  215     const Address method        (rfp, method_off         * wordSize);
  216     const Address entry_point   (rfp, entry_point_off    * wordSize);
  217     const Address parameter_size(rfp, parameter_size_off * wordSize);
  218 
  219     const Address thread        (rfp, thread_off         * wordSize);
  220 
  221     const Address d15_save      (rfp, d15_off * wordSize);
  222     const Address d13_save      (rfp, d13_off * wordSize);
  223     const Address d11_save      (rfp, d11_off * wordSize);
  224     const Address d9_save       (rfp, d9_off * wordSize);
  225 
  226     const Address r28_save      (rfp, r28_off * wordSize);
  227     const Address r26_save      (rfp, r26_off * wordSize);
  228     const Address r24_save      (rfp, r24_off * wordSize);
  229     const Address r22_save      (rfp, r22_off * wordSize);
  230     const Address r20_save      (rfp, r20_off * wordSize);
  231 
  232     // stub code
  233 
  234     address aarch64_entry = __ pc();
  235 
  236     // set up frame and move sp to end of save area
  237     __ enter();
  238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  239 
  240     // save register parameters and Java scratch/global registers
  241     // n.b. we save thread even though it gets installed in
  242     // rthread because we want to sanity check rthread later
  243     __ str(c_rarg7,  thread);
  244     __ strw(c_rarg6, parameter_size);
  245     __ stp(c_rarg4, c_rarg5,  entry_point);
  246     __ stp(c_rarg2, c_rarg3,  result_type);
  247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  248 
  249     __ stp(r20, r19,   r20_save);
  250     __ stp(r22, r21,   r22_save);
  251     __ stp(r24, r23,   r24_save);
  252     __ stp(r26, r25,   r26_save);
  253     __ stp(r28, r27,   r28_save);
  254 
  255     __ stpd(v9,  v8,   d9_save);
  256     __ stpd(v11, v10,  d11_save);
  257     __ stpd(v13, v12,  d13_save);
  258     __ stpd(v15, v14,  d15_save);
  259 
  260     __ get_fpcr(rscratch1);
  261     __ str(rscratch1, fpcr_save);
  262     // Set FPCR to the state we need. We do want Round to Nearest. We
  263     // don't want non-IEEE rounding modes or floating-point traps.
  264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  266     __ set_fpcr(rscratch1);
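           // For reference: in the AArch64 FPCR, DN is bit 25, FZ bit 24 and
           // RMode bits 23:22 (the 4-bit field cleared by the first bfi),
           // while bits 12:8 are the IXE/UFE/OFE/DZE/IOE trap-enable bits
           // cleared by the second.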
  267 
  268     // install Java thread in global register now we have saved
  269     // whatever value it held
  270     __ mov(rthread, c_rarg7);
  271     // And method
  272     __ mov(rmethod, c_rarg3);
  273 
  274     // set up the heapbase register
  275     __ reinit_heapbase();
  276 
  277 #ifdef ASSERT
  278     // make sure we have no pending exceptions
  279     {
  280       Label L;
  281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  282       __ cmp(rscratch1, (u1)NULL_WORD);
  283       __ br(Assembler::EQ, L);
  284       __ stop("StubRoutines::call_stub: entered with pending exception");
  285       __ BIND(L);
  286     }
  287 #endif
  288     // pass parameters if any
  289     __ mov(esp, sp);
  290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  291     __ andr(sp, rscratch1, -2 * wordSize);
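           // sp must stay 16-byte aligned on AArch64, so round the parameter
           // area down to a multiple of 2 words; e.g. 3 parameters need
           // 24 bytes but we drop sp by 32.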
  292 
  293     BLOCK_COMMENT("pass parameters if any");
  294     Label parameters_done;
  295     // parameter count is still in c_rarg6
  296     // and parameter pointer identifying param 1 is in c_rarg5
  297     __ cbzw(c_rarg6, parameters_done);
  298 
  299     address loop = __ pc();
  300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  301     __ subsw(c_rarg6, c_rarg6, 1);
  302     __ push(rscratch1);
  303     __ br(Assembler::GT, loop);
  304 
  305     __ BIND(parameters_done);
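           // The arguments now occupy a contiguous block below the register
           // save area, with parameter 1 at the highest address and the last
           // parameter at the address sp currently points to.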
  306 
   307     // call Java entry -- passing Method* and current sp
  308     //      rmethod: Method*
  309     //      r19_sender_sp: sender sp
  310     BLOCK_COMMENT("call Java function");
  311     __ mov(r19_sender_sp, sp);
  312     __ blr(c_rarg4);
  313 
  314     // we do this here because the notify will already have been done
  315     // if we get to the next instruction via an exception
  316     //
  317     // n.b. adding this instruction here affects the calculation of
  318     // whether or not a routine returns to the call stub (used when
  319     // doing stack walks) since the normal test is to check the return
  320     // pc against the address saved below. so we may need to allow for
  321     // this extra instruction in the check.
  322 
  323     // save current address for use by exception handling code
  324 
  325     return_address = __ pc();
  326 
  327     // store result depending on type (everything that is not
  328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  329     // n.b. this assumes Java returns an integral result in r0
  330     // and a floating result in j_farg0
  331     __ ldr(j_rarg2, result);
  332     Label is_long, is_float, is_double, exit;
  333     __ ldr(j_rarg1, result_type);
  334     __ cmp(j_rarg1, (u1)T_OBJECT);
  335     __ br(Assembler::EQ, is_long);
  336     __ cmp(j_rarg1, (u1)T_LONG);
  337     __ br(Assembler::EQ, is_long);
  338     __ cmp(j_rarg1, (u1)T_FLOAT);
  339     __ br(Assembler::EQ, is_float);
  340     __ cmp(j_rarg1, (u1)T_DOUBLE);
  341     __ br(Assembler::EQ, is_double);
  342 
  343     // handle T_INT case
  344     __ strw(r0, Address(j_rarg2));
  345 
  346     __ BIND(exit);
  347 
  348     // pop parameters
  349     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  350 
  351 #ifdef ASSERT
  352     // verify that threads correspond
  353     {
  354       Label L, S;
  355       __ ldr(rscratch1, thread);
  356       __ cmp(rthread, rscratch1);
  357       __ br(Assembler::NE, S);
  358       __ get_thread(rscratch1);
  359       __ cmp(rthread, rscratch1);
  360       __ br(Assembler::EQ, L);
  361       __ BIND(S);
  362       __ stop("StubRoutines::call_stub: threads must correspond");
  363       __ BIND(L);
  364     }
  365 #endif
  366 
  367     __ pop_cont_fastpath(rthread);
  368 
  369     // restore callee-save registers
  370     __ ldpd(v15, v14,  d15_save);
  371     __ ldpd(v13, v12,  d13_save);
  372     __ ldpd(v11, v10,  d11_save);
  373     __ ldpd(v9,  v8,   d9_save);
  374 
  375     __ ldp(r28, r27,   r28_save);
  376     __ ldp(r26, r25,   r26_save);
  377     __ ldp(r24, r23,   r24_save);
  378     __ ldp(r22, r21,   r22_save);
  379     __ ldp(r20, r19,   r20_save);
  380 
  381     // restore fpcr
  382     __ ldr(rscratch1,  fpcr_save);
  383     __ set_fpcr(rscratch1);
  384 
  385     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  386     __ ldrw(c_rarg2, result_type);
  387     __ ldr(c_rarg3,  method);
  388     __ ldp(c_rarg4, c_rarg5,  entry_point);
  389     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  390 
  391     // leave frame and return to caller
  392     __ leave();
  393     __ ret(lr);
  394 
  395     // handle return types different from T_INT
  396 
  397     __ BIND(is_long);
  398     __ str(r0, Address(j_rarg2, 0));
  399     __ br(Assembler::AL, exit);
  400 
  401     __ BIND(is_float);
  402     __ strs(j_farg0, Address(j_rarg2, 0));
  403     __ br(Assembler::AL, exit);
  404 
  405     __ BIND(is_double);
  406     __ strd(j_farg0, Address(j_rarg2, 0));
  407     __ br(Assembler::AL, exit);
  408 
  409     return start;
  410   }
  411 
  412   // Return point for a Java call if there's an exception thrown in
  413   // Java code.  The exception is caught and transformed into a
  414   // pending exception stored in JavaThread that can be tested from
  415   // within the VM.
  416   //
   417   // Note: Usually the parameters are removed by the callee. In case
   418   // of an exception crossing an activation frame boundary, that is
   419   // not the case if the callee is compiled code => need to setup the
   420   // sp.
  421   //
  422   // r0: exception oop
  423 
  424   address generate_catch_exception() {
  425     StubId stub_id = StubId::stubgen_catch_exception_id;
  426     StubCodeMark mark(this, stub_id);
  427     address start = __ pc();
  428 
  429     // same as in generate_call_stub():
  430     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  431     const Address thread        (rfp, thread_off         * wordSize);
  432 
  433 #ifdef ASSERT
  434     // verify that threads correspond
  435     {
  436       Label L, S;
  437       __ ldr(rscratch1, thread);
  438       __ cmp(rthread, rscratch1);
  439       __ br(Assembler::NE, S);
  440       __ get_thread(rscratch1);
  441       __ cmp(rthread, rscratch1);
  442       __ br(Assembler::EQ, L);
  443       __ bind(S);
  444       __ stop("StubRoutines::catch_exception: threads must correspond");
  445       __ bind(L);
  446     }
  447 #endif
  448 
  449     // set pending exception
  450     __ verify_oop(r0);
  451 
  452     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  453     __ mov(rscratch1, (address)__FILE__);
  454     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  455     __ movw(rscratch1, (int)__LINE__);
  456     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  457 
  458     // complete return to VM
  459     assert(StubRoutines::_call_stub_return_address != nullptr,
  460            "_call_stub_return_address must have been generated before");
  461     __ b(StubRoutines::_call_stub_return_address);
  462 
  463     return start;
  464   }
  465 
  466   // Continuation point for runtime calls returning with a pending
  467   // exception.  The pending exception check happened in the runtime
  468   // or native call stub.  The pending exception in Thread is
  469   // converted into a Java-level exception.
  470   //
  471   // Contract with Java-level exception handlers:
  472   // r0: exception
  473   // r3: throwing pc
  474   //
  475   // NOTE: At entry of this stub, exception-pc must be in LR !!
  476 
  477   // NOTE: this is always used as a jump target within generated code
   478   // so it just needs to be generated code with no prolog
  479 
  480   address generate_forward_exception() {
  481     StubId stub_id = StubId::stubgen_forward_exception_id;
  482     StubCodeMark mark(this, stub_id);
  483     address start = __ pc();
  484 
  485     // Upon entry, LR points to the return address returning into
  486     // Java (interpreted or compiled) code; i.e., the return address
  487     // becomes the throwing pc.
  488     //
  489     // Arguments pushed before the runtime call are still on the stack
  490     // but the exception handler will reset the stack pointer ->
  491     // ignore them.  A potential result in registers can be ignored as
  492     // well.
  493 
  494 #ifdef ASSERT
  495     // make sure this code is only executed if there is a pending exception
  496     {
  497       Label L;
  498       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  499       __ cbnz(rscratch1, L);
  500       __ stop("StubRoutines::forward exception: no pending exception (1)");
  501       __ bind(L);
  502     }
  503 #endif
  504 
  505     // compute exception handler into r19
  506 
  507     // call the VM to find the handler address associated with the
  508     // caller address. pass thread in r0 and caller pc (ret address)
  509     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  510     // the stack.
  511     __ mov(c_rarg1, lr);
  512     // lr will be trashed by the VM call so we move it to R19
  513     // (callee-saved) because we also need to pass it to the handler
  514     // returned by this call.
  515     __ mov(r19, lr);
  516     BLOCK_COMMENT("call exception_handler_for_return_address");
  517     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  518                          SharedRuntime::exception_handler_for_return_address),
  519                     rthread, c_rarg1);
  520     // Reinitialize the ptrue predicate register, in case the external runtime
  521     // call clobbers ptrue reg, as we may return to SVE compiled code.
  522     __ reinitialize_ptrue();
  523 
   524     // we should not really care that lr no longer holds the throwing
   525     // pc. we saved the value the handler needs in r19 so we can
   526     // just copy it to r3. however, the C2 handler will push its own
   527     // frame and then call into the VM, and the VM code asserts that
   528     // the PC for the frame above the handler belongs to a compiled
   529     // Java method. So, we restore lr here to satisfy that assert.
  530     __ mov(lr, r19);
  531     // setup r0 & r3 & clear pending exception
  532     __ mov(r3, r19);
  533     __ mov(r19, r0);
  534     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  535     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  536 
  537 #ifdef ASSERT
  538     // make sure exception is set
  539     {
  540       Label L;
  541       __ cbnz(r0, L);
  542       __ stop("StubRoutines::forward exception: no pending exception (2)");
  543       __ bind(L);
  544     }
  545 #endif
  546 
  547     // continue at exception handler
  548     // r0: exception
  549     // r3: throwing pc
  550     // r19: exception handler
  551     __ verify_oop(r0);
  552     __ br(r19);
  553 
  554     return start;
  555   }
  556 
  557   // Non-destructive plausibility checks for oops
  558   //
  559   // Arguments:
  560   //    r0: oop to verify
  561   //    rscratch1: error message
  562   //
  563   // Stack after saving c_rarg3:
  564   //    [tos + 0]: saved c_rarg3
  565   //    [tos + 1]: saved c_rarg2
  566   //    [tos + 2]: saved lr
  567   //    [tos + 3]: saved rscratch2
  568   //    [tos + 4]: saved r0
  569   //    [tos + 5]: saved rscratch1
  570   address generate_verify_oop() {
  571     StubId stub_id = StubId::stubgen_verify_oop_id;
  572     StubCodeMark mark(this, stub_id);
  573     address start = __ pc();
  574 
  575     Label exit, error;
  576 
  577     // save c_rarg2 and c_rarg3
  578     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  579 
  580     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  581     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  582     __ ldr(c_rarg3, Address(c_rarg2));
  583     __ add(c_rarg3, c_rarg3, 1);
  584     __ str(c_rarg3, Address(c_rarg2));
  585 
  586     // object is in r0
  587     // make sure object is 'reasonable'
  588     __ cbz(r0, exit); // if obj is null it is OK
  589 
  590     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  591     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  592 
  593     // return if everything seems ok
  594     __ bind(exit);
  595 
  596     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  597     __ ret(lr);
  598 
  599     // handle errors
  600     __ bind(error);
  601     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  602 
  603     __ push(RegSet::range(r0, r29), sp);
  604     // debug(char* msg, int64_t pc, int64_t regs[])
  605     __ mov(c_rarg0, rscratch1);      // pass address of error message
  606     __ mov(c_rarg1, lr);             // pass return address
  607     __ mov(c_rarg2, sp);             // pass address of regs on stack
  608 #ifndef PRODUCT
  609     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  610 #endif
  611     BLOCK_COMMENT("call MacroAssembler::debug");
  612     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  613     __ blr(rscratch1);
  614     __ hlt(0);
  615 
  616     return start;
  617   }
  618 
  619   // Generate indices for iota vector.
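         // The table emitted below is a sequence of 128-bit rows holding the
         // ascending lane indices 0, 1, 2, ... for each integer element size
         // (byte, halfword, word, doubleword), followed by the same sequence
         // as single- and double-precision floats; each pair of emit_data64
         // calls supplies the low and high half of one row.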
  620   address generate_iota_indices(StubId stub_id) {
  621     __ align(CodeEntryAlignment);
  622     StubCodeMark mark(this, stub_id);
  623     address start = __ pc();
  624     // B
  625     __ emit_data64(0x0706050403020100, relocInfo::none);
  626     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  627     // H
  628     __ emit_data64(0x0003000200010000, relocInfo::none);
  629     __ emit_data64(0x0007000600050004, relocInfo::none);
  630     // S
  631     __ emit_data64(0x0000000100000000, relocInfo::none);
  632     __ emit_data64(0x0000000300000002, relocInfo::none);
  633     // D
  634     __ emit_data64(0x0000000000000000, relocInfo::none);
  635     __ emit_data64(0x0000000000000001, relocInfo::none);
  636     // S - FP
  637     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  638     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  639     // D - FP
  640     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  641     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  642     return start;
  643   }
  644 
  645   // The inner part of zero_words().  This is the bulk operation,
  646   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  647   // caller is responsible for zeroing the last few words.
  648   //
  649   // Inputs:
  650   // r10: the HeapWord-aligned base address of an array to zero.
  651   // r11: the count in HeapWords, r11 > 0.
  652   //
  653   // Returns r10 and r11, adjusted for the caller to clear.
  654   // r10: the base address of the tail of words left to clear.
  655   // r11: the number of words in the tail.
  656   //      r11 < MacroAssembler::zero_words_block_size.
  657 
  658   address generate_zero_blocks() {
  659     Label done;
  660     Label base_aligned;
  661 
  662     Register base = r10, cnt = r11;
  663 
  664     __ align(CodeEntryAlignment);
  665     StubId stub_id = StubId::stubgen_zero_blocks_id;
  666     StubCodeMark mark(this, stub_id);
  667     address start = __ pc();
  668 
  669     if (UseBlockZeroing) {
  670       int zva_length = VM_Version::zva_length();
  671 
  672       // Ensure ZVA length can be divided by 16. This is required by
  673       // the subsequent operations.
  674       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  675 
  676       __ tbz(base, 3, base_aligned);
  677       __ str(zr, Address(__ post(base, 8)));
  678       __ sub(cnt, cnt, 1);
  679       __ bind(base_aligned);
  680 
  681       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  682       // alignment.
  683       Label small;
  684       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
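             // n.b. cnt is in words while zva_length and BlockZeroingLowLimit
             // are in bytes, hence the >> 3 below. zva_length is the amount
             // cleared by a single DC ZVA instruction (commonly 64 bytes).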
  685       __ subs(rscratch1, cnt, low_limit >> 3);
  686       __ br(Assembler::LT, small);
  687       __ zero_dcache_blocks(base, cnt);
  688       __ bind(small);
  689     }
  690 
  691     {
  692       // Number of stp instructions we'll unroll
  693       const int unroll =
  694         MacroAssembler::zero_words_block_size / 2;
  695       // Clear the remaining blocks.
  696       Label loop;
  697       __ subs(cnt, cnt, unroll * 2);
  698       __ br(Assembler::LT, done);
  699       __ bind(loop);
  700       for (int i = 0; i < unroll; i++)
  701         __ stp(zr, zr, __ post(base, 16));
  702       __ subs(cnt, cnt, unroll * 2);
  703       __ br(Assembler::GE, loop);
  704       __ bind(done);
  705       __ add(cnt, cnt, unroll * 2);
  706     }
  707 
  708     __ ret(lr);
  709 
  710     return start;
  711   }
  712 
  713 
  714   typedef enum {
  715     copy_forwards = 1,
  716     copy_backwards = -1
  717   } copy_direction;
  718 
  719   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  720   // for arraycopy stubs.
  721   class ArrayCopyBarrierSetHelper : StackObj {
  722     BarrierSetAssembler* _bs_asm;
  723     MacroAssembler* _masm;
  724     DecoratorSet _decorators;
  725     BasicType _type;
  726     Register _gct1;
  727     Register _gct2;
  728     Register _gct3;
  729     FloatRegister _gcvt1;
  730     FloatRegister _gcvt2;
  731     FloatRegister _gcvt3;
  732 
  733   public:
  734     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  735                               DecoratorSet decorators,
  736                               BasicType type,
  737                               Register gct1,
  738                               Register gct2,
  739                               Register gct3,
  740                               FloatRegister gcvt1,
  741                               FloatRegister gcvt2,
  742                               FloatRegister gcvt3)
  743       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  744         _masm(masm),
  745         _decorators(decorators),
  746         _type(type),
  747         _gct1(gct1),
  748         _gct2(gct2),
  749         _gct3(gct3),
  750         _gcvt1(gcvt1),
  751         _gcvt2(gcvt2),
  752         _gcvt3(gcvt3) {
  753     }
  754 
  755     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  756       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  757                             dst1, dst2, src,
  758                             _gct1, _gct2, _gcvt1);
  759     }
  760 
  761     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  762       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  763                              dst, src1, src2,
  764                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  765     }
  766 
  767     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  768       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  769                             dst1, dst2, src,
  770                             _gct1);
  771     }
  772 
  773     void copy_store_at_16(Address dst, Register src1, Register src2) {
  774       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  775                              dst, src1, src2,
  776                              _gct1, _gct2, _gct3);
  777     }
  778 
  779     void copy_load_at_8(Register dst, Address src) {
  780       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  781                             dst, noreg, src,
  782                             _gct1);
  783     }
  784 
  785     void copy_store_at_8(Address dst, Register src) {
  786       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  787                              dst, src, noreg,
  788                              _gct1, _gct2, _gct3);
  789     }
  790   };
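         // Routing the loads and stores above through the BarrierSetAssembler
         // lets a collector (e.g. ZGC) substitute its own access code for oop
         // arraycopies, while plain primitive copies degenerate to ordinary
         // scalar or SIMD load/store pairs.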
  791 
  792   // Bulk copy of blocks of 8 words.
  793   //
  794   // count is a count of words.
  795   //
  796   // Precondition: count >= 8
  797   //
  798   // Postconditions:
  799   //
  800   // The least significant bit of count contains the remaining count
  801   // of words to copy.  The rest of count is trash.
  802   //
  803   // s and d are adjusted to point to the remaining words to copy
  804   //
  805   void generate_copy_longs(StubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
  806     BasicType type;
  807     copy_direction direction;
  808 
  809     switch (stub_id) {
  810     case StubId::stubgen_copy_byte_f_id:
  811       direction = copy_forwards;
  812       type = T_BYTE;
  813       break;
  814     case StubId::stubgen_copy_byte_b_id:
  815       direction = copy_backwards;
  816       type = T_BYTE;
  817       break;
  818     case StubId::stubgen_copy_oop_f_id:
  819       direction = copy_forwards;
  820       type = T_OBJECT;
  821       break;
  822     case StubId::stubgen_copy_oop_b_id:
  823       direction = copy_backwards;
  824       type = T_OBJECT;
  825       break;
  826     case StubId::stubgen_copy_oop_uninit_f_id:
  827       direction = copy_forwards;
  828       type = T_OBJECT;
  829       break;
  830     case StubId::stubgen_copy_oop_uninit_b_id:
  831       direction = copy_backwards;
  832       type = T_OBJECT;
  833       break;
  834     default:
  835       ShouldNotReachHere();
  836     }
  837 
  838     int unit = wordSize * direction;
  839     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
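           // unit is +wordSize or -wordSize depending on direction. bias is
           // the amount by which s and d are pre-adjusted for a forwards copy
           // so that each 8-word block can be addressed with positive
           // multiples of unit, ending with a pre-indexed access at 8 * unit
           // that advances the pointer to the next block.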
  840 
  841     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  842       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  843     const Register stride = r14;
  844     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  845     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  846     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  847 
  848     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  849     assert_different_registers(s, d, count, rscratch1, rscratch2);
  850 
  851     Label again, drain;
  852 
  853     __ align(CodeEntryAlignment);
  854 
  855     StubCodeMark mark(this, stub_id);
  856 
  857     __ bind(start);
  858 
  859     Label unaligned_copy_long;
  860     if (AvoidUnalignedAccesses) {
  861       __ tbnz(d, 3, unaligned_copy_long);
  862     }
  863 
  864     if (direction == copy_forwards) {
  865       __ sub(s, s, bias);
  866       __ sub(d, d, bias);
  867     }
  868 
  869 #ifdef ASSERT
  870     // Make sure we are never given < 8 words
  871     {
  872       Label L;
  873       __ cmp(count, (u1)8);
  874       __ br(Assembler::GE, L);
   875       __ stop("generate_copy_longs called with < 8 words");
  876       __ bind(L);
  877     }
  878 #endif
  879 
  880     // Fill 8 registers
  881     if (UseSIMDForMemoryOps) {
  882       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  883       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  884     } else {
  885       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  886       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  887       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  888       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  889     }
  890 
  891     __ subs(count, count, 16);
  892     __ br(Assembler::LO, drain);
  893 
  894     int prefetch = PrefetchCopyIntervalInBytes;
  895     bool use_stride = false;
  896     if (direction == copy_backwards) {
  897        use_stride = prefetch > 256;
  898        prefetch = -prefetch;
  899        if (use_stride) __ mov(stride, prefetch);
  900     }
  901 
  902     __ bind(again);
  903 
  904     if (PrefetchCopyIntervalInBytes > 0)
  905       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  906 
  907     if (UseSIMDForMemoryOps) {
  908       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  909       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  910       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  911       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  912     } else {
  913       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  914       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  915       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  916       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  917       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  918       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  919       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  920       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  921     }
  922 
  923     __ subs(count, count, 8);
  924     __ br(Assembler::HS, again);
  925 
  926     // Drain
  927     __ bind(drain);
  928     if (UseSIMDForMemoryOps) {
  929       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  930       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  931     } else {
  932       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  933       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  934       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936     }
  937 
  938     {
  939       Label L1, L2;
  940       __ tbz(count, exact_log2(4), L1);
  941       if (UseSIMDForMemoryOps) {
  942         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  943         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  944       } else {
  945         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  946         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  947         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  948         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  949       }
  950       __ bind(L1);
  951 
  952       if (direction == copy_forwards) {
  953         __ add(s, s, bias);
  954         __ add(d, d, bias);
  955       }
  956 
  957       __ tbz(count, 1, L2);
  958       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  959       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  960       __ bind(L2);
  961     }
  962 
  963     __ ret(lr);
  964 
  965     if (AvoidUnalignedAccesses) {
  966       Label drain, again;
  967       // Register order for storing. Order is different for backward copy.
  968 
  969       __ bind(unaligned_copy_long);
  970 
  971       // source address is even aligned, target odd aligned
  972       //
  973       // when forward copying word pairs we read long pairs at offsets
  974       // {0, 2, 4, 6} (in long words). when backwards copying we read
  975       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  976       // address by -2 in the forwards case so we can compute the
  977       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  978       // or -1.
  979       //
  980       // when forward copying we need to store 1 word, 3 pairs and
  981       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
   982       // zero offset we adjust the destination by -1 which means we
  983       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
  984       //
   985       // When backwards copying we need to store 1 word, 3 pairs and
  986       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
  987       // offsets {1, 3, 5, 7, 8} * unit.
  988 
  989       if (direction == copy_forwards) {
  990         __ sub(s, s, 16);
  991         __ sub(d, d, 8);
  992       }
  993 
  994       // Fill 8 registers
  995       //
  996       // for forwards copy s was offset by -16 from the original input
  997       // value of s so the register contents are at these offsets
  998       // relative to the 64 bit block addressed by that original input
  999       // and so on for each successive 64 byte block when s is updated
 1000       //
 1001       // t0 at offset 0,  t1 at offset 8
 1002       // t2 at offset 16, t3 at offset 24
 1003       // t4 at offset 32, t5 at offset 40
 1004       // t6 at offset 48, t7 at offset 56
 1005 
 1006       // for backwards copy s was not offset so the register contents
 1007       // are at these offsets into the preceding 64 byte block
 1008       // relative to that original input and so on for each successive
 1009       // preceding 64 byte block when s is updated. this explains the
 1010       // slightly counter-intuitive looking pattern of register usage
 1011       // in the stp instructions for backwards copy.
 1012       //
 1013       // t0 at offset -16, t1 at offset -8
 1014       // t2 at offset -32, t3 at offset -24
 1015       // t4 at offset -48, t5 at offset -40
 1016       // t6 at offset -64, t7 at offset -56
 1017 
 1018       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1019       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1020       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1021       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1022 
 1023       __ subs(count, count, 16);
 1024       __ br(Assembler::LO, drain);
 1025 
 1026       int prefetch = PrefetchCopyIntervalInBytes;
 1027       bool use_stride = false;
 1028       if (direction == copy_backwards) {
 1029          use_stride = prefetch > 256;
 1030          prefetch = -prefetch;
 1031          if (use_stride) __ mov(stride, prefetch);
 1032       }
 1033 
 1034       __ bind(again);
 1035 
 1036       if (PrefetchCopyIntervalInBytes > 0)
 1037         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1038 
 1039       if (direction == copy_forwards) {
 1040        // allowing for the offset of -8 the store instructions place
 1041        // registers into the target 64 bit block at the following
 1042        // offsets
 1043        //
 1044        // t0 at offset 0
 1045        // t1 at offset 8,  t2 at offset 16
 1046        // t3 at offset 24, t4 at offset 32
 1047        // t5 at offset 40, t6 at offset 48
 1048        // t7 at offset 56
 1049 
 1050         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1051         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1052         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1053         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1054         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1055         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1056         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1057         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1058         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1059       } else {
 1060        // d was not offset when we started so the registers are
 1061        // written into the 64 bit block preceding d with the following
 1062        // offsets
 1063        //
 1064        // t1 at offset -8
 1065        // t3 at offset -24, t0 at offset -16
  1066        // t5 at offset -40, t2 at offset -32
 1067        // t7 at offset -56, t4 at offset -48
 1068        //                   t6 at offset -64
 1069        //
 1070        // note that this matches the offsets previously noted for the
 1071        // loads
 1072 
 1073         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1074         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1075         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1076         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1077         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1078         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1079         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1080         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1081         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1082       }
 1083 
 1084       __ subs(count, count, 8);
 1085       __ br(Assembler::HS, again);
 1086 
 1087       // Drain
 1088       //
 1089       // this uses the same pattern of offsets and register arguments
 1090       // as above
 1091       __ bind(drain);
 1092       if (direction == copy_forwards) {
 1093         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1094         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1095         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1096         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1097         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1098       } else {
 1099         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1100         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1101         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1102         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1103         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1104       }
 1105       // now we need to copy any remaining part block which may
  1106       // include a 4 word subblock and/or a 2 word subblock.
 1107       // bits 2 and 1 in the count are the tell-tale for whether we
 1108       // have each such subblock
 1109       {
 1110         Label L1, L2;
 1111         __ tbz(count, exact_log2(4), L1);
 1112        // this is the same as above but copying only 4 longs hence
 1113        // with only one intervening stp between the str instructions
 1114        // but note that the offsets and registers still follow the
 1115        // same pattern
 1116         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1117         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1118         if (direction == copy_forwards) {
 1119           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1120           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1121           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1122         } else {
 1123           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1124           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1125           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1126         }
 1127         __ bind(L1);
 1128 
 1129         __ tbz(count, 1, L2);
 1130        // this is the same as above but copying only 2 longs hence
 1131        // there is no intervening stp between the str instructions
 1132        // but note that the offset and register patterns are still
 1133        // the same
 1134         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1135         if (direction == copy_forwards) {
 1136           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1137           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1141         }
 1142         __ bind(L2);
 1143 
  1144        // for forwards copy we need to re-adjust the offsets we
  1145        // applied so that s and d follow the last words written
 1146 
 1147        if (direction == copy_forwards) {
 1148          __ add(s, s, 16);
 1149          __ add(d, d, 8);
 1150        }
 1151 
 1152       }
 1153 
 1154       __ ret(lr);
 1155       }
 1156   }
 1157 
 1158   // Small copy: less than 16 bytes.
 1159   //
 1160   // NB: Ignores all of the bits of count which represent more than 15
 1161   // bytes, so a caller doesn't have to mask them.
 1162 
 1163   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1164     bool is_backwards = step < 0;
 1165     size_t granularity = g_uabs(step);
 1166     int direction = is_backwards ? -1 : 1;
 1167 
 1168     Label Lword, Lint, Lshort, Lbyte;
 1169 
 1170     assert(granularity
 1171            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1172 
 1173     const Register t0 = r3;
 1174     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1175     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1176 
 1177     // ??? I don't know if this bit-test-and-branch is the right thing
 1178     // to do.  It does a lot of jumping, resulting in several
 1179     // mispredicted branches.  It might make more sense to do this
 1180     // with something like Duff's device with a single computed branch.
 1181 
 1182     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1183     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1184     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1185     __ bind(Lword);
 1186 
 1187     if (granularity <= sizeof (jint)) {
 1188       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1189       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1190       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1191       __ bind(Lint);
 1192     }
 1193 
 1194     if (granularity <= sizeof (jshort)) {
 1195       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1196       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1197       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1198       __ bind(Lshort);
 1199     }
 1200 
 1201     if (granularity <= sizeof (jbyte)) {
 1202       __ tbz(count, 0, Lbyte);
 1203       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1204       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1205       __ bind(Lbyte);
 1206     }
 1207   }
 1208 
 1209   Label copy_f, copy_b;
 1210   Label copy_obj_f, copy_obj_b;
 1211   Label copy_obj_uninit_f, copy_obj_uninit_b;
 1212 
 1213   // All-singing all-dancing memory copy.
 1214   //
 1215   // Copy count units of memory from s to d.  The size of a unit is
 1216   // step, which can be positive or negative depending on the direction
 1217   // of copy.  If is_aligned is false, we align the source address.
 1218   //
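         // step is the element size in bytes, negated for a backwards
         // (overlapping) copy; e.g. a forwards int copy uses step == 4 and a
         // backwards long copy uses step == -8. count is in elements of that
         // size.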
 1219 
 1220   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1221                    Register s, Register d, Register count, int step) {
 1222     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1223     bool is_backwards = step < 0;
 1224     unsigned int granularity = g_uabs(step);
 1225     const Register t0 = r3, t1 = r4;
 1226 
  1227     // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't
  1228     // matter because we always load all the data before writing anything.
 1229     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1230     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1231     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1232     const Register send = r17, dend = r16;
 1233     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1234     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1235     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1236 
 1237     if (PrefetchCopyIntervalInBytes > 0)
 1238       __ prfm(Address(s, 0), PLDL1KEEP);
 1239     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1240     __ br(Assembler::HI, copy_big);
 1241 
 1242     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1243     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1244 
 1245     __ cmp(count, u1(16/granularity));
 1246     __ br(Assembler::LS, copy16);
 1247 
 1248     __ cmp(count, u1(64/granularity));
 1249     __ br(Assembler::HI, copy80);
 1250 
 1251     __ cmp(count, u1(32/granularity));
 1252     __ br(Assembler::LS, copy32);
 1253 
 1254     // 33..64 bytes
 1255     if (UseSIMDForMemoryOps) {
 1256       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1257       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1258       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1259       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1260     } else {
 1261       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1262       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1263       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1264       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1265 
 1266       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1267       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1268       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1269       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1270     }
 1271     __ b(finish);
 1272 
 1273     // 17..32 bytes
 1274     __ bind(copy32);
 1275     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1276     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1277 
 1278     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1279     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1280     __ b(finish);
 1281 
 1282     // 65..80/96 bytes
  1283     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1284     __ bind(copy80);
 1285     if (UseSIMDForMemoryOps) {
 1286       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1287       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1288       // Unaligned pointers can be an issue for copying.
 1289       // The issue has more chances to happen when granularity of data is
 1290       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
 1291       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1292       // The most performance drop has been seen for the range 65-80 bytes.
 1293       // For such cases using the pair of ldp/stp instead of the third pair of
 1294       // ldpq/stpq fixes the performance issue.
 1295       if (granularity < sizeof (jint)) {
 1296         Label copy96;
 1297         __ cmp(count, u1(80/granularity));
 1298         __ br(Assembler::HI, copy96);
 1299         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1300 
 1301         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1302         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1303 
 1304         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1305         __ b(finish);
 1306 
 1307         __ bind(copy96);
 1308       }
 1309       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1310 
 1311       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1312       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1313 
 1314       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1315     } else {
 1316       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1317       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1318       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1319       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1320       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1321 
 1322       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1323       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1324       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1325       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1326       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1327     }
 1328     __ b(finish);
 1329 
 1330     // 0..16 bytes
 1331     __ bind(copy16);
 1332     __ cmp(count, u1(8/granularity));
 1333     __ br(Assembler::LO, copy8);
 1334 
 1335     // 8..16 bytes
 1336     bs.copy_load_at_8(t0, Address(s, 0));
 1337     bs.copy_load_at_8(t1, Address(send, -8));
 1338     bs.copy_store_at_8(Address(d, 0), t0);
 1339     bs.copy_store_at_8(Address(dend, -8), t1);
 1340     __ b(finish);
 1341 
 1342     if (granularity < 8) {
 1343       // 4..7 bytes
 1344       __ bind(copy8);
 1345       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1346       __ ldrw(t0, Address(s, 0));
 1347       __ ldrw(t1, Address(send, -4));
 1348       __ strw(t0, Address(d, 0));
 1349       __ strw(t1, Address(dend, -4));
 1350       __ b(finish);
 1351       if (granularity < 4) {
 1352         // 0..3 bytes
 1353         __ bind(copy4);
 1354         __ cbz(count, finish); // get rid of 0 case
 1355         if (granularity == 2) {
 1356           __ ldrh(t0, Address(s, 0));
 1357           __ strh(t0, Address(d, 0));
 1358         } else { // granularity == 1
 1359           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1360           // the first and last byte.
 1361           // Handle the 3 byte case by loading and storing base + count/2
 1362           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
  1363           // This does mean in the 1 byte case we load/store the same
 1364           // byte 3 times.
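                 // (Worked example: for count == 3, count/2 == 1, so we copy
                 // s[0]->d[0], s[2]->d[2] via send/dend - 1, and s[1]->d[1]
                 // via the base + count/2 access; for count == 2 the middle
                 // access just re-copies the last byte.)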
 1365           __ lsr(count, count, 1);
 1366           __ ldrb(t0, Address(s, 0));
 1367           __ ldrb(t1, Address(send, -1));
 1368           __ ldrb(t2, Address(s, count));
 1369           __ strb(t0, Address(d, 0));
 1370           __ strb(t1, Address(dend, -1));
 1371           __ strb(t2, Address(d, count));
 1372         }
 1373         __ b(finish);
 1374       }
 1375     }
 1376 
 1377     __ bind(copy_big);
 1378     if (is_backwards) {
 1379       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1380       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1381     }
 1382 
 1383     // Now we've got the small case out of the way we can align the
 1384     // source address on a 2-word boundary.
 1385 
 1386     // Here we materialize a count in r15, which is used by copy_memory_small
 1387     // and the various generate_copy_longs stubs that we use for 2-word-aligned copies.
 1388     // Up until here we have used t9, which aliases r15, but from here on that register
 1389     // cannot be used as a temp register, as it contains the count.
 1390 
 1391     Label aligned;
 1392 
 1393     if (is_aligned) {
 1394       // We may have to adjust by 1 word to get s 2-word-aligned.
 1395       __ tbz(s, exact_log2(wordSize), aligned);
 1396       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1397       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1398       __ sub(count, count, wordSize/granularity);
 1399     } else {
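            // Compute in r15 the byte distance needed to bring s to a 2-word
            // (16-byte) boundary: down to the previous boundary for a backwards
            // copy, up to the next one for a forwards copy.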
 1400       if (is_backwards) {
 1401         __ andr(r15, s, 2 * wordSize - 1);
 1402       } else {
 1403         __ neg(r15, s);
 1404         __ andr(r15, r15, 2 * wordSize - 1);
 1405       }
 1406       // r15 is the byte adjustment needed to align s.
 1407       __ cbz(r15, aligned);
 1408       int shift = exact_log2(granularity);
 1409       if (shift > 0) {
 1410         __ lsr(r15, r15, shift);
 1411       }
 1412       __ sub(count, count, r15);
 1413 
 1414 #if 0
 1415       // ?? This code is only correct for a disjoint copy.  It may or
 1416       // may not make sense to use it in that case.
 1417 
 1418       // Copy the first pair; s and d may not be aligned.
 1419       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1420       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1421 
 1422       // Align s and d, adjust count
 1423       if (is_backwards) {
 1424         __ sub(s, s, r15);
 1425         __ sub(d, d, r15);
 1426       } else {
 1427         __ add(s, s, r15);
 1428         __ add(d, d, r15);
 1429       }
 1430 #else
 1431       copy_memory_small(decorators, type, s, d, r15, step);
 1432 #endif
 1433     }
 1434 
 1435     __ bind(aligned);
 1436 
 1437     // s is now 2-word-aligned.
 1438 
 1439     // We have a count of units and some trailing bytes. Adjust the
 1440     // count and do a bulk copy of words. If the shift is zero,
 1441     // perform a move instead to benefit from zero-latency moves.
 1442     int shift = exact_log2(wordSize/granularity);
 1443     if (shift > 0) {
 1444       __ lsr(r15, count, shift);
 1445     } else {
 1446       __ mov(r15, count);
 1447     }
 1448     if (direction == copy_forwards) {
 1449       if (type != T_OBJECT) {
 1450         __ bl(copy_f);
 1451       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1452         __ bl(copy_obj_uninit_f);
 1453       } else {
 1454         __ bl(copy_obj_f);
 1455       }
 1456     } else {
 1457       if (type != T_OBJECT) {
 1458         __ bl(copy_b);
 1459       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1460         __ bl(copy_obj_uninit_b);
 1461       } else {
 1462         __ bl(copy_obj_b);
 1463       }
 1464     }
 1465 
 1466     // And the tail.
 1467     copy_memory_small(decorators, type, s, d, count, step);
 1468 
          // For wider element sizes the copy8/copy4 labels were not bound by the
          // small-copy code above; bind them here so the earlier branches fall
          // through to finish.
 1469     if (granularity >= 8) __ bind(copy8);
 1470     if (granularity >= 4) __ bind(copy4);
 1471     __ bind(finish);
 1472   }
 1473 
 1474 
 1475   void clobber_registers() {
 1476 #ifdef ASSERT
 1477     RegSet clobbered
 1478       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1479     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1480     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1481     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1482       __ mov(*it, rscratch1);
 1483     }
 1484 #endif
 1485 
 1486   }
 1487 
 1488   // Scan over array at a for count oops, verifying each one.
 1489   // Preserves a and count, clobbers rscratch1 and rscratch2.
 1490   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1491     Label loop, end;
 1492     __ mov(rscratch1, a);
 1493     __ mov(rscratch2, zr);
 1494     __ bind(loop);
 1495     __ cmp(rscratch2, count);
 1496     __ br(Assembler::HS, end);
 1497     if (size == wordSize) {
 1498       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1499       __ verify_oop(temp);
 1500     } else {
 1501       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1502       __ decode_heap_oop(temp); // calls verify_oop
 1503     }
 1504     __ add(rscratch2, rscratch2, 1);
 1505     __ b(loop);
 1506     __ bind(end);
 1507   }
 1508 
 1509   // Arguments:
 1510   //   stub_id - is used to name the stub and identify all details of
 1511   //             how to perform the copy.
 1512   //
 1513   //   entry - is assigned to the stub's post push entry point unless
 1514   //           it is null
 1515   //
 1516   // Inputs:
 1517   //   c_rarg0   - source array address
 1518   //   c_rarg1   - destination array address
 1519   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1520   //
 1521   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1522   // the hardware handle it.  The two dwords within qwords that span
 1523   // cache line boundaries will still be loaded and stored atomically.
 1524   //
 1525   // Side Effects: entry is set to the (post push) entry point so it
 1526   //               can be used by the corresponding conjoint copy
 1527   //               method
 1528   //
 1529   address generate_disjoint_copy(StubId stub_id, address *entry) {
 1530     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1531     RegSet saved_reg = RegSet::of(s, d, count);
 1532     int size;
 1533     bool aligned;
 1534     bool is_oop;
 1535     bool dest_uninitialized;
 1536     switch (stub_id) {
 1537     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1538       size = sizeof(jbyte);
 1539       aligned = false;
 1540       is_oop = false;
 1541       dest_uninitialized = false;
 1542       break;
 1543     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1544       size = sizeof(jbyte);
 1545       aligned = true;
 1546       is_oop = false;
 1547       dest_uninitialized = false;
 1548       break;
 1549     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1550       size = sizeof(jshort);
 1551       aligned = false;
 1552       is_oop = false;
 1553       dest_uninitialized = false;
 1554       break;
 1555     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1556       size = sizeof(jshort);
 1557       aligned = true;
 1558       is_oop = false;
 1559       dest_uninitialized = false;
 1560       break;
 1561     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1562       size = sizeof(jint);
 1563       aligned = false;
 1564       is_oop = false;
 1565       dest_uninitialized = false;
 1566       break;
 1567     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1568       size = sizeof(jint);
 1569       aligned = true;
 1570       is_oop = false;
 1571       dest_uninitialized = false;
 1572       break;
 1573     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1574       // since this is always aligned we can (should!) use the same
 1575       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1576       ShouldNotReachHere();
 1577       break;
 1578     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1579       size = sizeof(jlong);
 1580       aligned = true;
 1581       is_oop = false;
 1582       dest_uninitialized = false;
 1583       break;
 1584     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1585       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1586       aligned = !UseCompressedOops;
 1587       is_oop = true;
 1588       dest_uninitialized = false;
 1589       break;
 1590     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1591       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1592       aligned = !UseCompressedOops;
 1593       is_oop = true;
 1594       dest_uninitialized = false;
 1595       break;
 1596     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1597       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1598       aligned = !UseCompressedOops;
 1599       is_oop = true;
 1600       dest_uninitialized = true;
 1601       break;
 1602     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1603       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1604       aligned = !UseCompressedOops;
 1605       is_oop = true;
 1606       dest_uninitialized = true;
 1607       break;
 1608     default:
 1609       ShouldNotReachHere();
 1610       break;
 1611     }
 1612 
 1613     __ align(CodeEntryAlignment);
 1614     StubCodeMark mark(this, stub_id);
 1615     address start = __ pc();
 1616     __ enter();
 1617 
 1618     if (entry != nullptr) {
 1619       *entry = __ pc();
 1620       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1621       BLOCK_COMMENT("Entry:");
 1622     }
 1623 
 1624     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1625     if (dest_uninitialized) {
 1626       decorators |= IS_DEST_UNINITIALIZED;
 1627     }
 1628     if (aligned) {
 1629       decorators |= ARRAYCOPY_ALIGNED;
 1630     }
 1631 
 1632     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1633     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1634 
 1635     if (is_oop) {
 1636       // save regs before copy_memory
 1637       __ push(RegSet::of(d, count), sp);
 1638     }
 1639     {
 1640       // UnsafeMemoryAccess page error: continue after unsafe access
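            // An entry is added only for primitive copies that Unsafe can reach:
            // the unaligned variants and the (always aligned) jlong variant.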
 1641       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1642       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1643       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1644     }
 1645 
 1646     if (is_oop) {
 1647       __ pop(RegSet::of(d, count), sp);
 1648       if (VerifyOops)
 1649         verify_oop_array(size, d, count, r16);
 1650     }
 1651 
 1652     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1653 
 1654     __ leave();
 1655     __ mov(r0, zr); // return 0
 1656     __ ret(lr);
 1657     return start;
 1658   }
 1659 
 1660   // Arguments:
 1661   //   stub_id - is used to name the stub and identify all details of
 1662   //             how to perform the copy.
 1663   //
 1664   //   nooverlap_target - identifies the (post push) entry for the
 1665   //             corresponding disjoint copy routine which can be
 1666   //             jumped to if the ranges do not actually overlap
 1667   //
 1668   //   entry - is assigned to the stub's post push entry point unless
 1669   //           it is null
 1670   //
 1671   //
 1672   // Inputs:
 1673   //   c_rarg0   - source array address
 1674   //   c_rarg1   - destination array address
 1675   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1676   //
 1677   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1678   // the hardware handle it.  The two dwords within qwords that span
 1679   // cache line boundaries will still be loaded and stored atomically.
 1680   //
 1681   // Side Effects:
 1682   //   entry is set to the no-overlap entry point so it can be used by
 1683   //   some other conjoint copy method
 1684   //
 1685   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *entry) {
 1686     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1687     RegSet saved_regs = RegSet::of(s, d, count);
 1688     int size;
 1689     bool aligned;
 1690     bool is_oop;
 1691     bool dest_uninitialized;
 1692     switch (stub_id) {
 1693     case StubId::stubgen_jbyte_arraycopy_id:
 1694       size = sizeof(jbyte);
 1695       aligned = false;
 1696       is_oop = false;
 1697       dest_uninitialized = false;
 1698       break;
 1699     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 1700       size = sizeof(jbyte);
 1701       aligned = true;
 1702       is_oop = false;
 1703       dest_uninitialized = false;
 1704       break;
 1705     case StubId::stubgen_jshort_arraycopy_id:
 1706       size = sizeof(jshort);
 1707       aligned = false;
 1708       is_oop = false;
 1709       dest_uninitialized = false;
 1710       break;
 1711     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 1712       size = sizeof(jshort);
 1713       aligned = true;
 1714       is_oop = false;
 1715       dest_uninitialized = false;
 1716       break;
 1717     case StubId::stubgen_jint_arraycopy_id:
 1718       size = sizeof(jint);
 1719       aligned = false;
 1720       is_oop = false;
 1721       dest_uninitialized = false;
 1722       break;
 1723     case StubId::stubgen_arrayof_jint_arraycopy_id:
 1724       size = sizeof(jint);
 1725       aligned = true;
 1726       is_oop = false;
 1727       dest_uninitialized = false;
 1728       break;
 1729     case StubId::stubgen_jlong_arraycopy_id:
 1730       // since this is always aligned we can (should!) use the same
 1731       // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
 1732       ShouldNotReachHere();
 1733       break;
 1734     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 1735       size = sizeof(jlong);
 1736       aligned = true;
 1737       is_oop = false;
 1738       dest_uninitialized = false;
 1739       break;
 1740     case StubId::stubgen_oop_arraycopy_id:
 1741       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1742       aligned = !UseCompressedOops;
 1743       is_oop = true;
 1744       dest_uninitialized = false;
 1745       break;
 1746     case StubId::stubgen_arrayof_oop_arraycopy_id:
 1747       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1748       aligned = !UseCompressedOops;
 1749       is_oop = true;
 1750       dest_uninitialized = false;
 1751       break;
 1752     case StubId::stubgen_oop_arraycopy_uninit_id:
 1753       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1754       aligned = !UseCompressedOops;
 1755       is_oop = true;
 1756       dest_uninitialized = true;
 1757       break;
 1758     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 1759       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1760       aligned = !UseCompressedOops;
 1761       is_oop = true;
 1762       dest_uninitialized = true;
 1763       break;
 1764     default:
 1765       ShouldNotReachHere();
 1766     }
 1767 
 1768     StubCodeMark mark(this, stub_id);
 1769     address start = __ pc();
 1770     __ enter();
 1771 
 1772     if (entry != nullptr) {
 1773       *entry = __ pc();
 1774       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1775       BLOCK_COMMENT("Entry:");
 1776     }
 1777 
 1778     // use fwd copy when (d-s) above_equal (count*size)
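          // If d < s the subtraction wraps to a large unsigned value, so the
          // unsigned HS comparison also selects the forward copy in that case.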
 1779     __ sub(rscratch1, d, s);
 1780     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1781     __ br(Assembler::HS, nooverlap_target);
 1782 
 1783     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1784     if (dest_uninitialized) {
 1785       decorators |= IS_DEST_UNINITIALIZED;
 1786     }
 1787     if (aligned) {
 1788       decorators |= ARRAYCOPY_ALIGNED;
 1789     }
 1790 
 1791     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1792     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1793 
 1794     if (is_oop) {
 1795       // save regs before copy_memory
 1796       __ push(RegSet::of(d, count), sp);
 1797     }
 1798     {
 1799       // UnsafeMemoryAccess page error: continue after unsafe access
 1800       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1801       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1802       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1803     }
 1804     if (is_oop) {
 1805       __ pop(RegSet::of(d, count), sp);
 1806       if (VerifyOops)
 1807         verify_oop_array(size, d, count, r16);
 1808     }
 1809     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1810     __ leave();
 1811     __ mov(r0, zr); // return 0
 1812     __ ret(lr);
 1813     return start;
 1814   }
 1815 
 1816   // Helper for generating a dynamic type check.
 1817   // Smashes rscratch1, rscratch2.
 1818   void generate_type_check(Register sub_klass,
 1819                            Register super_check_offset,
 1820                            Register super_klass,
 1821                            Register temp1,
 1822                            Register temp2,
 1823                            Register result,
 1824                            Label& L_success) {
 1825     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1826 
 1827     BLOCK_COMMENT("type_check:");
 1828 
 1829     Label L_miss;
 1830 
 1831     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1832                                      super_check_offset);
 1833     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1834 
 1835     // Fall through on failure!
 1836     __ BIND(L_miss);
 1837   }
 1838 
 1839   //
 1840   //  Generate checkcasting array copy stub
 1841   //
 1842   //  Input:
 1843   //    c_rarg0   - source array address
 1844   //    c_rarg1   - destination array address
 1845   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1846   //    c_rarg3   - size_t ckoff (super_check_offset)
 1847   //    c_rarg4   - oop ckval (super_klass)
 1848   //
 1849   //  Output:
 1850   //    r0 ==  0  -  success
 1851   //    r0 == -1^K - failure, where K is partial transfer count
 1852   //
 1853   address generate_checkcast_copy(StubId stub_id, address *entry) {
 1854     bool dest_uninitialized;
 1855     switch (stub_id) {
 1856     case StubId::stubgen_checkcast_arraycopy_id:
 1857       dest_uninitialized = false;
 1858       break;
 1859     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 1860       dest_uninitialized = true;
 1861       break;
 1862     default:
 1863       ShouldNotReachHere();
 1864     }
 1865 
 1866     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1867 
 1868     // Input registers (after setup_arg_regs)
 1869     const Register from        = c_rarg0;   // source array address
 1870     const Register to          = c_rarg1;   // destination array address
 1871     const Register count       = c_rarg2;   // elements count
 1872     const Register ckoff       = c_rarg3;   // super_check_offset
 1873     const Register ckval       = c_rarg4;   // super_klass
 1874 
 1875     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1876     RegSet wb_post_saved_regs = RegSet::of(count);
 1877 
 1878     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1879     const Register copied_oop  = r22;       // actual oop copied
 1880     const Register count_save  = r21;       // orig elements count
 1881     const Register start_to    = r20;       // destination array start address
 1882     const Register r19_klass   = r19;       // oop._klass
 1883 
 1884     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1885     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1886 
 1887     //---------------------------------------------------------------
 1888     // Assembler stub will be used for this call to arraycopy
 1889     // if the two arrays are subtypes of Object[] but the
 1890     // destination array type is not equal to or a supertype
 1891     // of the source type.  Each element must be separately
 1892     // checked.
 1893 
 1894     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1895                                copied_oop, r19_klass, count_save);
 1896 
 1897     __ align(CodeEntryAlignment);
 1898     StubCodeMark mark(this, stub_id);
 1899     address start = __ pc();
 1900 
 1901     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1902 
 1903 #ifdef ASSERT
 1904     // caller guarantees that the arrays really are different
 1905     // otherwise, we would have to make conjoint checks
 1906     { Label L;
 1907       __ b(L);                  // conjoint check not yet implemented
 1908       __ stop("checkcast_copy within a single array");
 1909       __ bind(L);
 1910     }
 1911 #endif //ASSERT
 1912 
 1913     // Caller of this entry point must set up the argument registers.
 1914     if (entry != nullptr) {
 1915       *entry = __ pc();
 1916       BLOCK_COMMENT("Entry:");
 1917     }
 1918 
 1919     // Empty array: Nothing to do.
 1920     __ cbz(count, L_done);
 1921     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1922 
 1923 #ifdef ASSERT
 1924     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1925     // The ckoff and ckval must be mutually consistent,
 1926     // even though caller generates both.
 1927     { Label L;
 1928       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1929       __ ldrw(start_to, Address(ckval, sco_offset));
 1930       __ cmpw(ckoff, start_to);
 1931       __ br(Assembler::EQ, L);
 1932       __ stop("super_check_offset inconsistent");
 1933       __ bind(L);
 1934     }
 1935 #endif //ASSERT
 1936 
 1937     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1938     bool is_oop = true;
 1939     int element_size = UseCompressedOops ? 4 : 8;
 1940     if (dest_uninitialized) {
 1941       decorators |= IS_DEST_UNINITIALIZED;
 1942     }
 1943 
 1944     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1945     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1946 
 1947     // save the original count
 1948     __ mov(count_save, count);
 1949 
 1950     // Copy from low to high addresses
 1951     __ mov(start_to, to);              // Save destination array start address
 1952     __ b(L_load_element);
 1953 
 1954     // ======== begin loop ========
 1955     // (Loop is rotated; its entry is L_load_element.)
 1956     // Loop control:
 1957     //   for (; count != 0; count--) {
 1958     //     copied_oop = load_heap_oop(from++);
 1959     //     ... generate_type_check ...;
 1960     //     store_heap_oop(to++, copied_oop);
 1961     //   }
 1962     __ align(OptoLoopAlignment);
 1963 
 1964     __ BIND(L_store_element);
 1965     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1966                       __ post(to, element_size), copied_oop, noreg,
 1967                       gct1, gct2, gct3);
 1968     __ sub(count, count, 1);
 1969     __ cbz(count, L_do_card_marks);
 1970 
 1971     // ======== loop entry is here ========
 1972     __ BIND(L_load_element);
 1973     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1974                      copied_oop, noreg, __ post(from, element_size),
 1975                      gct1);
 1976     __ cbz(copied_oop, L_store_element);
 1977 
 1978     __ load_klass(r19_klass, copied_oop);// query the object klass
 1979 
 1980     BLOCK_COMMENT("type_check:");
 1981     generate_type_check(/*sub_klass*/r19_klass,
 1982                         /*super_check_offset*/ckoff,
 1983                         /*super_klass*/ckval,
 1984                         /*r_array_base*/gct1,
 1985                         /*temp2*/gct2,
 1986                         /*result*/r10, L_store_element);
 1987 
 1988     // Fall through on failure!
 1989 
 1990     // ======== end loop ========
 1991 
 1992     // It was a real error; we must depend on the caller to finish the job.
 1993     // Register count = remaining oops, count_orig = total oops.
 1994     // Emit GC store barriers for the oops we have copied and report
 1995     // their number to the caller.
 1996 
 1997     __ subs(count, count_save, count);     // K = partially copied oop count
 1998     __ eon(count, count, zr);              // report (-1^K) to caller
 1999     __ br(Assembler::EQ, L_done_pop);
 2000 
 2001     __ BIND(L_do_card_marks);
 2002     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2003 
 2004     __ bind(L_done_pop);
 2005     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2006     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2007 
 2008     __ bind(L_done);
 2009     __ mov(r0, count);
 2010     __ leave();
 2011     __ ret(lr);
 2012 
 2013     return start;
 2014   }
 2015 
 2016   // Perform range checks on the proposed arraycopy.
 2017   // Kills temp, but nothing else.
 2018   // Also, clean the sign bits of src_pos and dst_pos.
 2019   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2020                               Register src_pos, // source position (c_rarg1)
 2021                               Register dst,     // destination array oop (c_rarg2)
 2022                               Register dst_pos, // destination position (c_rarg3)
 2023                               Register length,
 2024                               Register temp,
 2025                               Label& L_failed) {
 2026     BLOCK_COMMENT("arraycopy_range_checks:");
 2027 
 2028     assert_different_registers(rscratch1, temp);
 2029 
 2030     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2031     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2032     __ addw(temp, length, src_pos);
 2033     __ cmpw(temp, rscratch1);
 2034     __ br(Assembler::HI, L_failed);
 2035 
 2036     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2037     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2038     __ addw(temp, length, dst_pos);
 2039     __ cmpw(temp, rscratch1);
 2040     __ br(Assembler::HI, L_failed);
 2041 
 2042     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
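          // A 32-bit register move zero-extends, clearing bits 63..32.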
 2043     __ movw(src_pos, src_pos);
 2044     __ movw(dst_pos, dst_pos);
 2045 
 2046     BLOCK_COMMENT("arraycopy_range_checks done");
 2047   }
 2048 
 2049   // These stubs are currently only called from a simple test routine.
 2050   // They will be written properly when they are called from
 2051   // something that actually does real work.
 2052   static void fake_arraycopy_stub(address src, address dst, int count) {
 2053     assert(count == 0, "huh?");
 2054   }
 2055 
 2056 
 2057   //
 2058   //  Generate 'unsafe' array copy stub
 2059   //  Though just as safe as the other stubs, it takes an unscaled
 2060   //  size_t argument instead of an element count.
 2061   //
 2062   //  Input:
 2063   //    c_rarg0   - source array address
 2064   //    c_rarg1   - destination array address
 2065   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2066   //
 2067   // Examines the alignment of the operands and dispatches
 2068   // to a long, int, short, or byte copy loop.
 2069   //
 2070   address generate_unsafe_copy(address byte_copy_entry,
 2071                                address short_copy_entry,
 2072                                address int_copy_entry,
 2073                                address long_copy_entry) {
 2074     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2075 
 2076     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2077     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2078 
 2079     __ align(CodeEntryAlignment);
 2080     StubCodeMark mark(this, stub_id);
 2081     address start = __ pc();
 2082     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2083 
 2084     // bump this on entry, not on exit:
 2085     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2086 
 2087     __ orr(rscratch1, s, d);
 2088     __ orr(rscratch1, rscratch1, count);
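          // rscratch1 = s | d | count; its low bits give the coarsest alignment
          // common to all three, which selects the widest copy we can dispatch to.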
 2089 
 2090     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2091     __ cbz(rscratch1, L_long_aligned);
 2092     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2093     __ cbz(rscratch1, L_int_aligned);
 2094     __ tbz(rscratch1, 0, L_short_aligned);
 2095     __ b(RuntimeAddress(byte_copy_entry));
 2096 
 2097     __ BIND(L_short_aligned);
 2098     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2099     __ b(RuntimeAddress(short_copy_entry));
 2100     __ BIND(L_int_aligned);
 2101     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2102     __ b(RuntimeAddress(int_copy_entry));
 2103     __ BIND(L_long_aligned);
 2104     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2105     __ b(RuntimeAddress(long_copy_entry));
 2106 
 2107     return start;
 2108   }
 2109 
 2110   //
 2111   //  Generate generic array copy stubs
 2112   //
 2113   //  Input:
 2114   //    c_rarg0    -  src oop
 2115   //    c_rarg1    -  src_pos (32-bits)
 2116   //    c_rarg2    -  dst oop
 2117   //    c_rarg3    -  dst_pos (32-bits)
 2118   //    c_rarg4    -  element count (32-bits)
 2119   //
 2120   //  Output:
 2121   //    r0 ==  0  -  success
 2122   //    r0 == -1^K - failure, where K is partial transfer count
 2123   //
 2124   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2125                                 address int_copy_entry, address oop_copy_entry,
 2126                                 address long_copy_entry, address checkcast_copy_entry) {
 2127     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2128 
 2129     Label L_failed, L_objArray;
 2130     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2131 
 2132     // Input registers
 2133     const Register src        = c_rarg0;  // source array oop
 2134     const Register src_pos    = c_rarg1;  // source position
 2135     const Register dst        = c_rarg2;  // destination array oop
 2136     const Register dst_pos    = c_rarg3;  // destination position
 2137     const Register length     = c_rarg4;
 2138 
 2139 
 2140     // Registers used as temps
 2141     const Register dst_klass  = c_rarg5;
 2142 
 2143     __ align(CodeEntryAlignment);
 2144 
 2145     StubCodeMark mark(this, stub_id);
 2146 
 2147     address start = __ pc();
 2148 
 2149     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2150 
 2151     // bump this on entry, not on exit:
 2152     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2153 
 2154     //-----------------------------------------------------------------------
 2155     // Assembler stub will be used for this call to arraycopy
 2156     // if the following conditions are met:
 2157     //
 2158     // (1) src and dst must not be null.
 2159     // (2) src_pos must not be negative.
 2160     // (3) dst_pos must not be negative.
 2161     // (4) length  must not be negative.
 2162     // (5) src klass and dst klass should be the same and not null.
 2163     // (6) src and dst should be arrays.
 2164     // (7) src_pos + length must not exceed length of src.
 2165     // (8) dst_pos + length must not exceed length of dst.
 2166     //
 2167 
 2168     //  if (src == nullptr) return -1;
 2169     __ cbz(src, L_failed);
 2170 
 2171     //  if (src_pos < 0) return -1;
 2172     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2173 
 2174     //  if (dst == nullptr) return -1;
 2175     __ cbz(dst, L_failed);
 2176 
 2177     //  if (dst_pos < 0) return -1;
 2178     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2179 
 2180     // registers used as temp
 2181     const Register scratch_length    = r16; // elements count to copy
 2182     const Register scratch_src_klass = r17; // array klass
 2183     const Register lh                = r15; // layout helper
 2184 
 2185     //  if (length < 0) return -1;
 2186     __ movw(scratch_length, length);        // length (elements count, 32-bit value)
 2187     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2188 
 2189     __ load_klass(scratch_src_klass, src);
 2190 #ifdef ASSERT
 2191     //  assert(src->klass() != nullptr);
 2192     {
 2193       BLOCK_COMMENT("assert klasses not null {");
 2194       Label L1, L2;
 2195       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2196       __ bind(L1);
 2197       __ stop("broken null klass");
 2198       __ bind(L2);
 2199       __ load_klass(rscratch1, dst);
 2200       __ cbz(rscratch1, L1);     // this would be broken also
 2201       BLOCK_COMMENT("} assert klasses not null done");
 2202     }
 2203 #endif
 2204 
 2205     // Load layout helper (32-bits)
 2206     //
 2207     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2208     // 32        30    24            16              8     2                 0
 2209     //
 2210     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2211     //
 2212 
 2213     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2214 
 2215     // Handle objArrays completely differently...
 2216     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2217     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2218     __ movw(rscratch1, objArray_lh);
 2219     __ eorw(rscratch2, lh, rscratch1);
 2220     __ cbzw(rscratch2, L_objArray);
 2221 
 2222     //  if (src->klass() != dst->klass()) return -1;
 2223     __ load_klass(rscratch2, dst);
 2224     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2225     __ cbnz(rscratch2, L_failed);
 2226 
 2227     //  if (!src->is_Array()) return -1;
 2228     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2229 
 2230     // At this point, it is known to be a typeArray (array_tag 0x3).
 2231 #ifdef ASSERT
 2232     {
 2233       BLOCK_COMMENT("assert primitive array {");
 2234       Label L;
 2235       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2236       __ cmpw(lh, rscratch2);
 2237       __ br(Assembler::GE, L);
 2238       __ stop("must be a primitive array");
 2239       __ bind(L);
 2240       BLOCK_COMMENT("} assert primitive array done");
 2241     }
 2242 #endif
 2243 
 2244     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2245                            rscratch2, L_failed);
 2246 
 2247     // TypeArrayKlass
 2248     //
 2249     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2250     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2251     //
 2252 
 2253     const Register rscratch1_offset = rscratch1;    // array offset
 2254     const Register r15_elsize = lh; // element size
 2255 
 2256     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2257            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2258     __ add(src, src, rscratch1_offset);           // src array offset
 2259     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2260     BLOCK_COMMENT("choose copy loop based on element size");
 2261 
 2262     // next registers should be set before the jump to corresponding stub
 2263     const Register from     = c_rarg0;  // source array address
 2264     const Register to       = c_rarg1;  // destination array address
 2265     const Register count    = c_rarg2;  // elements count
 2266 
 2267     // The 'from', 'to' and 'count' registers must be set in this order
 2268     // since they are the same registers as 'src', 'src_pos' and 'dst'.
 2269 
 2270     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2271 
 2272     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2273     // size in bytes).  We do a simple bitwise binary search.
 2274   __ BIND(L_copy_bytes);
 2275     __ tbnz(r15_elsize, 1, L_copy_ints);
 2276     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2277     __ lea(from, Address(src, src_pos));// src_addr
 2278     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2279     __ movw(count, scratch_length); // length
 2280     __ b(RuntimeAddress(byte_copy_entry));
 2281 
 2282   __ BIND(L_copy_shorts);
 2283     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2284     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2285     __ movw(count, scratch_length); // length
 2286     __ b(RuntimeAddress(short_copy_entry));
 2287 
 2288   __ BIND(L_copy_ints);
 2289     __ tbnz(r15_elsize, 0, L_copy_longs);
 2290     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2291     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2292     __ movw(count, scratch_length); // length
 2293     __ b(RuntimeAddress(int_copy_entry));
 2294 
 2295   __ BIND(L_copy_longs);
 2296 #ifdef ASSERT
 2297     {
 2298       BLOCK_COMMENT("assert long copy {");
 2299       Label L;
 2300       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2301       __ cmpw(r15_elsize, LogBytesPerLong);
 2302       __ br(Assembler::EQ, L);
 2303       __ stop("must be long copy, but elsize is wrong");
 2304       __ bind(L);
 2305       BLOCK_COMMENT("} assert long copy done");
 2306     }
 2307 #endif
 2308     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2309     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2310     __ movw(count, scratch_length); // length
 2311     __ b(RuntimeAddress(long_copy_entry));
 2312 
 2313     // ObjArrayKlass
 2314   __ BIND(L_objArray);
 2315     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2316 
 2317     Label L_plain_copy, L_checkcast_copy;
 2318     //  test array classes for subtyping
 2319     __ load_klass(r15, dst);
 2320     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2321     __ br(Assembler::NE, L_checkcast_copy);
 2322 
 2323     // Identically typed arrays can be copied without element-wise checks.
 2324     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2325                            rscratch2, L_failed);
 2326 
 2327     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2328     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2329     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2330     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2331     __ movw(count, scratch_length); // length
 2332   __ BIND(L_plain_copy);
 2333     __ b(RuntimeAddress(oop_copy_entry));
 2334 
 2335   __ BIND(L_checkcast_copy);
 2336     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2337     {
 2338       // Before looking at dst.length, make sure dst is also an objArray.
 2339       __ ldrw(rscratch1, Address(r15, lh_offset));
 2340       __ movw(rscratch2, objArray_lh);
 2341       __ eorw(rscratch1, rscratch1, rscratch2);
 2342       __ cbnzw(rscratch1, L_failed);
 2343 
 2344       // It is safe to examine both src.length and dst.length.
 2345       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2346                              r15, L_failed);
 2347 
 2348       __ load_klass(dst_klass, dst); // reload
 2349 
 2350       // Marshal the base address arguments now, freeing registers.
 2351       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2352       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2353       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2354       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2355       __ movw(count, length);           // length (reloaded)
 2356       Register sco_temp = c_rarg3;      // this register is free now
 2357       assert_different_registers(from, to, count, sco_temp,
 2358                                  dst_klass, scratch_src_klass);
 2359       // assert_clean_int(count, sco_temp);
 2360 
 2361       // Generate the type check.
 2362       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2363       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2364 
 2365       // Smashes rscratch1, rscratch2
 2366       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2367                           L_plain_copy);
 2368 
 2369       // Fetch destination element klass from the ObjArrayKlass header.
 2370       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2371       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2372       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2373 
 2374       // the checkcast_copy loop needs two extra arguments:
 2375       assert(c_rarg3 == sco_temp, "#3 already in place");
 2376       // Set up arguments for checkcast_copy_entry.
 2377       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2378       __ b(RuntimeAddress(checkcast_copy_entry));
 2379     }
 2380 
 2381   __ BIND(L_failed);
 2382     __ mov(r0, -1);
 2383     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2384     __ ret(lr);
 2385 
 2386     return start;
 2387   }
 2388 
 2389   //
 2390   // Generate stub for array fill. If "aligned" is true, the
 2391   // "to" address is assumed to be heapword aligned.
 2392   //
 2393   // Arguments for generated stub:
 2394   //   to:    c_rarg0
 2395   //   value: c_rarg1
 2396   //   count: c_rarg2 treated as signed
 2397   //
 2398   address generate_fill(StubId stub_id) {
 2399     BasicType t;
 2400     bool aligned;
 2401 
 2402     switch (stub_id) {
 2403     case StubId::stubgen_jbyte_fill_id:
 2404       t = T_BYTE;
 2405       aligned = false;
 2406       break;
 2407     case StubId::stubgen_jshort_fill_id:
 2408       t = T_SHORT;
 2409       aligned = false;
 2410       break;
 2411     case StubId::stubgen_jint_fill_id:
 2412       t = T_INT;
 2413       aligned = false;
 2414       break;
 2415     case StubId::stubgen_arrayof_jbyte_fill_id:
 2416       t = T_BYTE;
 2417       aligned = true;
 2418       break;
 2419     case StubId::stubgen_arrayof_jshort_fill_id:
 2420       t = T_SHORT;
 2421       aligned = true;
 2422       break;
 2423     case StubId::stubgen_arrayof_jint_fill_id:
 2424       t = T_INT;
 2425       aligned = true;
 2426       break;
 2427     default:
 2428       ShouldNotReachHere();
 2429     };
 2430 
 2431     __ align(CodeEntryAlignment);
 2432     StubCodeMark mark(this, stub_id);
 2433     address start = __ pc();
 2434 
 2435     BLOCK_COMMENT("Entry:");
 2436 
 2437     const Register to        = c_rarg0;  // destination array address
 2438     const Register value     = c_rarg1;  // value
 2439     const Register count     = c_rarg2;  // elements count
 2440 
 2441     const Register bz_base = r10;        // base for block_zero routine
 2442     const Register cnt_words = r11;      // temp register
 2443 
 2444     __ enter();
 2445 
 2446     Label L_fill_elements, L_exit1;
 2447 
 2448     int shift = -1;
 2449     switch (t) {
 2450       case T_BYTE:
 2451         shift = 0;
 2452         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2453         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2454         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2455         __ br(Assembler::LO, L_fill_elements);
 2456         break;
 2457       case T_SHORT:
 2458         shift = 1;
 2459         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2460         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2461         __ br(Assembler::LO, L_fill_elements);
 2462         break;
 2463       case T_INT:
 2464         shift = 2;
 2465         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2466         __ br(Assembler::LO, L_fill_elements);
 2467         break;
 2468       default: ShouldNotReachHere();
 2469     }
 2470 
 2471     // Align source address at 8 bytes address boundary.
 2472     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2473     if (!aligned) {
 2474       switch (t) {
 2475         case T_BYTE:
 2476           // One-byte misalignment happens only for byte arrays.
 2477           __ tbz(to, 0, L_skip_align1);
 2478           __ strb(value, Address(__ post(to, 1)));
 2479           __ subw(count, count, 1);
 2480           __ bind(L_skip_align1);
 2481           // Fallthrough
 2482         case T_SHORT:
 2483           // Two-byte misalignment happens only for byte and short (char) arrays.
 2484           __ tbz(to, 1, L_skip_align2);
 2485           __ strh(value, Address(__ post(to, 2)));
 2486           __ subw(count, count, 2 >> shift);
 2487           __ bind(L_skip_align2);
 2488           // Fallthrough
 2489         case T_INT:
 2490           // Align to 8 bytes; we know we are 4-byte aligned at this point.
 2491           __ tbz(to, 2, L_skip_align4);
 2492           __ strw(value, Address(__ post(to, 4)));
 2493           __ subw(count, count, 4 >> shift);
 2494           __ bind(L_skip_align4);
 2495           break;
 2496         default: ShouldNotReachHere();
 2497       }
 2498     }
 2499 
 2500     //
 2501     //  Fill large chunks
 2502     //
 2503     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2504     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2505     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
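          // count now holds the elements that will remain after the bulk word fill.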
 2506     if (UseBlockZeroing) {
 2507       Label non_block_zeroing, rest;
 2508       // If the fill value is zero we can use the fast zero_words().
 2509       __ cbnz(value, non_block_zeroing);
 2510       __ mov(bz_base, to);
 2511       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2512       address tpc = __ zero_words(bz_base, cnt_words);
 2513       if (tpc == nullptr) {
 2514         fatal("CodeCache is full at generate_fill");
 2515       }
 2516       __ b(rest);
 2517       __ bind(non_block_zeroing);
 2518       __ fill_words(to, cnt_words, value);
 2519       __ bind(rest);
 2520     } else {
 2521       __ fill_words(to, cnt_words, value);
 2522     }
 2523 
 2524     // Remaining count is less than 8 bytes. Fill it by a single store.
 2525     // Note that the total length is no less than 8 bytes.
 2526     if (t == T_BYTE || t == T_SHORT) {
 2527       Label L_exit1;
 2528       __ cbzw(count, L_exit1);
 2529       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2530       __ str(value, Address(to, -8));    // overwrite some elements
 2531       __ bind(L_exit1);
 2532       __ leave();
 2533       __ ret(lr);
 2534     }
 2535 
 2536     // Handle fills of less than 8 bytes.
 2537     Label L_fill_2, L_fill_4, L_exit2;
 2538     __ bind(L_fill_elements);
 2539     switch (t) {
 2540       case T_BYTE:
 2541         __ tbz(count, 0, L_fill_2);
 2542         __ strb(value, Address(__ post(to, 1)));
 2543         __ bind(L_fill_2);
 2544         __ tbz(count, 1, L_fill_4);
 2545         __ strh(value, Address(__ post(to, 2)));
 2546         __ bind(L_fill_4);
 2547         __ tbz(count, 2, L_exit2);
 2548         __ strw(value, Address(to));
 2549         break;
 2550       case T_SHORT:
 2551         __ tbz(count, 0, L_fill_4);
 2552         __ strh(value, Address(__ post(to, 2)));
 2553         __ bind(L_fill_4);
 2554         __ tbz(count, 1, L_exit2);
 2555         __ strw(value, Address(to));
 2556         break;
 2557       case T_INT:
 2558         __ cbzw(count, L_exit2);
 2559         __ strw(value, Address(to));
 2560         break;
 2561       default: ShouldNotReachHere();
 2562     }
 2563     __ bind(L_exit2);
 2564     __ leave();
 2565     __ ret(lr);
 2566     return start;
 2567   }
 2568 
 2569   address generate_unsafecopy_common_error_exit() {
 2570     address start_pc = __ pc();
 2571       __ leave();
 2572       __ mov(r0, 0);
 2573       __ ret(lr);
 2574     return start_pc;
 2575   }
 2576 
 2577   //
 2578   //  Generate 'unsafe' set memory stub
 2579   //  Though just as safe as the other stubs, it takes an unscaled
 2580   //  size_t (# bytes) argument instead of an element count.
 2581   //
 2582   //  This fill operation is atomicity preserving: as long as the
 2583   //  address supplied is sufficiently aligned, all writes of up to 64
 2584   //  bits in size are single-copy atomic.
 2585   //
 2586   //  Input:
 2587   //    c_rarg0   - destination array address
 2588   //    c_rarg1   - byte count (size_t)
 2589   //    c_rarg2   - byte value
 2590   //
 2591   address generate_unsafe_setmemory() {
 2592     __ align(CodeEntryAlignment);
 2593     StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
 2594     address start = __ pc();
 2595 
 2596     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 2597     Label tail;
 2598 
 2599     UnsafeMemoryAccessMark umam(this, true, false);
 2600 
 2601     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2602 
 2603     __ dup(v0, __ T16B, value);   // replicate the fill byte into all 16 byte lanes of v0
 2604 
 2605     if (AvoidUnalignedAccesses) {
 2606       __ cmp(count, (u1)16);
 2607       __ br(__ LO, tail);
 2608 
 2609       __ mov(rscratch1, 16);
 2610       __ andr(rscratch2, dest, 15);
 2611       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
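            // Store one (possibly unaligned) 16-byte block at the start, then
            // advance dest to the 16-byte boundary; later stores may overlap it.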
 2612       __ strq(v0, Address(dest));
 2613       __ sub(count, count, rscratch1);
 2614       __ add(dest, dest, rscratch1);
 2615     }
 2616 
 2617     __ subs(count, count, (u1)64);
 2618     __ br(__ LO, tail);
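          // Main loop: store 64 bytes per iteration while at least 64 bytes
          // remain (count is biased down by 64 at this point).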
 2619     {
 2620       Label again;
 2621       __ bind(again);
 2622       __ stpq(v0, v0, Address(dest));
 2623       __ stpq(v0, v0, Address(dest, 32));
 2624 
 2625       __ subs(count, count, 64);
 2626       __ add(dest, dest, 64);
 2627       __ br(__ HS, again);
 2628     }
 2629 
 2630     __ bind(tail);
 2631     // The count of bytes is off by 64, but we don't need to correct
 2632     // it because we're only going to use the least-significant few
 2633     // count bits from here on.
 2634     // __ add(count, count, 64);
 2635 
 2636     {
 2637       Label dont;
 2638       __ tbz(count, exact_log2(32), dont);
 2639       __ stpq(v0, v0, __ post(dest, 32));
 2640       __ bind(dont);
 2641     }
 2642     {
 2643       Label dont;
 2644       __ tbz(count, exact_log2(16), dont);
 2645       __ strq(v0, __ post(dest, 16));
 2646       __ bind(dont);
 2647     }
 2648     {
 2649       Label dont;
 2650       __ tbz(count, exact_log2(8), dont);
 2651       __ strd(v0, __ post(dest, 8));
 2652       __ bind(dont);
 2653     }
 2654 
 2655     Label finished;
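          // If the low three bits of count are clear there is no sub-8-byte tail to fill.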
 2656     __ tst(count, 7);
 2657     __ br(__ EQ, finished);
 2658 
 2659     {
 2660       Label dont;
 2661       __ tbz(count, exact_log2(4), dont);
 2662       __ strs(v0, __ post(dest, 4));
 2663       __ bind(dont);
 2664     }
 2665     {
 2666       Label dont;
 2667       __ tbz(count, exact_log2(2), dont);
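            // Duplicate the fill byte into bits 8..15 so the halfword store
            // writes it to both bytes.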
 2668       __ bfi(value, value, 8, 8);
 2669       __ strh(value, __ post(dest, 2));
 2670       __ bind(dont);
 2671     }
 2672     {
 2673       Label dont;
 2674       __ tbz(count, exact_log2(1), dont);
 2675       __ strb(value, Address(dest));
 2676       __ bind(dont);
 2677     }
 2678 
 2679     __ bind(finished);
 2680     __ leave();
 2681     __ ret(lr);
 2682 
 2683     return start;
 2684   }
 2685 
 2686   address generate_data_cache_writeback() {
 2687     const Register line        = c_rarg0;  // address of line to write back
 2688 
 2689     __ align(CodeEntryAlignment);
 2690 
 2691     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 2692     StubCodeMark mark(this, stub_id);
 2693 
 2694     address start = __ pc();
 2695     __ enter();
 2696     __ cache_wb(Address(line, 0));
 2697     __ leave();
 2698     __ ret(lr);
 2699 
 2700     return start;
 2701   }
 2702 
 2703   address generate_data_cache_writeback_sync() {
 2704     const Register is_pre     = c_rarg0;  // pre or post sync
 2705 
 2706     __ align(CodeEntryAlignment);
 2707 
 2708     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 2709     StubCodeMark mark(this, stub_id);
 2710 
 2711     // pre wbsync is a no-op
 2712     // post wbsync translates to a store fence
 2713 
 2714     Label skip;
 2715     address start = __ pc();
 2716     __ enter();
 2717     __ cbnz(is_pre, skip);
 2718     __ cache_wbsync(false);
 2719     __ bind(skip);
 2720     __ leave();
 2721     __ ret(lr);
 2722 
 2723     return start;
 2724   }
 2725 
 2726   void generate_arraycopy_stubs() {
 2727     address entry;
 2728     address entry_jbyte_arraycopy;
 2729     address entry_jshort_arraycopy;
 2730     address entry_jint_arraycopy;
 2731     address entry_oop_arraycopy;
 2732     address entry_jlong_arraycopy;
 2733     address entry_checkcast_arraycopy;
 2734 
 2735     // generate the common exit first so later stubs can rely on it if
 2736     // they want an UnsafeMemoryAccess exit non-local to the stub
 2737     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 2738     // register the stub as the default exit with class UnsafeMemoryAccess
 2739     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 2740 
 2741     generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
 2742     generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
 2743 
 2744     generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
 2745     generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
 2746 
 2747     generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
 2748     generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
 2749 
 2750     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2751 
 2752     //*** jbyte
 2753     // Always need aligned and unaligned versions
 2754     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &entry);
 2755     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
 2756     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &entry);
 2757     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, entry, nullptr);
 2758 
 2759     //*** jshort
 2760     // Always need aligned and unaligned versions
 2761     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &entry);
 2762     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
 2763     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &entry);
 2764     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, entry, nullptr);
 2765 
 2766     //*** jint
 2767     // Aligned versions
 2768     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &entry);
 2769     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2770     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2771     // entry_jint_arraycopy always points to the unaligned version
 2772     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &entry);
 2773     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2774 
 2775     //*** jlong
 2776     // It is always aligned
 2777     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &entry);
 2778     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
 2779     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2780     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2781 
 2782     //*** oops
 2783     {
 2784       // With compressed oops we need unaligned versions; notice that
 2785       // we overwrite entry_oop_arraycopy.
 2786       bool aligned = !UseCompressedOops;
 2787 
 2788       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2789         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &entry);
 2790       StubRoutines::_arrayof_oop_arraycopy
 2791         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
 2792       // Aligned versions without pre-barriers
 2793       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2794         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
 2795       StubRoutines::_arrayof_oop_arraycopy_uninit
 2796         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, entry, nullptr);
 2797     }
 2798 
 2799     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2800     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2801     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2802     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2803 
 2804     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 2805     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 2806 
 2807     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 2808                                                               entry_jshort_arraycopy,
 2809                                                               entry_jint_arraycopy,
 2810                                                               entry_jlong_arraycopy);
 2811 
 2812     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 2813                                                                entry_jshort_arraycopy,
 2814                                                                entry_jint_arraycopy,
 2815                                                                entry_oop_arraycopy,
 2816                                                                entry_jlong_arraycopy,
 2817                                                                entry_checkcast_arraycopy);
 2818 
 2819     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 2820     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 2821     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 2822     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 2823     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 2824     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 2825   }
 2826 
 2827   void generate_math_stubs() { Unimplemented(); }
 2828 
 2829   // Arguments:
 2830   //
 2831   // Inputs:
 2832   //   c_rarg0   - source byte array address
 2833   //   c_rarg1   - destination byte array address
 2834   //   c_rarg2   - K (key) in little endian int array
 2835   //
  2836   address generate_aescrypt_encryptBlock() {
           assert(UseAES, "need AES cryptographic extension support");
  2837     __ align(CodeEntryAlignment);
 2838     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 2839     StubCodeMark mark(this, stub_id);
 2840 
 2841     const Register from        = c_rarg0;  // source array address
 2842     const Register to          = c_rarg1;  // destination array address
 2843     const Register key         = c_rarg2;  // key array address
 2844     const Register keylen      = rscratch1;
 2845 
 2846     address start = __ pc();
 2847     __ enter();
 2848 
 2849     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2850 
 2851     __ aesenc_loadkeys(key, keylen);
 2852     __ aesecb_encrypt(from, to, keylen);
 2853 
 2854     __ mov(r0, 0);
 2855 
 2856     __ leave();
 2857     __ ret(lr);
 2858 
 2859     return start;
 2860   }
 2861 
 2862   // Arguments:
 2863   //
 2864   // Inputs:
 2865   //   c_rarg0   - source byte array address
 2866   //   c_rarg1   - destination byte array address
 2867   //   c_rarg2   - K (key) in little endian int array
 2868   //
 2869   address generate_aescrypt_decryptBlock() {
 2870     assert(UseAES, "need AES cryptographic extension support");
 2871     __ align(CodeEntryAlignment);
 2872     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 2873     StubCodeMark mark(this, stub_id);
 2874     Label L_doLast;
 2875 
 2876     const Register from        = c_rarg0;  // source array address
 2877     const Register to          = c_rarg1;  // destination array address
 2878     const Register key         = c_rarg2;  // key array address
 2879     const Register keylen      = rscratch1;
 2880 
 2881     address start = __ pc();
 2882     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2883 
 2884     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2885 
 2886     __ aesecb_decrypt(from, to, key, keylen);
 2887 
 2888     __ mov(r0, 0);
 2889 
 2890     __ leave();
 2891     __ ret(lr);
 2892 
 2893     return start;
 2894   }
 2895 
 2896   // Arguments:
 2897   //
 2898   // Inputs:
 2899   //   c_rarg0   - source byte array address
 2900   //   c_rarg1   - destination byte array address
 2901   //   c_rarg2   - K (key) in little endian int array
 2902   //   c_rarg3   - r vector byte array address
 2903   //   c_rarg4   - input length
 2904   //
 2905   // Output:
 2906   //   x0        - input length
 2907   //
 2908   address generate_cipherBlockChaining_encryptAESCrypt() {
 2909     assert(UseAES, "need AES cryptographic extension support");
 2910     __ align(CodeEntryAlignment);
 2911     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 2912     StubCodeMark mark(this, stub_id);
 2913 
 2914     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2915 
 2916     const Register from        = c_rarg0;  // source array address
 2917     const Register to          = c_rarg1;  // destination array address
 2918     const Register key         = c_rarg2;  // key array address
  2919     const Register rvec        = c_rarg3;  // r vector byte array, initialized from the init vector
  2920                                            // and left holding the last encrypted block on return
 2921     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2922     const Register keylen      = rscratch1;
 2923 
 2924     address start = __ pc();
 2925 
 2926       __ enter();
 2927 
 2928       __ movw(rscratch2, len_reg);
 2929 
 2930       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2931 
 2932       __ ld1(v0, __ T16B, rvec);
 2933 
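             // The key schedule holds 44, 52 or 60 32-bit round-key words for
             // AES-128, AES-192 or AES-256; branch so the shorter schedules skip
             // the round keys they do not have.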
 2934       __ cmpw(keylen, 52);
 2935       __ br(Assembler::CC, L_loadkeys_44);
 2936       __ br(Assembler::EQ, L_loadkeys_52);
 2937 
 2938       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2939       __ rev32(v17, __ T16B, v17);
 2940       __ rev32(v18, __ T16B, v18);
 2941     __ BIND(L_loadkeys_52);
 2942       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2943       __ rev32(v19, __ T16B, v19);
 2944       __ rev32(v20, __ T16B, v20);
 2945     __ BIND(L_loadkeys_44);
 2946       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2947       __ rev32(v21, __ T16B, v21);
 2948       __ rev32(v22, __ T16B, v22);
 2949       __ rev32(v23, __ T16B, v23);
 2950       __ rev32(v24, __ T16B, v24);
 2951       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2952       __ rev32(v25, __ T16B, v25);
 2953       __ rev32(v26, __ T16B, v26);
 2954       __ rev32(v27, __ T16B, v27);
 2955       __ rev32(v28, __ T16B, v28);
 2956       __ ld1(v29, v30, v31, __ T16B, key);
 2957       __ rev32(v29, __ T16B, v29);
 2958       __ rev32(v30, __ T16B, v30);
 2959       __ rev32(v31, __ T16B, v31);
 2960 
 2961     __ BIND(L_aes_loop);
 2962       __ ld1(v1, __ T16B, __ post(from, 16));
 2963       __ eor(v0, __ T16B, v0, v1);
 2964 
 2965       __ br(Assembler::CC, L_rounds_44);
 2966       __ br(Assembler::EQ, L_rounds_52);
 2967 
 2968       __ aese(v0, v17); __ aesmc(v0, v0);
 2969       __ aese(v0, v18); __ aesmc(v0, v0);
 2970     __ BIND(L_rounds_52);
 2971       __ aese(v0, v19); __ aesmc(v0, v0);
 2972       __ aese(v0, v20); __ aesmc(v0, v0);
 2973     __ BIND(L_rounds_44);
 2974       __ aese(v0, v21); __ aesmc(v0, v0);
 2975       __ aese(v0, v22); __ aesmc(v0, v0);
 2976       __ aese(v0, v23); __ aesmc(v0, v0);
 2977       __ aese(v0, v24); __ aesmc(v0, v0);
 2978       __ aese(v0, v25); __ aesmc(v0, v0);
 2979       __ aese(v0, v26); __ aesmc(v0, v0);
 2980       __ aese(v0, v27); __ aesmc(v0, v0);
 2981       __ aese(v0, v28); __ aesmc(v0, v0);
 2982       __ aese(v0, v29); __ aesmc(v0, v0);
 2983       __ aese(v0, v30);
 2984       __ eor(v0, __ T16B, v0, v31);
 2985 
 2986       __ st1(v0, __ T16B, __ post(to, 16));
 2987 
 2988       __ subw(len_reg, len_reg, 16);
 2989       __ cbnzw(len_reg, L_aes_loop);
 2990 
 2991       __ st1(v0, __ T16B, rvec);
 2992 
 2993       __ mov(r0, rscratch2);
 2994 
 2995       __ leave();
 2996       __ ret(lr);
 2997 
  2998     return start;
 2999   }
 3000 
 3001   // Arguments:
 3002   //
 3003   // Inputs:
 3004   //   c_rarg0   - source byte array address
 3005   //   c_rarg1   - destination byte array address
 3006   //   c_rarg2   - K (key) in little endian int array
 3007   //   c_rarg3   - r vector byte array address
 3008   //   c_rarg4   - input length
 3009   //
 3010   // Output:
 3011   //   r0        - input length
 3012   //
 3013   address generate_cipherBlockChaining_decryptAESCrypt() {
 3014     assert(UseAES, "need AES cryptographic extension support");
 3015     __ align(CodeEntryAlignment);
 3016     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3017     StubCodeMark mark(this, stub_id);
 3018 
 3019     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3020 
 3021     const Register from        = c_rarg0;  // source array address
 3022     const Register to          = c_rarg1;  // destination array address
 3023     const Register key         = c_rarg2;  // key array address
  3024     const Register rvec        = c_rarg3;  // r vector byte array, initialized from the init vector
  3025                                            // and left holding the last ciphertext block on return
 3026     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3027     const Register keylen      = rscratch1;
 3028 
 3029     address start = __ pc();
 3030 
 3031       __ enter();
 3032 
 3033       __ movw(rscratch2, len_reg);
 3034 
 3035       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3036 
 3037       __ ld1(v2, __ T16B, rvec);
 3038 
 3039       __ ld1(v31, __ T16B, __ post(key, 16));
 3040       __ rev32(v31, __ T16B, v31);
 3041 
 3042       __ cmpw(keylen, 52);
 3043       __ br(Assembler::CC, L_loadkeys_44);
 3044       __ br(Assembler::EQ, L_loadkeys_52);
 3045 
 3046       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3047       __ rev32(v17, __ T16B, v17);
 3048       __ rev32(v18, __ T16B, v18);
 3049     __ BIND(L_loadkeys_52);
 3050       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3051       __ rev32(v19, __ T16B, v19);
 3052       __ rev32(v20, __ T16B, v20);
 3053     __ BIND(L_loadkeys_44);
 3054       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3055       __ rev32(v21, __ T16B, v21);
 3056       __ rev32(v22, __ T16B, v22);
 3057       __ rev32(v23, __ T16B, v23);
 3058       __ rev32(v24, __ T16B, v24);
 3059       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3060       __ rev32(v25, __ T16B, v25);
 3061       __ rev32(v26, __ T16B, v26);
 3062       __ rev32(v27, __ T16B, v27);
 3063       __ rev32(v28, __ T16B, v28);
 3064       __ ld1(v29, v30, __ T16B, key);
 3065       __ rev32(v29, __ T16B, v29);
 3066       __ rev32(v30, __ T16B, v30);
 3067 
 3068     __ BIND(L_aes_loop);
 3069       __ ld1(v0, __ T16B, __ post(from, 16));
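             // Keep a copy of the input ciphertext block (orr with itself is a
             // vector move); it becomes the chaining value for the next block.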
 3070       __ orr(v1, __ T16B, v0, v0);
 3071 
 3072       __ br(Assembler::CC, L_rounds_44);
 3073       __ br(Assembler::EQ, L_rounds_52);
 3074 
 3075       __ aesd(v0, v17); __ aesimc(v0, v0);
 3076       __ aesd(v0, v18); __ aesimc(v0, v0);
 3077     __ BIND(L_rounds_52);
 3078       __ aesd(v0, v19); __ aesimc(v0, v0);
 3079       __ aesd(v0, v20); __ aesimc(v0, v0);
 3080     __ BIND(L_rounds_44);
 3081       __ aesd(v0, v21); __ aesimc(v0, v0);
 3082       __ aesd(v0, v22); __ aesimc(v0, v0);
 3083       __ aesd(v0, v23); __ aesimc(v0, v0);
 3084       __ aesd(v0, v24); __ aesimc(v0, v0);
 3085       __ aesd(v0, v25); __ aesimc(v0, v0);
 3086       __ aesd(v0, v26); __ aesimc(v0, v0);
 3087       __ aesd(v0, v27); __ aesimc(v0, v0);
 3088       __ aesd(v0, v28); __ aesimc(v0, v0);
 3089       __ aesd(v0, v29); __ aesimc(v0, v0);
 3090       __ aesd(v0, v30);
 3091       __ eor(v0, __ T16B, v0, v31);
 3092       __ eor(v0, __ T16B, v0, v2);
 3093 
 3094       __ st1(v0, __ T16B, __ post(to, 16));
 3095       __ orr(v2, __ T16B, v1, v1);
 3096 
 3097       __ subw(len_reg, len_reg, 16);
 3098       __ cbnzw(len_reg, L_aes_loop);
 3099 
 3100       __ st1(v2, __ T16B, rvec);
 3101 
 3102       __ mov(r0, rscratch2);
 3103 
 3104       __ leave();
 3105       __ ret(lr);
 3106 
 3107     return start;
 3108   }
 3109 
  3110   // Big-endian 128-bit + 64-bit -> 128-bit addition.
  3111   // Inputs: in (the 128-bit addend) and inc (the 64-bit increment); both are
  3112   // preserved, and the lower dword of inc must be zero.
  3113   // The least-significant 64-bit word sits in the upper dword of each vector.
  3114   // Output: result
 3115   void be_add_128_64(FloatRegister result, FloatRegister in,
 3116                      FloatRegister inc, FloatRegister tmp) {
 3117     assert_different_registers(result, tmp, inc);
 3118 
 3119     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3120                                            // input
 3121     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
 3122     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3123                                            // MSD == 0 (must be!) to LSD
 3124     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3125   }
 3126 
 3127   // CTR AES crypt.
 3128   // Arguments:
 3129   //
 3130   // Inputs:
 3131   //   c_rarg0   - source byte array address
 3132   //   c_rarg1   - destination byte array address
 3133   //   c_rarg2   - K (key) in little endian int array
 3134   //   c_rarg3   - counter vector byte array address
 3135   //   c_rarg4   - input length
 3136   //   c_rarg5   - saved encryptedCounter start
 3137   //   c_rarg6   - saved used length
 3138   //
 3139   // Output:
 3140   //   r0       - input length
 3141   //
 3142   address generate_counterMode_AESCrypt() {
 3143     const Register in = c_rarg0;
 3144     const Register out = c_rarg1;
 3145     const Register key = c_rarg2;
 3146     const Register counter = c_rarg3;
 3147     const Register saved_len = c_rarg4, len = r10;
 3148     const Register saved_encrypted_ctr = c_rarg5;
 3149     const Register used_ptr = c_rarg6, used = r12;
 3150 
 3151     const Register offset = r7;
 3152     const Register keylen = r11;
 3153 
 3154     const unsigned char block_size = 16;
 3155     const int bulk_width = 4;
 3156     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3157     // performance with larger data sizes, but it also means that the
 3158     // fast path isn't used until you have at least 8 blocks, and up
  3159     // to 127 bytes of data will be processed on the slow path. For
 3160     // that reason, and also so as not to blow away too much icache, 4
 3161     // blocks seems like a sensible compromise.
 3162 
 3163     // Algorithm:
 3164     //
 3165     //    if (len == 0) {
 3166     //        goto DONE;
 3167     //    }
 3168     //    int result = len;
 3169     //    do {
 3170     //        if (used >= blockSize) {
 3171     //            if (len >= bulk_width * blockSize) {
 3172     //                CTR_large_block();
 3173     //                if (len == 0)
 3174     //                    goto DONE;
 3175     //            }
 3176     //            for (;;) {
 3177     //                16ByteVector v0 = counter;
 3178     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3179     //                used = 0;
 3180     //                if (len < blockSize)
 3181     //                    break;    /* goto NEXT */
 3182     //                16ByteVector v1 = load16Bytes(in, offset);
 3183     //                v1 = v1 ^ encryptedCounter;
 3184     //                store16Bytes(out, offset);
 3185     //                used = blockSize;
 3186     //                offset += blockSize;
 3187     //                len -= blockSize;
 3188     //                if (len == 0)
 3189     //                    goto DONE;
 3190     //            }
 3191     //        }
 3192     //      NEXT:
 3193     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3194     //        len--;
 3195     //    } while (len != 0);
 3196     //  DONE:
 3197     //    return result;
 3198     //
 3199     // CTR_large_block()
 3200     //    Wide bulk encryption of whole blocks.
 3201 
 3202     __ align(CodeEntryAlignment);
 3203     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3204     StubCodeMark mark(this, stub_id);
 3205     const address start = __ pc();
 3206     __ enter();
 3207 
 3208     Label DONE, CTR_large_block, large_block_return;
 3209     __ ldrw(used, Address(used_ptr));
 3210     __ cbzw(saved_len, DONE);
 3211 
 3212     __ mov(len, saved_len);
 3213     __ mov(offset, 0);
 3214 
 3215     // Compute #rounds for AES based on the length of the key array
 3216     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3217 
 3218     __ aesenc_loadkeys(key, keylen);
 3219 
 3220     {
 3221       Label L_CTR_loop, NEXT;
 3222 
 3223       __ bind(L_CTR_loop);
 3224 
 3225       __ cmp(used, block_size);
 3226       __ br(__ LO, NEXT);
 3227 
 3228       // Maybe we have a lot of data
 3229       __ subsw(rscratch1, len, bulk_width * block_size);
 3230       __ br(__ HS, CTR_large_block);
 3231       __ BIND(large_block_return);
 3232       __ cbzw(len, DONE);
 3233 
 3234       // Setup the counter
 3235       __ movi(v4, __ T4S, 0);
 3236       __ movi(v5, __ T4S, 1);
 3237       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3238 
 3239       // 128-bit big-endian increment
 3240       __ ld1(v0, __ T16B, counter);
 3241       __ rev64(v16, __ T16B, v0);
 3242       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3243       __ rev64(v16, __ T16B, v16);
 3244       __ st1(v16, __ T16B, counter);
 3245       // Previous counter value is in v0
 3246       // v4 contains { 0, 1 }
 3247 
 3248       {
 3249         // We have fewer than bulk_width blocks of data left. Encrypt
 3250         // them one by one until there is less than a full block
 3251         // remaining, being careful to save both the encrypted counter
 3252         // and the counter.
 3253 
 3254         Label inner_loop;
 3255         __ bind(inner_loop);
 3256         // Counter to encrypt is in v0
 3257         __ aesecb_encrypt(noreg, noreg, keylen);
 3258         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3259 
 3260         // Do we have a remaining full block?
 3261 
 3262         __ mov(used, 0);
 3263         __ cmp(len, block_size);
 3264         __ br(__ LO, NEXT);
 3265 
 3266         // Yes, we have a full block
 3267         __ ldrq(v1, Address(in, offset));
 3268         __ eor(v1, __ T16B, v1, v0);
 3269         __ strq(v1, Address(out, offset));
 3270         __ mov(used, block_size);
 3271         __ add(offset, offset, block_size);
 3272 
 3273         __ subw(len, len, block_size);
 3274         __ cbzw(len, DONE);
 3275 
 3276         // Increment the counter, store it back
 3277         __ orr(v0, __ T16B, v16, v16);
 3278         __ rev64(v16, __ T16B, v16);
 3279         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3280         __ rev64(v16, __ T16B, v16);
 3281         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3282 
 3283         __ b(inner_loop);
 3284       }
 3285 
 3286       __ BIND(NEXT);
 3287 
 3288       // Encrypt a single byte, and loop.
 3289       // We expect this to be a rare event.
 3290       __ ldrb(rscratch1, Address(in, offset));
 3291       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3292       __ eor(rscratch1, rscratch1, rscratch2);
 3293       __ strb(rscratch1, Address(out, offset));
 3294       __ add(offset, offset, 1);
 3295       __ add(used, used, 1);
  3296       __ subw(len, len, 1);
 3297       __ cbnzw(len, L_CTR_loop);
 3298     }
 3299 
 3300     __ bind(DONE);
 3301     __ strw(used, Address(used_ptr));
 3302     __ mov(r0, saved_len);
 3303 
 3304     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3305     __ ret(lr);
 3306 
 3307     // Bulk encryption
 3308 
  3309     __ BIND(CTR_large_block);
 3310     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3311 
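           // The lower halves of v8..v15 are callee-saved under the AAPCS64, so
           // save the registers the bulk loop clobbers (v12..v15 are needed only
           // when bulk_width == 8) and restore them afterwards.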
 3312     if (bulk_width == 8) {
 3313       __ sub(sp, sp, 4 * 16);
 3314       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3315     }
 3316     __ sub(sp, sp, 4 * 16);
 3317     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3318     RegSet saved_regs = (RegSet::of(in, out, offset)
 3319                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3320     __ push(saved_regs, sp);
 3321     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3322     __ add(in, in, offset);
 3323     __ add(out, out, offset);
 3324 
 3325     // Keys should already be loaded into the correct registers
 3326 
 3327     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3328     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3329 
 3330     // AES/CTR loop
 3331     {
 3332       Label L_CTR_loop;
 3333       __ BIND(L_CTR_loop);
 3334 
 3335       // Setup the counters
 3336       __ movi(v8, __ T4S, 0);
 3337       __ movi(v9, __ T4S, 1);
 3338       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3339 
 3340       for (int i = 0; i < bulk_width; i++) {
 3341         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3342         __ rev64(v0_ofs, __ T16B, v16);
 3343         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3344       }
 3345 
 3346       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3347 
 3348       // Encrypt the counters
 3349       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3350 
 3351       if (bulk_width == 8) {
 3352         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3353       }
 3354 
 3355       // XOR the encrypted counters with the inputs
 3356       for (int i = 0; i < bulk_width; i++) {
 3357         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3358         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3359         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3360       }
 3361 
 3362       // Write the encrypted data
 3363       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3364       if (bulk_width == 8) {
 3365         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3366       }
 3367 
 3368       __ subw(len, len, 16 * bulk_width);
 3369       __ cbnzw(len, L_CTR_loop);
 3370     }
 3371 
 3372     // Save the counter back where it goes
 3373     __ rev64(v16, __ T16B, v16);
 3374     __ st1(v16, __ T16B, counter);
 3375 
 3376     __ pop(saved_regs, sp);
 3377 
 3378     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3379     if (bulk_width == 8) {
 3380       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3381     }
 3382 
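           // The pop above restored the length that was live when we branched here;
           // subtract the bytes the bulk loop handled and advance offset to match.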
 3383     __ andr(rscratch1, len, -16 * bulk_width);
 3384     __ sub(len, len, rscratch1);
 3385     __ add(offset, offset, rscratch1);
 3386     __ mov(used, 16);
 3387     __ strw(used, Address(used_ptr));
 3388     __ b(large_block_return);
 3389 
 3390     return start;
 3391   }
 3392 
 3393   // Vector AES Galois Counter Mode implementation. Parameters:
 3394   //
 3395   // in = c_rarg0
 3396   // len = c_rarg1
 3397   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3398   // out = c_rarg3
 3399   // key = c_rarg4
 3400   // state = c_rarg5 - GHASH.state
 3401   // subkeyHtbl = c_rarg6 - powers of H
 3402   // counter = c_rarg7 - 16 bytes of CTR
 3403   // return - number of processed bytes
 3404   address generate_galoisCounterMode_AESCrypt() {
 3405     address ghash_polynomial = __ pc();
 3406     __ emit_int64(0x87);  // The low-order bits of the field
 3407                           // polynomial (i.e. p = z^7+z^2+z+1)
 3408                           // repeated in the low and high parts of a
 3409                           // 128-bit vector
 3410     __ emit_int64(0x87);
 3411 
 3412     __ align(CodeEntryAlignment);
 3413     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3414     StubCodeMark mark(this, stub_id);
 3415     address start = __ pc();
 3416     __ enter();
 3417 
 3418     const Register in = c_rarg0;
 3419     const Register len = c_rarg1;
 3420     const Register ct = c_rarg2;
 3421     const Register out = c_rarg3;
 3423 
 3424     const Register key = c_rarg4;
 3425     const Register state = c_rarg5;
 3426 
 3427     const Register subkeyHtbl = c_rarg6;
 3428 
  3429     const Register counter = c_rarg7;  // updated with the incremented counter in the end
 3430 
 3431     const Register keylen = r10;
 3432     // Save state before entering routine
 3433     __ sub(sp, sp, 4 * 16);
 3434     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3435     __ sub(sp, sp, 4 * 16);
 3436     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3437 
 3439     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
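           // Stash the rounded-down byte count; it is reloaded at DONE and returned in r0.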
 3440     __ str(len, __ pre(sp, -2 * wordSize));
 3441 
 3442     Label DONE;
 3443     __ cbz(len, DONE);
 3444 
 3445     // Compute #rounds for AES based on the length of the key array
 3446     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3447 
 3448     __ aesenc_loadkeys(key, keylen);
 3449     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3450     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3451 
 3452     // AES/CTR loop
 3453     {
 3454       Label L_CTR_loop;
 3455       __ BIND(L_CTR_loop);
 3456 
 3457       // Setup the counters
 3458       __ movi(v8, __ T4S, 0);
 3459       __ movi(v9, __ T4S, 1);
 3460       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3461 
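             // GCM increments only the low-order 32 bits of the counter (inc32):
             // v16 holds the byte-reversed counter, so adding { 0, 0, 0, 1 } as 4S
             // lanes bumps exactly that word each time round the unrolled sequence.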
 3462       assert(v0->encoding() < v8->encoding(), "");
 3463       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3464         FloatRegister f = as_FloatRegister(i);
 3465         __ rev32(f, __ T16B, v16);
 3466         __ addv(v16, __ T4S, v16, v8);
 3467       }
 3468 
 3469       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3470 
 3471       // Encrypt the counters
 3472       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3473 
 3474       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3475 
 3476       // XOR the encrypted counters with the inputs
 3477       for (int i = 0; i < 8; i++) {
 3478         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3479         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3480         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3481       }
 3482       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3483       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3484 
 3485       __ subw(len, len, 16 * 8);
 3486       __ cbnzw(len, L_CTR_loop);
 3487     }
 3488 
 3489     __ rev32(v16, __ T16B, v16);
 3490     __ st1(v16, __ T16B, counter);
 3491 
 3492     __ ldr(len, Address(sp));
 3493     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3494 
 3495     // GHASH/CTR loop
 3496     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3497                                 len, /*unrolls*/4);
 3498 
 3499 #ifdef ASSERT
 3500     { Label L;
 3501       __ cmp(len, (unsigned char)0);
 3502       __ br(Assembler::EQ, L);
 3503       __ stop("stubGenerator: abort");
 3504       __ bind(L);
  3505     }
 3506 #endif
 3507 
  3508     __ bind(DONE);
 3509     // Return the number of bytes processed
 3510     __ ldr(r0, __ post(sp, 2 * wordSize));
 3511 
 3512     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3513     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3514 
 3515     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3516     __ ret(lr);
  3517     return start;
 3518   }
 3519 
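         // Caches one 64-byte block of input in eight 64-bit general-purpose
         // registers: gen_loads fills the cache and extract_u32 pulls out single
         // 4-byte words with ubfx, avoiding repeated memory loads in the MD5 rounds.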
 3520   class Cached64Bytes {
 3521   private:
 3522     MacroAssembler *_masm;
 3523     Register _regs[8];
 3524 
 3525   public:
 3526     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
  3527       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3528       auto it = rs.begin();
 3529       for (auto &r: _regs) {
 3530         r = *it;
 3531         ++it;
 3532       }
 3533     }
 3534 
 3535     void gen_loads(Register base) {
 3536       for (int i = 0; i < 8; i += 2) {
 3537         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3538       }
 3539     }
 3540 
 3541     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3542     void extract_u32(Register dest, int i) {
 3543       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3544     }
 3545   };
 3546 
 3547   // Utility routines for md5.
 3548   // Clobbers r10 and r11.
 3549   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3550               int k, int s, int t) {
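           // F(b, c, d) = (b & c) | (~b & d), computed here as ((c ^ d) & b) ^ d.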
 3551     Register rscratch3 = r10;
 3552     Register rscratch4 = r11;
 3553 
 3554     __ eorw(rscratch3, r3, r4);
 3555     __ movw(rscratch2, t);
 3556     __ andw(rscratch3, rscratch3, r2);
 3557     __ addw(rscratch4, r1, rscratch2);
 3558     reg_cache.extract_u32(rscratch1, k);
 3559     __ eorw(rscratch3, rscratch3, r4);
 3560     __ addw(rscratch4, rscratch4, rscratch1);
 3561     __ addw(rscratch3, rscratch3, rscratch4);
 3562     __ rorw(rscratch2, rscratch3, 32 - s);
 3563     __ addw(r1, rscratch2, r2);
 3564   }
 3565 
 3566   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3567               int k, int s, int t) {
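           // G(b, c, d) = (b & d) | (c & ~d); the two terms are bit-disjoint, so
           // plain additions are used instead of an OR.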
 3568     Register rscratch3 = r10;
 3569     Register rscratch4 = r11;
 3570 
 3571     reg_cache.extract_u32(rscratch1, k);
 3572     __ movw(rscratch2, t);
 3573     __ addw(rscratch4, r1, rscratch2);
 3574     __ addw(rscratch4, rscratch4, rscratch1);
 3575     __ bicw(rscratch2, r3, r4);
 3576     __ andw(rscratch3, r2, r4);
 3577     __ addw(rscratch2, rscratch2, rscratch4);
 3578     __ addw(rscratch2, rscratch2, rscratch3);
 3579     __ rorw(rscratch2, rscratch2, 32 - s);
 3580     __ addw(r1, rscratch2, r2);
 3581   }
 3582 
 3583   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3584               int k, int s, int t) {
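           // H(b, c, d) = b ^ c ^ d.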
 3585     Register rscratch3 = r10;
 3586     Register rscratch4 = r11;
 3587 
 3588     __ eorw(rscratch3, r3, r4);
 3589     __ movw(rscratch2, t);
 3590     __ addw(rscratch4, r1, rscratch2);
 3591     reg_cache.extract_u32(rscratch1, k);
 3592     __ eorw(rscratch3, rscratch3, r2);
 3593     __ addw(rscratch4, rscratch4, rscratch1);
 3594     __ addw(rscratch3, rscratch3, rscratch4);
 3595     __ rorw(rscratch2, rscratch3, 32 - s);
 3596     __ addw(r1, rscratch2, r2);
 3597   }
 3598 
 3599   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3600               int k, int s, int t) {
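           // I(b, c, d) = c ^ (b | ~d).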
 3601     Register rscratch3 = r10;
 3602     Register rscratch4 = r11;
 3603 
 3604     __ movw(rscratch3, t);
 3605     __ ornw(rscratch2, r2, r4);
 3606     __ addw(rscratch4, r1, rscratch3);
 3607     reg_cache.extract_u32(rscratch1, k);
 3608     __ eorw(rscratch3, rscratch2, r3);
 3609     __ addw(rscratch4, rscratch4, rscratch1);
 3610     __ addw(rscratch3, rscratch3, rscratch4);
 3611     __ rorw(rscratch2, rscratch3, 32 - s);
 3612     __ addw(r1, rscratch2, r2);
 3613   }
 3614 
 3615   // Arguments:
 3616   //
 3617   // Inputs:
 3618   //   c_rarg0   - byte[]  source+offset
 3619   //   c_rarg1   - int[]   SHA.state
 3620   //   c_rarg2   - int     offset
 3621   //   c_rarg3   - int     limit
 3622   //
 3623   address generate_md5_implCompress(StubId stub_id) {
 3624     bool multi_block;
 3625     switch (stub_id) {
 3626     case StubId::stubgen_md5_implCompress_id:
 3627       multi_block = false;
 3628       break;
 3629     case StubId::stubgen_md5_implCompressMB_id:
 3630       multi_block = true;
 3631       break;
 3632     default:
 3633       ShouldNotReachHere();
 3634     }
 3635     __ align(CodeEntryAlignment);
 3636 
 3637     StubCodeMark mark(this, stub_id);
 3638     address start = __ pc();
 3639 
 3640     Register buf       = c_rarg0;
 3641     Register state     = c_rarg1;
 3642     Register ofs       = c_rarg2;
 3643     Register limit     = c_rarg3;
 3644     Register a         = r4;
 3645     Register b         = r5;
 3646     Register c         = r6;
 3647     Register d         = r7;
 3648     Register rscratch3 = r10;
 3649     Register rscratch4 = r11;
 3650 
 3651     Register state_regs[2] = { r12, r13 };
 3652     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3653     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3654 
 3655     __ push(saved_regs, sp);
 3656 
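           // The four 32-bit state words (a, b, c, d) travel packed two per 64-bit
           // register; unpack them for the rounds and repack them before storing.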
 3657     __ ldp(state_regs[0], state_regs[1], Address(state));
 3658     __ ubfx(a, state_regs[0],  0, 32);
 3659     __ ubfx(b, state_regs[0], 32, 32);
 3660     __ ubfx(c, state_regs[1],  0, 32);
 3661     __ ubfx(d, state_regs[1], 32, 32);
 3662 
 3663     Label md5_loop;
 3664     __ BIND(md5_loop);
 3665 
 3666     reg_cache.gen_loads(buf);
 3667 
 3668     // Round 1
 3669     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3670     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3671     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3672     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3673     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3674     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3675     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3676     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3677     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3678     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3679     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3680     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3681     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3682     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3683     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3684     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3685 
 3686     // Round 2
 3687     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3688     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3689     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3690     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3691     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3692     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3693     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3694     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3695     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3696     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3697     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3698     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3699     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3700     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3701     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3702     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3703 
 3704     // Round 3
 3705     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3706     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3707     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3708     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3709     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3710     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3711     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3712     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3713     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3714     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3715     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3716     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3717     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3718     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3719     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3720     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3721 
 3722     // Round 4
 3723     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3724     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3725     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3726     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3727     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3728     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3729     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3730     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3731     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3732     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3733     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3734     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3735     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3736     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3737     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3738     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3739 
 3740     __ addw(a, state_regs[0], a);
 3741     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3742     __ addw(b, rscratch2, b);
 3743     __ addw(c, state_regs[1], c);
 3744     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3745     __ addw(d, rscratch4, d);
 3746 
 3747     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3748     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3749 
 3750     if (multi_block) {
 3751       __ add(buf, buf, 64);
 3752       __ add(ofs, ofs, 64);
 3753       __ cmp(ofs, limit);
 3754       __ br(Assembler::LE, md5_loop);
 3755       __ mov(c_rarg0, ofs); // return ofs
 3756     }
 3757 
 3758     // write hash values back in the correct order
 3759     __ stp(state_regs[0], state_regs[1], Address(state));
 3760 
 3761     __ pop(saved_regs, sp);
 3762 
 3763     __ ret(lr);
 3764 
 3765     return start;
 3766   }
 3767 
 3768   // Arguments:
 3769   //
 3770   // Inputs:
 3771   //   c_rarg0   - byte[]  source+offset
 3772   //   c_rarg1   - int[]   SHA.state
 3773   //   c_rarg2   - int     offset
 3774   //   c_rarg3   - int     limit
 3775   //
 3776   address generate_sha1_implCompress(StubId stub_id) {
 3777     bool multi_block;
 3778     switch (stub_id) {
 3779     case StubId::stubgen_sha1_implCompress_id:
 3780       multi_block = false;
 3781       break;
 3782     case StubId::stubgen_sha1_implCompressMB_id:
 3783       multi_block = true;
 3784       break;
 3785     default:
 3786       ShouldNotReachHere();
 3787     }
 3788 
 3789     __ align(CodeEntryAlignment);
 3790 
 3791     StubCodeMark mark(this, stub_id);
 3792     address start = __ pc();
 3793 
 3794     Register buf   = c_rarg0;
 3795     Register state = c_rarg1;
 3796     Register ofs   = c_rarg2;
 3797     Register limit = c_rarg3;
 3798 
 3799     Label keys;
 3800     Label sha1_loop;
 3801 
 3802     // load the keys into v0..v3
 3803     __ adr(rscratch1, keys);
 3804     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3805     // load 5 words state into v6, v7
 3806     __ ldrq(v6, Address(state, 0));
 3807     __ ldrs(v7, Address(state, 16));
 3808 
 3809 
 3810     __ BIND(sha1_loop);
 3811     // load 64 bytes of data into v16..v19
 3812     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3813     __ rev32(v16, __ T16B, v16);
 3814     __ rev32(v17, __ T16B, v17);
 3815     __ rev32(v18, __ T16B, v18);
 3816     __ rev32(v19, __ T16B, v19);
 3817 
 3818     // do the sha1
 3819     __ addv(v4, __ T4S, v16, v0);
 3820     __ orr(v20, __ T16B, v6, v6);
 3821 
 3822     FloatRegister d0 = v16;
 3823     FloatRegister d1 = v17;
 3824     FloatRegister d2 = v18;
 3825     FloatRegister d3 = v19;
 3826 
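           // 20 iterations of four SHA-1 rounds each: quad-rounds 0-4 use the Ch
           // function (sha1c), 5-9 and 15-19 use Parity (sha1p), 10-14 use Maj (sha1m).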
 3827     for (int round = 0; round < 20; round++) {
 3828       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3829       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3830       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3831       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3832       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3833 
 3834       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3835       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3836       __ sha1h(tmp2, __ T4S, v20);
 3837       if (round < 5)
 3838         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3839       else if (round < 10 || round >= 15)
 3840         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3841       else
 3842         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3843       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3844 
 3845       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3846     }
 3847 
 3848     __ addv(v7, __ T2S, v7, v21);
 3849     __ addv(v6, __ T4S, v6, v20);
 3850 
 3851     if (multi_block) {
 3852       __ add(ofs, ofs, 64);
 3853       __ cmp(ofs, limit);
 3854       __ br(Assembler::LE, sha1_loop);
 3855       __ mov(c_rarg0, ofs); // return ofs
 3856     }
 3857 
 3858     __ strq(v6, Address(state, 0));
 3859     __ strs(v7, Address(state, 16));
 3860 
 3861     __ ret(lr);
 3862 
 3863     __ bind(keys);
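           // The four SHA-1 round constants K0..K3, one per group of 20 rounds.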
 3864     __ emit_int32(0x5a827999);
 3865     __ emit_int32(0x6ed9eba1);
 3866     __ emit_int32(0x8f1bbcdc);
 3867     __ emit_int32(0xca62c1d6);
 3868 
 3869     return start;
 3870   }
 3871 
 3872 
 3873   // Arguments:
 3874   //
 3875   // Inputs:
 3876   //   c_rarg0   - byte[]  source+offset
 3877   //   c_rarg1   - int[]   SHA.state
 3878   //   c_rarg2   - int     offset
 3879   //   c_rarg3   - int     limit
 3880   //
 3881   address generate_sha256_implCompress(StubId stub_id) {
 3882     bool multi_block;
 3883     switch (stub_id) {
 3884     case StubId::stubgen_sha256_implCompress_id:
 3885       multi_block = false;
 3886       break;
 3887     case StubId::stubgen_sha256_implCompressMB_id:
 3888       multi_block = true;
 3889       break;
 3890     default:
 3891       ShouldNotReachHere();
 3892     }
 3893 
 3894     static const uint32_t round_consts[64] = {
 3895       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3896       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3897       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3898       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3899       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3900       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3901       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3902       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3903       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3904       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3905       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3906       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3907       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3908       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3909       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3910       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3911     };
 3912 
 3913     __ align(CodeEntryAlignment);
 3914 
 3915     StubCodeMark mark(this, stub_id);
 3916     address start = __ pc();
 3917 
 3918     Register buf   = c_rarg0;
 3919     Register state = c_rarg1;
 3920     Register ofs   = c_rarg2;
 3921     Register limit = c_rarg3;
 3922 
 3923     Label sha1_loop;
 3924 
 3925     __ stpd(v8, v9, __ pre(sp, -32));
 3926     __ stpd(v10, v11, Address(sp, 16));
 3927 
  3928     // dga == v0
  3929     // dgb == v1
  3930     // dg0 == v2
  3931     // dg1 == v3
  3932     // dg2 == v4
  3933     // t0 == v6
  3934     // t1 == v7
 3935 
 3936     // load 16 keys to v16..v31
 3937     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3938     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3939     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3940     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3941     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3942 
 3943     // load 8 words (256 bits) state
 3944     __ ldpq(v0, v1, state);
 3945 
 3946     __ BIND(sha1_loop);
 3947     // load 64 bytes of data into v8..v11
 3948     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3949     __ rev32(v8, __ T16B, v8);
 3950     __ rev32(v9, __ T16B, v9);
 3951     __ rev32(v10, __ T16B, v10);
 3952     __ rev32(v11, __ T16B, v11);
 3953 
 3954     __ addv(v6, __ T4S, v8, v16);
 3955     __ orr(v2, __ T16B, v0, v0);
 3956     __ orr(v3, __ T16B, v1, v1);
 3957 
 3958     FloatRegister d0 = v8;
 3959     FloatRegister d1 = v9;
 3960     FloatRegister d2 = v10;
 3961     FloatRegister d3 = v11;
 3962 
 3963 
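           // 16 iterations of four SHA-256 rounds each. Each pass pre-adds the round
           // constants for the following pass (the first sum was formed just above the
           // loop), and sha256su0/su1 extend the message schedule only while later
           // rounds still need fresh words.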
 3964     for (int round = 0; round < 16; round++) {
 3965       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 3966       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 3967       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 3968       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 3969 
 3970       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 3971        __ orr(v4, __ T16B, v2, v2);
 3972       if (round < 15)
 3973         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 3974       __ sha256h(v2, __ T4S, v3, tmp2);
 3975       __ sha256h2(v3, __ T4S, v4, tmp2);
 3976       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 3977 
 3978       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3979     }
 3980 
 3981     __ addv(v0, __ T4S, v0, v2);
 3982     __ addv(v1, __ T4S, v1, v3);
 3983 
 3984     if (multi_block) {
 3985       __ add(ofs, ofs, 64);
 3986       __ cmp(ofs, limit);
 3987       __ br(Assembler::LE, sha1_loop);
 3988       __ mov(c_rarg0, ofs); // return ofs
 3989     }
 3990 
 3991     __ ldpd(v10, v11, Address(sp, 16));
 3992     __ ldpd(v8, v9, __ post(sp, 32));
 3993 
 3994     __ stpq(v0, v1, state);
 3995 
 3996     __ ret(lr);
 3997 
 3998     return start;
 3999   }
 4000 
 4001   // Double rounds for sha512.
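         // Each call processes two of the 80 rounds: vrc0 holds the current pair of
         // round constants while the pair needed four double-rounds later is
         // preloaded into vrc1 (the last four pairs are already resident, hence
         // dr < 36). Message-schedule extension via sha512su0/su1 stops once all 80
         // schedule words have been produced (dr < 32).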
 4002   void sha512_dround(int dr,
 4003                      FloatRegister vi0, FloatRegister vi1,
 4004                      FloatRegister vi2, FloatRegister vi3,
 4005                      FloatRegister vi4, FloatRegister vrc0,
 4006                      FloatRegister vrc1, FloatRegister vin0,
 4007                      FloatRegister vin1, FloatRegister vin2,
 4008                      FloatRegister vin3, FloatRegister vin4) {
 4009       if (dr < 36) {
 4010         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4011       }
 4012       __ addv(v5, __ T2D, vrc0, vin0);
 4013       __ ext(v6, __ T16B, vi2, vi3, 8);
 4014       __ ext(v5, __ T16B, v5, v5, 8);
 4015       __ ext(v7, __ T16B, vi1, vi2, 8);
 4016       __ addv(vi3, __ T2D, vi3, v5);
 4017       if (dr < 32) {
 4018         __ ext(v5, __ T16B, vin3, vin4, 8);
 4019         __ sha512su0(vin0, __ T2D, vin1);
 4020       }
 4021       __ sha512h(vi3, __ T2D, v6, v7);
 4022       if (dr < 32) {
 4023         __ sha512su1(vin0, __ T2D, vin2, v5);
 4024       }
 4025       __ addv(vi4, __ T2D, vi1, vi3);
 4026       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4027   }
 4028 
 4029   // Arguments:
 4030   //
 4031   // Inputs:
 4032   //   c_rarg0   - byte[]  source+offset
 4033   //   c_rarg1   - int[]   SHA.state
 4034   //   c_rarg2   - int     offset
 4035   //   c_rarg3   - int     limit
 4036   //
 4037   address generate_sha512_implCompress(StubId stub_id) {
 4038     bool multi_block;
 4039     switch (stub_id) {
 4040     case StubId::stubgen_sha512_implCompress_id:
 4041       multi_block = false;
 4042       break;
 4043     case StubId::stubgen_sha512_implCompressMB_id:
 4044       multi_block = true;
 4045       break;
 4046     default:
 4047       ShouldNotReachHere();
 4048     }
 4049 
 4050     static const uint64_t round_consts[80] = {
 4051       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 4052       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 4053       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 4054       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 4055       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 4056       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 4057       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 4058       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 4059       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 4060       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 4061       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 4062       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 4063       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 4064       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 4065       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 4066       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 4067       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 4068       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 4069       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 4070       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 4071       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 4072       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 4073       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 4074       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 4075       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 4076       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 4077       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 4078     };
 4079 
 4080     __ align(CodeEntryAlignment);
 4081 
 4082     StubCodeMark mark(this, stub_id);
 4083     address start = __ pc();
 4084 
 4085     Register buf   = c_rarg0;
 4086     Register state = c_rarg1;
 4087     Register ofs   = c_rarg2;
 4088     Register limit = c_rarg3;
 4089 
 4090     __ stpd(v8, v9, __ pre(sp, -64));
 4091     __ stpd(v10, v11, Address(sp, 16));
 4092     __ stpd(v12, v13, Address(sp, 32));
 4093     __ stpd(v14, v15, Address(sp, 48));
 4094 
 4095     Label sha512_loop;
 4096 
 4097     // load state
 4098     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4099 
 4100     // load first 4 round constants
 4101     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4102     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4103 
 4104     __ BIND(sha512_loop);
 4105     // load 128B of data into v12..v19
 4106     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4107     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4108     __ rev64(v12, __ T16B, v12);
 4109     __ rev64(v13, __ T16B, v13);
 4110     __ rev64(v14, __ T16B, v14);
 4111     __ rev64(v15, __ T16B, v15);
 4112     __ rev64(v16, __ T16B, v16);
 4113     __ rev64(v17, __ T16B, v17);
 4114     __ rev64(v18, __ T16B, v18);
 4115     __ rev64(v19, __ T16B, v19);
 4116 
 4117     __ mov(rscratch2, rscratch1);
 4118 
 4119     __ mov(v0, __ T16B, v8);
 4120     __ mov(v1, __ T16B, v9);
 4121     __ mov(v2, __ T16B, v10);
 4122     __ mov(v3, __ T16B, v11);
 4123 
 4124     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4125     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4126     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4127     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4128     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4129     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4130     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4131     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4132     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4133     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4134     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4135     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4136     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4137     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4138     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4139     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4140     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4141     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4142     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4143     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4144     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4145     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4146     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4147     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4148     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4149     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4150     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4151     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4152     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4153     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4154     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4155     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4156     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4157     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4158     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4159     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4160     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4161     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4162     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4163     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4164 
 4165     __ addv(v8, __ T2D, v8, v0);
 4166     __ addv(v9, __ T2D, v9, v1);
 4167     __ addv(v10, __ T2D, v10, v2);
 4168     __ addv(v11, __ T2D, v11, v3);
 4169 
 4170     if (multi_block) {
 4171       __ add(ofs, ofs, 128);
 4172       __ cmp(ofs, limit);
 4173       __ br(Assembler::LE, sha512_loop);
 4174       __ mov(c_rarg0, ofs); // return ofs
 4175     }
 4176 
 4177     __ st1(v8, v9, v10, v11, __ T2D, state);
 4178 
 4179     __ ldpd(v14, v15, Address(sp, 48));
 4180     __ ldpd(v12, v13, Address(sp, 32));
 4181     __ ldpd(v10, v11, Address(sp, 16));
 4182     __ ldpd(v8, v9, __ post(sp, 64));
 4183 
 4184     __ ret(lr);
 4185 
 4186     return start;
 4187   }
 4188 
 4189   // Execute one round of Keccak over two computations in parallel.
 4190   // One of the states should be loaded into the lower halves of
 4191   // the vector registers v0-v24, the other should be loaded into
 4192   // the upper halves of those registers. The ld1r instruction loads
 4193   // the round constant into both halves of register v31.
 4194   // Intermediate results c0...c4 and d0...d4 are computed
 4195   // in registers v25...v30.
 4196   // All vector instructions that are used operate on both register
 4197   // halves in parallel.
 4198   // If only a single computation is needed, it suffices to load only the lower halves.
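        //
        // For reference only, here is a scalar sketch of one Keccak-f[1600] round
        // over a lane array a[25] (lane index = x + 5*y). keccak_round_ref and
        // rotl64 are hypothetical helpers, not part of this file; the rotation
        // table is the standard rho offset table r[x][y]:
        //
        //   static inline uint64_t rotl64(uint64_t v, int n) {
        //     return (v << n) | (v >> ((64 - n) & 63));   // also safe for n == 0
        //   }
        //
        //   static void keccak_round_ref(uint64_t a[25], uint64_t rc) {
        //     uint64_t c[5], d[5], b[25];
        //     for (int x = 0; x < 5; x++)                 // theta
        //       c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
        //     for (int x = 0; x < 5; x++)
        //       d[x] = c[(x + 4) % 5] ^ rotl64(c[(x + 1) % 5], 1);
        //     static const int r[5][5] = {                // rho rotation offsets
        //       { 0, 36,  3, 41, 18}, { 1, 44, 10, 45,  2}, {62,  6, 43, 15, 61},
        //       {28, 55, 25, 21, 56}, {27, 20, 39,  8, 14}
        //     };
        //     for (int x = 0; x < 5; x++)                 // rho + pi
        //       for (int y = 0; y < 5; y++)
        //         b[y + 5 * ((2 * x + 3 * y) % 5)] = rotl64(a[x + 5 * y] ^ d[x], r[x][y]);
        //     for (int y = 0; y < 5; y++)                 // chi
        //       for (int x = 0; x < 5; x++)
        //         a[x + 5 * y] = b[x + 5 * y] ^ (~b[(x + 1) % 5 + 5 * y] & b[(x + 2) % 5 + 5 * y]);
        //     a[0] ^= rc;                                 // iota
        //   }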
 4199   void keccak_round(Register rscratch1) {
 4200   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
 4201   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
 4202   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4203   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4204   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4205   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4206   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4207   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4208   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4209   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4210 
 4211   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4212   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4213   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4214   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4215   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4216 
 4217   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4218   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
 4219   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4220   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4221   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4222   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4223   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4224   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4225   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4226   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4227   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4228   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4229   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4230   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4231   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4232   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4233   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4234   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4235   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4236   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4237   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4238   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4239   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4240   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4241   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4242 
 4243   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4244   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4245   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4246   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4247   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4248 
 4249   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4250 
 4251   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4252   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4253   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4254   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4255   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4256 
 4257   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4258   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4259   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4260   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4261   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4262 
 4263   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4264   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4265   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4266   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4267   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4268 
 4269   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4270   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4271   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4272   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4273   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4274 
 4275   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4276   }
 4277 
 4278   // Arguments:
 4279   //
 4280   // Inputs:
 4281   //   c_rarg0   - byte[]  source+offset
 4282   //   c_rarg1   - byte[]  SHA.state
 4283   //   c_rarg2   - int     block_size
 4284   //   c_rarg3   - int     offset
 4285   //   c_rarg4   - int     limit
 4286   //
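        // The block_size argument is the sponge rate in bytes; the values handled
        // below are 72 (SHA3-512), 104 (SHA3-384), 136 (SHA3-256 or SHAKE256),
        // 144 (SHA3-224) and 168 (SHAKE128).
        //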
 4287   address generate_sha3_implCompress(StubId stub_id) {
 4288     bool multi_block;
 4289     switch (stub_id) {
 4290     case StubId::stubgen_sha3_implCompress_id:
 4291       multi_block = false;
 4292       break;
 4293     case StubId::stubgen_sha3_implCompressMB_id:
 4294       multi_block = true;
 4295       break;
 4296     default:
 4297       ShouldNotReachHere();
 4298     }
 4299 
 4300     static const uint64_t round_consts[24] = {
 4301       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4302       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4303       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4304       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4305       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4306       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4307       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4308       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4309     };
 4310 
 4311     __ align(CodeEntryAlignment);
 4312 
 4313     StubCodeMark mark(this, stub_id);
 4314     address start = __ pc();
 4315 
 4316     Register buf           = c_rarg0;
 4317     Register state         = c_rarg1;
 4318     Register block_size    = c_rarg2;
 4319     Register ofs           = c_rarg3;
 4320     Register limit         = c_rarg4;
 4321 
 4322     Label sha3_loop, rounds24_loop;
 4323     Label sha3_512_or_sha3_384, shake128;
 4324 
 4325     __ stpd(v8, v9, __ pre(sp, -64));
 4326     __ stpd(v10, v11, Address(sp, 16));
 4327     __ stpd(v12, v13, Address(sp, 32));
 4328     __ stpd(v14, v15, Address(sp, 48));
 4329 
 4330     // load state
 4331     __ add(rscratch1, state, 32);
 4332     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4333     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4334     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4335     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4336     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4337     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4338     __ ld1(v24, __ T1D, rscratch1);
 4339 
 4340     __ BIND(sha3_loop);
 4341 
 4342     // 24 keccak rounds
 4343     __ movw(rscratch2, 24);
 4344 
 4345     // load round_constants base
 4346     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4347 
 4348     // load input
 4349     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4350     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4351     __ eor(v0, __ T8B, v0, v25);
 4352     __ eor(v1, __ T8B, v1, v26);
 4353     __ eor(v2, __ T8B, v2, v27);
 4354     __ eor(v3, __ T8B, v3, v28);
 4355     __ eor(v4, __ T8B, v4, v29);
 4356     __ eor(v5, __ T8B, v5, v30);
 4357     __ eor(v6, __ T8B, v6, v31);
 4358 
 4359     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4360     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4361 
 4362     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4363     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4364     __ eor(v7, __ T8B, v7, v25);
 4365     __ eor(v8, __ T8B, v8, v26);
 4366     __ eor(v9, __ T8B, v9, v27);
 4367     __ eor(v10, __ T8B, v10, v28);
 4368     __ eor(v11, __ T8B, v11, v29);
 4369     __ eor(v12, __ T8B, v12, v30);
 4370     __ eor(v13, __ T8B, v13, v31);
 4371 
 4372     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4373     __ eor(v14, __ T8B, v14, v25);
 4374     __ eor(v15, __ T8B, v15, v26);
 4375     __ eor(v16, __ T8B, v16, v27);
 4376 
 4377     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4378     __ andw(c_rarg5, block_size, 48);
 4379     __ cbzw(c_rarg5, rounds24_loop);
 4380 
 4381     __ tbnz(block_size, 5, shake128);
 4382     // block_size == 144, bit5 == 0, SHA3-224
 4383     __ ldrd(v28, __ post(buf, 8));
 4384     __ eor(v17, __ T8B, v17, v28);
 4385     __ b(rounds24_loop);
 4386 
 4387     __ BIND(shake128);
 4388     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4389     __ eor(v17, __ T8B, v17, v28);
 4390     __ eor(v18, __ T8B, v18, v29);
 4391     __ eor(v19, __ T8B, v19, v30);
 4392     __ eor(v20, __ T8B, v20, v31);
 4393     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4394 
 4395     __ BIND(sha3_512_or_sha3_384);
 4396     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4397     __ eor(v7, __ T8B, v7, v25);
 4398     __ eor(v8, __ T8B, v8, v26);
 4399     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4400 
 4401     // SHA3-384
 4402     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4403     __ eor(v9,  __ T8B, v9,  v27);
 4404     __ eor(v10, __ T8B, v10, v28);
 4405     __ eor(v11, __ T8B, v11, v29);
 4406     __ eor(v12, __ T8B, v12, v30);
 4407 
 4408     __ BIND(rounds24_loop);
 4409     __ subw(rscratch2, rscratch2, 1);
 4410 
 4411     keccak_round(rscratch1);
 4412 
 4413     __ cbnzw(rscratch2, rounds24_loop);
 4414 
 4415     if (multi_block) {
 4416       __ add(ofs, ofs, block_size);
 4417       __ cmp(ofs, limit);
 4418       __ br(Assembler::LE, sha3_loop);
 4419       __ mov(c_rarg0, ofs); // return ofs
 4420     }
 4421 
 4422     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4423     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4424     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4425     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4426     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4427     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4428     __ st1(v24, __ T1D, state);
 4429 
 4430     // restore callee-saved registers
 4431     __ ldpd(v14, v15, Address(sp, 48));
 4432     __ ldpd(v12, v13, Address(sp, 32));
 4433     __ ldpd(v10, v11, Address(sp, 16));
 4434     __ ldpd(v8, v9, __ post(sp, 64));
 4435 
 4436     __ ret(lr);
 4437 
 4438     return start;
 4439   }
 4440 
 4441   // Inputs:
 4442   //   c_rarg0   - long[]  state0
 4443   //   c_rarg1   - long[]  state1
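        //   Each argument is a 25-element Keccak lane array; state0 is loaded into
        //   the lower 64-bit halves of v0-v24 and state1 into the upper halves.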
 4444   address generate_double_keccak() {
 4445     static const uint64_t round_consts[24] = {
 4446       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4447       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4448       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4449       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4450       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4451       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4452       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4453       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4454     };
 4455 
 4456     // Implements the double_keccak() method of the
 4457     // sun.security.provider.SHA3Parallel class
 4458     __ align(CodeEntryAlignment);
 4459     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4460     address start = __ pc();
 4461     __ enter();
 4462 
 4463     Register state0        = c_rarg0;
 4464     Register state1        = c_rarg1;
 4465 
 4466     Label rounds24_loop;
 4467 
 4468     // save callee-saved registers
 4469     __ stpd(v8, v9, __ pre(sp, -64));
 4470     __ stpd(v10, v11, Address(sp, 16));
 4471     __ stpd(v12, v13, Address(sp, 32));
 4472     __ stpd(v14, v15, Address(sp, 48));
 4473 
 4474     // load states
 4475     __ add(rscratch1, state0, 32);
 4476     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4477     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4478     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4479     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4480     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4481     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4482     __ ld1(v24, __ D, 0, rscratch1);
 4483     __ add(rscratch1, state1, 32);
 4484     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4485     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4486     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4487     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4488     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4489     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4490     __ ld1(v24, __ D, 1, rscratch1);
 4491 
 4492     // 24 keccak rounds
 4493     __ movw(rscratch2, 24);
 4494 
 4495     // load round_constants base
 4496     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4497 
 4498     __ BIND(rounds24_loop);
 4499     __ subw(rscratch2, rscratch2, 1);
 4500     keccak_round(rscratch1);
 4501     __ cbnzw(rscratch2, rounds24_loop);
 4502 
 4503     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4504     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4505     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4506     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4507     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4508     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4509     __ st1(v24, __ D, 0, state0);
 4510     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4511     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4512     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4513     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4514     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4515     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4516     __ st1(v24, __ D, 1, state1);
 4517 
 4518     // restore callee-saved vector registers
 4519     __ ldpd(v14, v15, Address(sp, 48));
 4520     __ ldpd(v12, v13, Address(sp, 32));
 4521     __ ldpd(v10, v11, Address(sp, 16));
 4522     __ ldpd(v8, v9, __ post(sp, 64));
 4523 
 4524     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4525     __ mov(r0, zr); // return 0
 4526     __ ret(lr);
 4527 
 4528     return start;
 4529   }
 4530 
 4531   // ChaCha20 block function.  This version parallelizes the 32-bit
 4532   // state elements on each of 16 vectors, producing 4 blocks of
 4533   // keystream at a time.
 4534   //
 4535   // state (int[16]) = c_rarg0
 4536   // keystream (byte[256]) = c_rarg1
 4537   // return - number of bytes of produced keystream (always 256)
 4538   //
 4539   // This implementation takes each 32-bit integer from the state
 4540   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4541   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4542   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4543   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4544   // 2.3.  However, instead of sequentially processing the add/xor/rotate
 4545   // operations of each QUARTERROUND invocation, we instead stack all
 4546   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4547   // and then do the same for the second set of 4 quarter rounds.  This removes
 4548   // some latency that would otherwise be incurred by waiting for an add to
 4549   // complete before performing an xor (which depends on the result of the
 4550   // add), etc. An adjustment happens between the first and second groups of 4
 4551   // quarter rounds, but this is done only in the inputs to the macro functions
 4552   // that generate the assembly instructions - these adjustments themselves are
 4553   // not part of the resulting assembly.
 4554   // The 4 registers v0-v3 are used during the quarter round operations as
 4555   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4556   // registers become the vectors involved in adding the start state back onto
 4557   // the post-QR working state.  After the adds are complete, each of the 16
 4558   // vectors writes its first lane back to the keystream buffer, followed
 4559   // by the second lane from all vectors, and so on.
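        //
        // For reference only, this is the scalar quarter round from RFC 7539 that
        // each cc20_qr_add4/xor4/lrot4 bundle below applies to four (a, b, c, d)
        // column or diagonal tuples at once (rotl32 is a hypothetical 32-bit
        // left-rotate helper, not part of this file):
        //
        //   a += b;  d ^= a;  d = rotl32(d, 16);
        //   c += d;  b ^= c;  b = rotl32(b, 12);
        //   a += b;  d ^= a;  d = rotl32(d, 8);
        //   c += d;  b ^= c;  b = rotl32(b, 7);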
 4560   address generate_chacha20Block_blockpar() {
 4561     Label L_twoRounds, L_cc20_const;
 4562     // The constant data is broken into two 128-bit segments to be loaded
 4563     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4564     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 4565     // The second 128-bits is a table constant used for 8-bit left rotations.
 4566     __ BIND(L_cc20_const);
 4567     __ emit_int64(0x0000000100000000UL);
 4568     __ emit_int64(0x0000000300000002UL);
 4569     __ emit_int64(0x0605040702010003UL);
 4570     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4571 
 4572     __ align(CodeEntryAlignment);
 4573     StubId stub_id = StubId::stubgen_chacha20Block_id;
 4574     StubCodeMark mark(this, stub_id);
 4575     address start = __ pc();
 4576     __ enter();
 4577 
 4578     int i, j;
 4579     const Register state = c_rarg0;
 4580     const Register keystream = c_rarg1;
 4581     const Register loopCtr = r10;
 4582     const Register tmpAddr = r11;
 4583     const FloatRegister ctrAddOverlay = v28;
 4584     const FloatRegister lrot8Tbl = v29;
 4585 
 4586     // Organize SIMD registers in an array that facilitates
 4587     // putting repetitive opcodes into loop structures.  It is
 4588     // important that each grouping of 4 registers is monotonically
 4589     // increasing to support the requirements of multi-register
 4590     // instructions (e.g. ld4r, st4, etc.)
 4591     const FloatRegister workSt[16] = {
 4592          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4593         v20, v21, v22, v23, v24, v25, v26, v27
 4594     };
 4595 
 4596     // Pull in constant data.  The first 16 bytes are the add overlay
 4597     // which is applied to the vector holding the counter (state[12]).
 4598     // The second 16 bytes is the index register for the 8-bit left
 4599     // rotation tbl instruction.
 4600     __ adr(tmpAddr, L_cc20_const);
 4601     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4602 
 4603     // Load from memory and interlace across 16 SIMD registers,
 4604     // with each word from memory being broadcast to all lanes of
 4605     // each successive SIMD register.
 4606     //      Addr(0) -> All lanes in workSt[i]
 4607     //      Addr(4) -> All lanes workSt[i + 1], etc.
 4608     __ mov(tmpAddr, state);
 4609     for (i = 0; i < 16; i += 4) {
 4610       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4611           __ post(tmpAddr, 16));
 4612     }
 4613     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4614 
 4615     // Before entering the loop, create 5 4-register arrays.  These
 4616     // will hold the 4 registers that represent the a/b/c/d fields
 4617     // in the quarter round operation.  For instance the "b" field
 4618     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4619     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4620     // since it is part of a diagonal organization.  The aSet and scratch
 4621     // register sets are defined at declaration time because they do not change
 4622     // organization at any point during the 20-round processing.
 4623     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4624     FloatRegister bSet[4];
 4625     FloatRegister cSet[4];
 4626     FloatRegister dSet[4];
 4627     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4628 
 4629     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4630     __ mov(loopCtr, 10);
 4631     __ BIND(L_twoRounds);
 4632 
 4633     // Set to columnar organization and do the following 4 quarter-rounds:
 4634     // QUARTERROUND(0, 4, 8, 12)
 4635     // QUARTERROUND(1, 5, 9, 13)
 4636     // QUARTERROUND(2, 6, 10, 14)
 4637     // QUARTERROUND(3, 7, 11, 15)
 4638     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4639     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4640     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4641 
 4642     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4643     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4644     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4645 
 4646     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4647     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4648     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4649 
 4650     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4651     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4652     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4653 
 4654     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4655     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4656     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4657 
 4658     // Set to diagonal organization and do the next 4 quarter-rounds:
 4659     // QUARTERROUND(0, 5, 10, 15)
 4660     // QUARTERROUND(1, 6, 11, 12)
 4661     // QUARTERROUND(2, 7, 8, 13)
 4662     // QUARTERROUND(3, 4, 9, 14)
 4663     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4664     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4665     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4666 
 4667     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4668     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4669     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4670 
 4671     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4672     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4673     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4674 
 4675     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4676     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4677     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4678 
 4679     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4680     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4681     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4682 
 4683     // Decrement and iterate
 4684     __ sub(loopCtr, loopCtr, 1);
 4685     __ cbnz(loopCtr, L_twoRounds);
 4686 
 4687     __ mov(tmpAddr, state);
 4688 
 4689     // Add the starting state back to the post-loop keystream
 4690     // state.  We read/interlace the state array from memory into
 4691     // 4 registers similar to what we did in the beginning.  Then
 4692     // add the counter overlay onto workSt[12] at the end.
 4693     for (i = 0; i < 16; i += 4) {
 4694       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4695       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4696       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4697       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4698       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4699     }
 4700     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4701 
 4702     // Write working state into the keystream buffer.  This is accomplished
 4703     // by taking the lane "i" from each of the four vectors and writing
 4704     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4705     // repeating with the next 4 vectors until all 16 vectors have been used.
 4706     // Then move to the next lane and repeat the process until all lanes have
 4707     // been written.
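          // For example, keystream[0..15] receives lane 0 of workSt[0..3],
          // keystream[16..31] lane 0 of workSt[4..7], and so on; after 64 bytes
          // the same pattern repeats for lanes 1-3, yielding the 4 consecutive
          // 64-byte keystream blocks described above.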
 4708     for (i = 0; i < 4; i++) {
 4709       for (j = 0; j < 16; j += 4) {
 4710         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4711             __ post(keystream, 16));
 4712       }
 4713     }
 4714 
 4715     __ mov(r0, 256);             // Return length of output keystream
 4716     __ leave();
 4717     __ ret(lr);
 4718 
 4719     return start;
 4720   }
 4721 
 4722   // Helpers to schedule parallel operation bundles across vector
 4723   // register sequences of size 2, 4 or 8.
 4724 
 4725   // Implement various primitive computations across vector sequences
 4726 
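        // For illustration only (hypothetical register choices): with
        // VSeq<4> va(4), vb(8), vc(12), a call such as vs_addv(va, __ T8H, vb, vc)
        // expands to
        //   addv(v4, T8H, v8, v12); addv(v5, T8H, v9, v13);
        //   addv(v6, T8H, v10, v14); addv(v7, T8H, v11, v15);
        // and the other vs_* helpers below expand in the same
        // one-instruction-per-sequence-element fashion.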
 4727   template<int N>
 4728   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4729                const VSeq<N>& v1, const VSeq<N>& v2) {
 4730     // output must not be constant
 4731     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4732     // output cannot overwrite pending inputs
 4733     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4734     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4735     for (int i = 0; i < N; i++) {
 4736       __ addv(v[i], T, v1[i], v2[i]);
 4737     }
 4738   }
 4739 
 4740   template<int N>
 4741   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4742                const VSeq<N>& v1, const VSeq<N>& v2) {
 4743     // output must not be constant
 4744     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4745     // output cannot overwrite pending inputs
 4746     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4747     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4748     for (int i = 0; i < N; i++) {
 4749       __ subv(v[i], T, v1[i], v2[i]);
 4750     }
 4751   }
 4752 
 4753   template<int N>
 4754   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4755                const VSeq<N>& v1, const VSeq<N>& v2) {
 4756     // output must not be constant
 4757     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4758     // output cannot overwrite pending inputs
 4759     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4760     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4761     for (int i = 0; i < N; i++) {
 4762       __ mulv(v[i], T, v1[i], v2[i]);
 4763     }
 4764   }
 4765 
 4766   template<int N>
 4767   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4768     // output must not be constant
 4769     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4770     // output cannot overwrite pending inputs
 4771     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4772     for (int i = 0; i < N; i++) {
 4773       __ negr(v[i], T, v1[i]);
 4774     }
 4775   }
 4776 
 4777   template<int N>
 4778   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4779                const VSeq<N>& v1, int shift) {
 4780     // output must not be constant
 4781     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4782     // output cannot overwrite pending inputs
 4783     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4784     for (int i = 0; i < N; i++) {
 4785       __ sshr(v[i], T, v1[i], shift);
 4786     }
 4787   }
 4788 
 4789   template<int N>
 4790   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4791     // output must not be constant
 4792     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4793     // output cannot overwrite pending inputs
 4794     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4795     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4796     for (int i = 0; i < N; i++) {
 4797       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4798     }
 4799   }
 4800 
 4801   template<int N>
 4802   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4803     // output must not be constant
 4804     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4805     // output cannot overwrite pending inputs
 4806     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4807     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4808     for (int i = 0; i < N; i++) {
 4809       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4810     }
 4811   }
 4812 
 4813   template<int N>
 4814   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4815     // output must not be constant
 4816     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4817     // output cannot overwrite pending inputs
 4818     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4819     for (int i = 0; i < N; i++) {
 4820       __ notr(v[i], __ T16B, v1[i]);
 4821     }
 4822   }
 4823 
 4824   template<int N>
 4825   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4826     // output must not be constant
 4827     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4828     // output cannot overwrite pending inputs
 4829     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4830     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4831     for (int i = 0; i < N; i++) {
 4832       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4833     }
 4834   }
 4835 
 4836   template<int N>
 4837   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4838     // output must not be constant
 4839     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4840     // output cannot overwrite pending inputs
 4841     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4842     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4843     for (int i = 0; i < N; i++) {
 4844       __ mlsv(v[i], T, v1[i], v2[i]);
 4845     }
 4846   }
 4847 
 4848   // load N/2 successive pairs of quadword values from memory in order
 4849   // into N successive vector registers of the sequence via the
 4850   // address supplied in base.
 4851   template<int N>
 4852   void vs_ldpq(const VSeq<N>& v, Register base) {
 4853     for (int i = 0; i < N; i += 2) {
 4854       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4855     }
 4856   }
 4857 
 4858   // load N/2 successive pairs of quadword values from memory in order
 4859   // into N vector registers of the sequence via the address supplied
 4860   // in base using post-increment addressing
 4861   template<int N>
 4862   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4863     static_assert((N & 1) == 0, "sequence length must be even");
 4864     for (int i = 0; i < N; i += 2) {
 4865       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4866     }
 4867   }
 4868 
 4869   // store N successive vector registers of the sequence into N/2
 4870   // successive pairs of quadword memory locations via the address
 4871   // supplied in base using post-increment addressing
 4872   template<int N>
 4873   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4874     static_assert((N & 1) == 0, "sequence length must be even");
 4875     for (int i = 0; i < N; i += 2) {
 4876       __ stpq(v[i], v[i+1], __ post(base, 32));
 4877     }
 4878   }
 4879 
 4880   // load N/2 pairs of quadword values from memory de-interleaved into
 4881   // N vector registers 2 at a time via the address supplied in base
 4882   // using post-increment addressing.
 4883   template<int N>
 4884   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4885     static_assert((N & 1) == 0, "sequence length must be even");
 4886     for (int i = 0; i < N; i += 2) {
 4887       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4888     }
 4889   }
 4890 
 4891   // store N vector registers interleaved into N/2 pairs of quadword
 4892   // memory locations via the address supplied in base using
 4893   // post-increment addressing.
 4894   template<int N>
 4895   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4896     static_assert((N & 1) == 0, "sequence length must be even");
 4897     for (int i = 0; i < N; i += 2) {
 4898       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4899     }
 4900   }
 4901 
 4902   // load N quadword values from memory de-interleaved into N vector
 4903   // registers 3 elements at a time via the address supplied in base.
 4904   template<int N>
 4905   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4906     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4907     for (int i = 0; i < N; i += 3) {
 4908       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4909     }
 4910   }
 4911 
 4912   // load N quadword values from memory de-interleaved into N vector
 4913   // registers 3 elements at a time via the address supplied in base
 4914   // using post-increment addressing.
 4915   template<int N>
 4916   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4917     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4918     for (int i = 0; i < N; i += 3) {
 4919       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4920     }
 4921   }
 4922 
 4923   // load N/2 pairs of quadword values from memory into N vector
 4924   // registers via the address supplied in base with each pair indexed
 4925   // using the start offset plus the corresponding entry in the
 4926   // offsets array
 4927   template<int N>
 4928   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4929     for (int i = 0; i < N/2; i++) {
 4930       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4931     }
 4932   }
 4933 
 4934   // store N vector registers into N/2 pairs of quadword memory
 4935   // locations via the address supplied in base with each pair indexed
 4936   // using the start offset plus the corresponding entry in the
 4937   // offsets array
 4938   template<int N>
 4939   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4940     for (int i = 0; i < N/2; i++) {
 4941       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4942     }
 4943   }
 4944 
 4945   // load N single quadword values from memory into N vector registers
 4946   // via the address supplied in base with each value indexed using
 4947   // the start offset plus the corresponding entry in the offsets
 4948   // array
 4949   template<int N>
 4950   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4951                       int start, int (&offsets)[N]) {
 4952     for (int i = 0; i < N; i++) {
 4953       __ ldr(v[i], T, Address(base, start + offsets[i]));
 4954     }
 4955   }
 4956 
 4957   // store N vector registers into N single quadword memory locations
 4958   // via the address supplied in base with each value indexed using
 4959   // the start offset plus the corresponding entry in the offsets
 4960   // array
 4961   template<int N>
 4962   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4963                       int start, int (&offsets)[N]) {
 4964     for (int i = 0; i < N; i++) {
 4965       __ str(v[i], T, Address(base, start + offsets[i]));
 4966     }
 4967   }
 4968 
 4969   // load N/2 pairs of quadword values from memory de-interleaved into
 4970   // N vector registers 2 at a time via the address supplied in base
 4971   // with each pair indexed using the start offset plus the
 4972   // corresponding entry in the offsets array
 4973   template<int N>
 4974   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4975                       Register tmp, int start, int (&offsets)[N/2]) {
 4976     for (int i = 0; i < N/2; i++) {
 4977       __ add(tmp, base, start + offsets[i]);
 4978       __ ld2(v[2*i], v[2*i+1], T, tmp);
 4979     }
 4980   }
 4981 
 4982   // store N vector registers 2 at a time interleaved into N/2 pairs
 4983   // of quadword memory locations via the address supplied in base
 4984   // with each pair indexed using the start offset plus the
 4985   // corresponding entry in the offsets array
 4986   template<int N>
 4987   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4988                       Register tmp, int start, int (&offsets)[N/2]) {
 4989     for (int i = 0; i < N/2; i++) {
 4990       __ add(tmp, base, start + offsets[i]);
 4991       __ st2(v[2*i], v[2*i+1], T, tmp);
 4992     }
 4993   }
 4994 
 4995   // Helper routines for various flavours of Montgomery multiply
 4996 
 4997   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 4998   // multiplications in parallel
 4999   //
 5000 
 5001   // See the montMul() method of the sun.security.provider.ML_DSA
 5002   // class.
 5003   //
 5004   // Computes 4x4S results or 4x8H results
 5005   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5006   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5007   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5008   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5009   // Outputs: va - 4x4S or 4x8H vector register sequences
 5010   // vb, vc, vtmp and vq must all be disjoint
 5011   // va must be disjoint from all other inputs/temps or must equal vc
 5012   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5013   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
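        //
        // As a scalar sketch of what each lane computes (a hypothetical helper,
        // shown for the 16-bit/8H case and ignoring the saturation applied by
        // sqdmulh; not part of this file):
        //
        //   static int16_t montmul16_ref(int16_t b, int16_t c, int16_t q, int16_t qinv) {
        //     int16_t aHigh = (int16_t)(((int64_t)2 * b * c) >> 16);   // sqdmulh
        //     int16_t aLow  = (int16_t)((int32_t)b * c);               // mulv (low 16 bits)
        //     int16_t m     = (int16_t)(aLow * qinv);                  // m = aLow * qinv
        //     int16_t n     = (int16_t)(((int64_t)2 * m * q) >> 16);   // n = hi16(2 * m * q)
        //     return (int16_t)((aHigh - n) >> 1);                      // shsubv: (aHigh - n) / 2
        //   }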
 5014   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5015                    Assembler::SIMD_Arrangement T,
 5016                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5017     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5018     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5019     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5020     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5021 
 5022     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5023     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5024 
 5025     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5026 
 5027     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5028     assert(vs_disjoint(va, vb), "va and vb overlap");
 5029     assert(vs_disjoint(va, vq), "va and vq overlap");
 5030     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5031     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5032 
 5033     // schedule 4 streams of instructions across the vector sequences
 5034     for (int i = 0; i < 4; i++) {
 5035       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5036       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5037     }
 5038 
 5039     for (int i = 0; i < 4; i++) {
 5040       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5041     }
 5042 
 5043     for (int i = 0; i < 4; i++) {
 5044       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5045     }
 5046 
 5047     for (int i = 0; i < 4; i++) {
 5048       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5049     }
 5050   }
 5051 
 5052   // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
 5053   // multiplications in parallel
 5054   //
 5055 
 5056   // See the montMul() method of the sun.security.provider.ML_DSA
 5057   // class.
 5058   //
 5059   // Computes 2x4S results or 2x8H results
 5060   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5061   // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
 5062   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5063   // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
 5064   // Outputs: va - 2x4S or 2x8H vector register sequences
 5065   // vb, vc, vtmp and vq must all be disjoint
 5066   // va must be disjoint from all other inputs/temps or must equal vc
 5067   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5068   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5069   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5070                    Assembler::SIMD_Arrangement T,
 5071                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5072     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5073     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5074     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5075     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5076 
 5077     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5078     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5079 
 5080     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5081 
 5082     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5083     assert(vs_disjoint(va, vb), "va and vb overlap");
 5084     assert(vs_disjoint(va, vq), "va and vq overlap");
 5085     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5086     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5087 
 5088     // schedule 2 streams of instructions across the vector sequences
 5089     for (int i = 0; i < 2; i++) {
 5090       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5091       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5092     }
 5093 
 5094     for (int i = 0; i < 2; i++) {
 5095       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5096     }
 5097 
 5098     for (int i = 0; i < 2; i++) {
 5099       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5100     }
 5101 
 5102     for (int i = 0; i < 2; i++) {
 5103       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5104     }
 5105   }
 5106 
 5107   // Perform 16 16-bit Montgomery multiplications in parallel.
 5108   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5109                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5110     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5111     // It will assert that the register use is valid
 5112     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5113   }
 5114 
 5115   // Perform 32 16-bit Montgomery multiplications in parallel.
 5116   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5117                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5118     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5119     // It will assert that the register use is valid
 5120     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5121   }
 5122 
 5123   // Perform 64 16-bit Montgomery multiplications in parallel.
 5124   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5125                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5126     // Schedule two successive 4x8H multiplies via the montmul helper
 5127     // on the front and back halves of va, vb and vc. The helper will
 5128     // assert that the register use has no overlap conflicts on each
 5129     // individual call but we also need to ensure that the necessary
 5130     // disjoint/equality constraints are met across both calls.
 5131 
 5132     // vb, vc, vtmp and vq must be disjoint. va must either be
 5133     // disjoint from all other registers or equal vc
 5134 
 5135     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5136     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5137     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5138 
 5139     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5140     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5141 
 5142     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5143 
 5144     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5145     assert(vs_disjoint(va, vb), "va and vb overlap");
 5146     assert(vs_disjoint(va, vq), "va and vq overlap");
 5147     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5148 
 5149     // we multiply the front and back halves of each sequence 4 at a
 5150     // time because
 5151     //
 5152     // 1) we are currently only able to get 4-way instruction
 5153     // parallelism at best
 5154     //
 5155     // 2) we need registers for the constants in vq and temporary
 5156   // scratch registers to hold intermediate results, so vtmp can only
 5157   // be a VSeq<4>, which means we only have 4 scratch slots
 5158 
 5159     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5160     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5161   }
 5162 
 5163   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5164                                const VSeq<4>& vc,
 5165                                const VSeq<4>& vtmp,
 5166                                const VSeq<2>& vq) {
 5167     // compute a = montmul(a1, c)
 5168     kyber_montmul32(vc, va1, vc, vtmp, vq);
 5169     // output a1 = a0 - a
 5170     vs_subv(va1, __ T8H, va0, vc);
 5171     //    and a0 = a0 + a
 5172     vs_addv(va0, __ T8H, va0, vc);
 5173   }
 5174 
 5175   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5176                                const VSeq<4>& vb,
 5177                                const VSeq<4>& vtmp1,
 5178                                const VSeq<4>& vtmp2,
 5179                                const VSeq<2>& vq) {
 5180     // compute c = a0 - a1
 5181     vs_subv(vtmp1, __ T8H, va0, va1);
 5182     // output a0 = a0 + a1
 5183     vs_addv(va0, __ T8H, va0, va1);
 5184     // output a1 = b montmul c
 5185     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5186   }
 5187 
 5188   void load64shorts(const VSeq<8>& v, Register shorts) {
 5189     vs_ldpq_post(v, shorts);
 5190   }
 5191 
 5192   void load32shorts(const VSeq<4>& v, Register shorts) {
 5193     vs_ldpq_post(v, shorts);
 5194   }
 5195 
 5196   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5197     vs_stpq_post(v, tmpAddr);
 5198   }
 5199 
 5200   // Kyber NTT function.
 5201   // Implements
 5202   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5203   //
 5204   // coeffs (short[256]) = c_rarg0
 5205   // ntt_zetas (short[256]) = c_rarg1
 5206   address generate_kyberNtt() {
 5207 
 5208     __ align(CodeEntryAlignment);
 5209     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5210     StubCodeMark mark(this, stub_id);
 5211     address start = __ pc();
 5212     __ enter();
 5213 
 5214     const Register coeffs = c_rarg0;
 5215     const Register zetas = c_rarg1;
 5216 
 5217     const Register kyberConsts = r10;
 5218     const Register tmpAddr = r11;
 5219 
 5220     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5221     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5222     VSeq<2> vq(30);                    // n.b. constants overlap vs3
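          // i.e. vs1 = v0-v7, vs2 = v16-v23, vs3 = v24-v31, vtmp = v24-v27 and
          // vq = v30/v31, so any write to vs3 clobbers vq (hence the repeated
          // reloads of vq from kyberConsts below).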
 5223 
 5224     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5225     // load the montmul constants
 5226     vs_ldpq(vq, kyberConsts);
 5227 
 5228     // Each level corresponds to an iteration of the outermost loop of the
 5229     // Java method seilerNTT(int[] coeffs). There are some differences
 5230     // from what is done in the seilerNTT() method, though:
 5231     // 1. The computation uses 16-bit signed values; we do not convert them
 5232     // to ints here.
 5233     // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
 5234     // this array for each level, which makes it easier to fill up the vector
 5235     // registers.
 5236     // 3. The seilerNTT() method uses R = 2^20 for the Montgomery
 5237     // multiplications (so that there is no overflow during the inverse NTT
 5238     // computation); here we use R = 2^16 so that we can use 16-bit
 5239     // arithmetic in the vector unit.
 5240     //
 5241     // On each level, we fill up the vector registers in such a way that the
 5242     // array elements that need to be multiplied by the zetas go into one
 5243     // set of vector registers while the corresponding ones that don't need to
 5244     // be multiplied, go into another set.
 5245     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5246     // registers interleaving the steps of 4 identical computations,
 5247     // each done on 8 16-bit values per register.
 5248 
 5249     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5250     // to the zetas occur in discrete blocks whose size is some multiple
 5251     // of 32.
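          //
          // As a scalar sketch (hypothetical pseudo-code), every level performs the
          // standard NTT butterfly on each pair of related coefficients:
          //
          //   short t = montmul(zeta, coeffs[j + len]);
          //   coeffs[j + len] = (short)(coeffs[j] - t);
          //   coeffs[j]       = (short)(coeffs[j] + t);
          //
          // The montmul64 / vs_subv / vs_addv bundles below carry this out for
          // 64 coefficient pairs at a time.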
 5252 
 5253     // level 0
 5254     __ add(tmpAddr, coeffs, 256);
 5255     load64shorts(vs1, tmpAddr);
 5256     load64shorts(vs2, zetas);
 5257     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5258     __ add(tmpAddr, coeffs, 0);
 5259     load64shorts(vs1, tmpAddr);
 5260     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5261     vs_addv(vs1, __ T8H, vs1, vs2);
 5262     __ add(tmpAddr, coeffs, 0);
 5263     vs_stpq_post(vs1, tmpAddr);
 5264     __ add(tmpAddr, coeffs, 256);
 5265     vs_stpq_post(vs3, tmpAddr);
 5266     // restore montmul constants
 5267     vs_ldpq(vq, kyberConsts);
 5268     load64shorts(vs1, tmpAddr);
 5269     load64shorts(vs2, zetas);
 5270     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5271     __ add(tmpAddr, coeffs, 128);
 5272     load64shorts(vs1, tmpAddr);
 5273     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5274     vs_addv(vs1, __ T8H, vs1, vs2);
 5275     __ add(tmpAddr, coeffs, 128);
 5276     store64shorts(vs1, tmpAddr);
 5277     __ add(tmpAddr, coeffs, 384);
 5278     store64shorts(vs3, tmpAddr);
 5279 
 5280     // level 1
 5281     // restore montmul constants
 5282     vs_ldpq(vq, kyberConsts);
 5283     __ add(tmpAddr, coeffs, 128);
 5284     load64shorts(vs1, tmpAddr);
 5285     load64shorts(vs2, zetas);
 5286     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5287     __ add(tmpAddr, coeffs, 0);
 5288     load64shorts(vs1, tmpAddr);
 5289     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5290     vs_addv(vs1, __ T8H, vs1, vs2);
 5291     __ add(tmpAddr, coeffs, 0);
 5292     store64shorts(vs1, tmpAddr);
 5293     store64shorts(vs3, tmpAddr);
 5294     vs_ldpq(vq, kyberConsts);
 5295     __ add(tmpAddr, coeffs, 384);
 5296     load64shorts(vs1, tmpAddr);
 5297     load64shorts(vs2, zetas);
 5298     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5299     __ add(tmpAddr, coeffs, 256);
 5300     load64shorts(vs1, tmpAddr);
 5301     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5302     vs_addv(vs1, __ T8H, vs1, vs2);
 5303     __ add(tmpAddr, coeffs, 256);
 5304     store64shorts(vs1, tmpAddr);
 5305     store64shorts(vs3, tmpAddr);
 5306 
 5307     // level 2
 5308     vs_ldpq(vq, kyberConsts);
 5309     int offsets1[4] = { 0, 32, 128, 160 };
 5310     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5311     load64shorts(vs2, zetas);
 5312     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5313     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5314     // kyber_subv_addv64();
 5315     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5316     vs_addv(vs1, __ T8H, vs1, vs2);
 5317     __ add(tmpAddr, coeffs, 0);
 5318     vs_stpq_post(vs_front(vs1), tmpAddr);
 5319     vs_stpq_post(vs_front(vs3), tmpAddr);
 5320     vs_stpq_post(vs_back(vs1), tmpAddr);
 5321     vs_stpq_post(vs_back(vs3), tmpAddr);
 5322     vs_ldpq(vq, kyberConsts);
 5323     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5324     load64shorts(vs2, zetas);
 5325     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5326     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5327     // kyber_subv_addv64();
 5328     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5329     vs_addv(vs1, __ T8H, vs1, vs2);
 5330     __ add(tmpAddr, coeffs, 256);
 5331     vs_stpq_post(vs_front(vs1), tmpAddr);
 5332     vs_stpq_post(vs_front(vs3), tmpAddr);
 5333     vs_stpq_post(vs_back(vs1), tmpAddr);
 5334     vs_stpq_post(vs_back(vs3), tmpAddr);
 5335 
 5336     // level 3
 5337     vs_ldpq(vq, kyberConsts);
 5338     int offsets2[4] = { 0, 64, 128, 192 };
 5339     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5340     load64shorts(vs2, zetas);
 5341     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5342     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5343     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5344     vs_addv(vs1, __ T8H, vs1, vs2);
 5345     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5346     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5347 
 5348     vs_ldpq(vq, kyberConsts);
 5349     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5350     load64shorts(vs2, zetas);
 5351     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5352     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5353     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5354     vs_addv(vs1, __ T8H, vs1, vs2);
 5355     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5356     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5357 
 5358     // level 4
 5359     // At level 4 coefficients occur in 8 discrete blocks of size 16
 5360     // so they are loaded using an ldr at 8 distinct offsets.
 5361 
 5362     vs_ldpq(vq, kyberConsts);
 5363     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5364     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5365     load64shorts(vs2, zetas);
 5366     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5367     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5368     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5369     vs_addv(vs1, __ T8H, vs1, vs2);
 5370     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5371     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5372 
 5373     vs_ldpq(vq, kyberConsts);
 5374     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5375     load64shorts(vs2, zetas);
 5376     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5377     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5378     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5379     vs_addv(vs1, __ T8H, vs1, vs2);
 5380     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5381     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5382 
 5383     // level 5
 5384     // At level 5 related coefficients occur in discrete blocks of size 8 so
 5385     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5386 
 5387     vs_ldpq(vq, kyberConsts);
 5388     int offsets4[4] = { 0, 32, 64, 96 };
 5389     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5390     load32shorts(vs_front(vs2), zetas);
 5391     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5392     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5393     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5394     load32shorts(vs_front(vs2), zetas);
 5395     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5396     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5397     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5398     load32shorts(vs_front(vs2), zetas);
 5399     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5400     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5401 
 5402     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5403     load32shorts(vs_front(vs2), zetas);
 5404     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5405     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5406 
 5407     // level 6
 5408     // At level 6 related coefficients occur in discrete blocks of size 4 so
 5409     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5410 
 5411     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5412     load32shorts(vs_front(vs2), zetas);
 5413     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5414     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5415     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5417     load32shorts(vs_front(vs2), zetas);
 5418     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5419     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5420 
 5421     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5422     load32shorts(vs_front(vs2), zetas);
 5423     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5424     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5425 
 5426     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5427     load32shorts(vs_front(vs2), zetas);
 5428     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5429     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5430 
 5431     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5432     __ mov(r0, zr); // return 0
 5433     __ ret(lr);
 5434 
 5435     return start;
 5436   }
 5437 
 5438   // Kyber Inverse NTT function
 5439   // Implements
 5440   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5441   //
 5442   // coeffs (short[256]) = c_rarg0
 5443   // ntt_zetas (short[256]) = c_rarg1
 5444   address generate_kyberInverseNtt() {
 5445 
 5446     __ align(CodeEntryAlignment);
 5447     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 5448     StubCodeMark mark(this, stub_id);
 5449     address start = __ pc();
 5450     __ enter();
 5451 
 5452     const Register coeffs = c_rarg0;
 5453     const Register zetas = c_rarg1;
 5454 
 5455     const Register kyberConsts = r10;
 5456     const Register tmpAddr = r11;
 5457     const Register tmpAddr2 = c_rarg2;
 5458 
 5459     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5460     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5461     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5462 
 5463     __ lea(kyberConsts,
 5464              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5465 
 5466     // level 0
 5467     // At level 0 related coefficients occur in discrete blocks of size 4 so
 5468     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5469 
 5470     vs_ldpq(vq, kyberConsts);
 5471     int offsets4[4] = { 0, 32, 64, 96 };
 5472     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5473     load32shorts(vs_front(vs2), zetas);
 5474     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5475                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5476     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5477     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5478     load32shorts(vs_front(vs2), zetas);
 5479     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5480                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5481     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5482     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5483     load32shorts(vs_front(vs2), zetas);
 5484     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5485                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5486     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5487     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5488     load32shorts(vs_front(vs2), zetas);
 5489     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5490                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5491     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5492 
 5493     // level 1
 5494     // At level 1 related coefficients occur in discrete blocks of size 8 so
 5495     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5496 
 5497     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5498     load32shorts(vs_front(vs2), zetas);
 5499     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5500                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5501     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5502     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5503     load32shorts(vs_front(vs2), zetas);
 5504     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5505                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5506     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5507 
 5508     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5509     load32shorts(vs_front(vs2), zetas);
 5510     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5511                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5512     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5513     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5514     load32shorts(vs_front(vs2), zetas);
 5515     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5516                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5517     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5518 
 5519     // level 2
 5520     // At level 2 coefficients occur in 8 discrete blocks of size 16
 5521     // so they are loaded using an ldr at 8 distinct offsets.
 5522 
 5523     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5524     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5525     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5526     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5527     vs_subv(vs1, __ T8H, vs1, vs2);
 5528     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5529     load64shorts(vs2, zetas);
 5530     vs_ldpq(vq, kyberConsts);
 5531     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5532     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5533 
 5534     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5535     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5536     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5537     vs_subv(vs1, __ T8H, vs1, vs2);
 5538     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5539     load64shorts(vs2, zetas);
 5540     vs_ldpq(vq, kyberConsts);
 5541     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5542     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5543 
 5544     // Barrett reduction at indexes where overflow may happen
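          //
          // Each output coefficient c is computed as c - ((c * mult) >> 26) * q,
          // where mult is the Barrett multiplier loaded just below. The shift
          // by 26 is split across two instructions: sqdmulh yields
          // (2 * c * mult) >> 16 and the following sshr shifts right by a
          // further 11 bits.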
 5545 
 5546     // load q and the multiplier for the Barrett reduction
 5547     __ add(tmpAddr, kyberConsts, 16);
 5548     vs_ldpq(vq, tmpAddr);
 5549 
 5550     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5551     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5552     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
 5553     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5554     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5555     vs_sshr(vs2, __ T8H, vs2, 11);
 5556     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5557     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5558     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5559     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5560     vs_sshr(vs2, __ T8H, vs2, 11);
 5561     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5562     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5563 
 5564     // level 3
 5565     // From level 3 upwards coefficients occur in discrete blocks whose size is
 5566     // some multiple of 32 so they can be loaded using ldpq and suitable indexes.
 5567 
 5568     int offsets2[4] = { 0, 64, 128, 192 };
 5569     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5570     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5571     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5572     vs_subv(vs1, __ T8H, vs1, vs2);
 5573     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5574     load64shorts(vs2, zetas);
 5575     vs_ldpq(vq, kyberConsts);
 5576     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5577     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5578 
 5579     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5580     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5581     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5582     vs_subv(vs1, __ T8H, vs1, vs2);
 5583     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5584     load64shorts(vs2, zetas);
 5585     vs_ldpq(vq, kyberConsts);
 5586     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5587     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5588 
 5589     // level 4
 5590 
 5591     int offsets1[4] = { 0, 32, 128, 160 };
 5592     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5593     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5594     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5595     vs_subv(vs1, __ T8H, vs1, vs2);
 5596     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5597     load64shorts(vs2, zetas);
 5598     vs_ldpq(vq, kyberConsts);
 5599     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5600     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5601 
 5602     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5603     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5604     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5605     vs_subv(vs1, __ T8H, vs1, vs2);
 5606     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5607     load64shorts(vs2, zetas);
 5608     vs_ldpq(vq, kyberConsts);
 5609     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5610     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5611 
 5612     // level 5
 5613 
 5614     __ add(tmpAddr, coeffs, 0);
 5615     load64shorts(vs1, tmpAddr);
 5616     __ add(tmpAddr, coeffs, 128);
 5617     load64shorts(vs2, tmpAddr);
 5618     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5619     vs_subv(vs1, __ T8H, vs1, vs2);
 5620     __ add(tmpAddr, coeffs, 0);
 5621     store64shorts(vs3, tmpAddr);
 5622     load64shorts(vs2, zetas);
 5623     vs_ldpq(vq, kyberConsts);
 5624     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5625     __ add(tmpAddr, coeffs, 128);
 5626     store64shorts(vs2, tmpAddr);
 5627 
 5628     load64shorts(vs1, tmpAddr);
 5629     __ add(tmpAddr, coeffs, 384);
 5630     load64shorts(vs2, tmpAddr);
 5631     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5632     vs_subv(vs1, __ T8H, vs1, vs2);
 5633     __ add(tmpAddr, coeffs, 256);
 5634     store64shorts(vs3, tmpAddr);
 5635     load64shorts(vs2, zetas);
 5636     vs_ldpq(vq, kyberConsts);
 5637     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5638     __ add(tmpAddr, coeffs, 384);
 5639     store64shorts(vs2, tmpAddr);
 5640 
 5641     // Barrett reduction at indexes where overflow may happen
 5642 
 5643     // load q and the multiplier for the Barrett reduction
 5644     __ add(tmpAddr, kyberConsts, 16);
 5645     vs_ldpq(vq, tmpAddr);
 5646 
 5647     int offsets0[2] = { 0, 256 };
 5648     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5649     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5650     vs_sshr(vs2, __ T8H, vs2, 11);
 5651     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5652     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5653 
 5654     // level 6
 5655 
 5656     __ add(tmpAddr, coeffs, 0);
 5657     load64shorts(vs1, tmpAddr);
 5658     __ add(tmpAddr, coeffs, 256);
 5659     load64shorts(vs2, tmpAddr);
 5660     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5661     vs_subv(vs1, __ T8H, vs1, vs2);
 5662     __ add(tmpAddr, coeffs, 0);
 5663     store64shorts(vs3, tmpAddr);
 5664     load64shorts(vs2, zetas);
 5665     vs_ldpq(vq, kyberConsts);
 5666     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5667     __ add(tmpAddr, coeffs, 256);
 5668     store64shorts(vs2, tmpAddr);
 5669 
 5670     __ add(tmpAddr, coeffs, 128);
 5671     load64shorts(vs1, tmpAddr);
 5672     __ add(tmpAddr, coeffs, 384);
 5673     load64shorts(vs2, tmpAddr);
 5674     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5675     vs_subv(vs1, __ T8H, vs1, vs2);
 5676     __ add(tmpAddr, coeffs, 128);
 5677     store64shorts(vs3, tmpAddr);
 5678     load64shorts(vs2, zetas);
 5679     vs_ldpq(vq, kyberConsts);
 5680     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5681     __ add(tmpAddr, coeffs, 384);
 5682     store64shorts(vs2, tmpAddr);
 5683 
 5684     // multiply by 2^-n
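          // (n.b. montmul of x by toMont(c) yields x * c mod q, i.e. a plain
          // modular multiplication, so the montmuls below scale every
          // coefficient by 2^-n mod q)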
 5685 
 5686     // load toMont(2^-n mod q)
 5687     __ add(tmpAddr, kyberConsts, 48);
 5688     __ ldr(v29, __ Q, tmpAddr);
 5689 
 5690     vs_ldpq(vq, kyberConsts);
 5691     __ add(tmpAddr, coeffs, 0);
 5692     load64shorts(vs1, tmpAddr);
 5693     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5694     __ add(tmpAddr, coeffs, 0);
 5695     store64shorts(vs2, tmpAddr);
 5696 
 5697     // tmpAddr now contains coeffs + 128 because store64shorts post-increments it
 5698     load64shorts(vs1, tmpAddr);
 5699     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5700     __ add(tmpAddr, coeffs, 128);
 5701     store64shorts(vs2, tmpAddr);
 5702 
 5703     // now tmpAddr contains coeffs + 256
 5704     load64shorts(vs1, tmpAddr);
 5705     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5706     __ add(tmpAddr, coeffs, 256);
 5707     store64shorts(vs2, tmpAddr);
 5708 
 5709     // now tmpAddr contains coeffs + 384
 5710     load64shorts(vs1, tmpAddr);
 5711     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5712     __ add(tmpAddr, coeffs, 384);
 5713     store64shorts(vs2, tmpAddr);
 5714 
 5715     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5716     __ mov(r0, zr); // return 0
 5717     __ ret(lr);
 5718 
 5719     return start;
 5720   }
 5721 
 5722   // Kyber multiply polynomials in the NTT domain.
 5723   // Implements
 5724   // static int implKyberNttMult(
 5725   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5726   //
 5727   // result (short[256]) = c_rarg0
 5728   // ntta (short[256]) = c_rarg1
 5729   // nttb (short[256]) = c_rarg2
 5730   // zetas (short[128]) = c_rarg3
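        //
        // The products are computed pairwise in the NTT domain. As an
        // illustrative scalar sketch (matching the per-lane comments in the
        // loop below, not itself generated code), each pair of result
        // coefficients is
        //
        //   r0 = montmul(a0, b0) + montmul(montmul(a1, b1), zeta)
        //   r1 = montmul(a0, b1) + montmul(a1, b0)
        //
        // followed by a montmul with R^2 mod q to convert the results back
        // from Montgomery representation. The loop below computes 16 such
        // result pairs per iteration.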
 5731   address generate_kyberNttMult() {
 5732 
 5733     __ align(CodeEntryAlignment);
 5734     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 5735     StubCodeMark mark(this, stub_id);
 5736     address start = __ pc();
 5737     __ enter();
 5738 
 5739     const Register result = c_rarg0;
 5740     const Register ntta = c_rarg1;
 5741     const Register nttb = c_rarg2;
 5742     const Register zetas = c_rarg3;
 5743 
 5744     const Register kyberConsts = r10;
 5745     const Register limit = r11;
 5746 
 5747     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 5748     VSeq<4> vs3(16), vs4(20);
 5749     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5750     VSeq<2> vz(28);          // pair of zetas
 5751     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5752 
 5753     __ lea(kyberConsts,
 5754              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5755 
 5756     Label kyberNttMult_loop;
 5757 
 5758     __ add(limit, result, 512);
 5759 
 5760     // load q and qinv
 5761     vs_ldpq(vq, kyberConsts);
 5762 
 5763     // load R^2 mod q (to convert back from Montgomery representation)
 5764     __ add(kyberConsts, kyberConsts, 64);
 5765     __ ldr(v27, __ Q, kyberConsts);
 5766 
 5767     __ BIND(kyberNttMult_loop);
 5768 
 5769     // load 16 zetas
 5770     vs_ldpq_post(vz, zetas);
 5771 
 5772     // load 2 sets of 32 coefficients from the two input arrays
 5773     // interleaved as shorts. i.e. pairs of shorts adjacent in memory
 5774     // are striped across pairs of vector registers
 5775     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5776     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5777     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5778     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 5779 
 5780     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5781     // i.e. montmul the first and second halves of vs1 in order and
 5782     // then with one sequence reversed storing the two results in vs3
 5783     //
 5784     // vs3[0] <- montmul(a0, b0)
 5785     // vs3[1] <- montmul(a1, b1)
 5786     // vs3[2] <- montmul(a0, b1)
 5787     // vs3[3] <- montmul(a1, b0)
 5788     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5789     kyber_montmul16(vs_back(vs3),
 5790                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5791 
 5792     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5793     // i.e. montmul the first and second halves of vs4 in order and
 5794     // then with one sequence reversed storing the two results in vs1
 5795     //
 5796     // vs1[0] <- montmul(a2, b2)
 5797     // vs1[1] <- montmul(a3, b3)
 5798     // vs1[2] <- montmul(a2, b3)
 5799     // vs1[3] <- montmul(a3, b2)
 5800     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5801     kyber_montmul16(vs_back(vs1),
 5802                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5803 
 5804     // montmul the second result of each cross-product, i.e. (a1*b1, a3*b3), by a zeta.
 5805     // We can schedule two montmuls at a time if we use a suitable vector
 5806     // sequence <vs3[1], vs1[1]>.
 5807     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5808     VSeq<2> vs5(vs3[1], delta);
 5809 
 5810     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5811     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5812     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5813 
 5814     // add results in pairs storing in vs3
 5815     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5816     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5817     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5818 
 5819     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5820     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5821     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 5822 
 5823     // vs1 <- montmul(vs3, montRSquareModQ)
 5824     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5825 
 5826     // store back the two pairs of result vectors de-interleaved as 8H elements
 5827     // i.e. storing each pairs of shorts striped across a register pair adjacent
 5828     // in memory
 5829     vs_st2_post(vs1, __ T8H, result);
 5830 
 5831     __ cmp(result, limit);
 5832     __ br(Assembler::NE, kyberNttMult_loop);
 5833 
 5834     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5835     __ mov(r0, zr); // return 0
 5836     __ ret(lr);
 5837 
 5838     return start;
 5839   }
 5840 
 5841   // Kyber add 2 polynomials.
 5842   // Implements
 5843   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5844   //
 5845   // result (short[256]) = c_rarg0
 5846   // a (short[256]) = c_rarg1
 5847   // b (short[256]) = c_rarg2
 5848   address generate_kyberAddPoly_2() {
 5849 
 5850     __ align(CodeEntryAlignment);
 5851     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 5852     StubCodeMark mark(this, stub_id);
 5853     address start = __ pc();
 5854     __ enter();
 5855 
 5856     const Register result = c_rarg0;
 5857     const Register a = c_rarg1;
 5858     const Register b = c_rarg2;
 5859 
 5860     const Register kyberConsts = r11;
 5861 
 5862     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5863     // So, we can load, add and store the data in 3 groups of 11,
 5864     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5865     // registers. A further constraint is that the mapping needs
 5866     // to skip callee saves. So, we allocate the register
 5867     // sequences using two 8 sequences, two 2 sequences and two
 5868     // single registers.
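          // (8 + 2 + 1 quadword registers give 11 per pass; the final pass
          // omits the single register, so 11 + 11 + 10 = 32 quadwords, i.e.
          // all 256 shorts, are covered.)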
 5869     VSeq<8> vs1_1(0);
 5870     VSeq<2> vs1_2(16);
 5871     FloatRegister vs1_3 = v28;
 5872     VSeq<8> vs2_1(18);
 5873     VSeq<2> vs2_2(26);
 5874     FloatRegister vs2_3 = v29;
 5875 
 5876     // two constant vector sequences
 5877     VSeq<8> vc_1(31, 0);
 5878     VSeq<2> vc_2(31, 0);
 5879 
 5880     FloatRegister vc_3 = v31;
 5881     __ lea(kyberConsts,
 5882              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5883 
 5884     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5885     for (int i = 0; i < 3; i++) {
 5886       // load 80 or 88 values from a into vs1_1/2/3
 5887       vs_ldpq_post(vs1_1, a);
 5888       vs_ldpq_post(vs1_2, a);
 5889       if (i < 2) {
 5890         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5891       }
 5892       // load 80 or 88 values from b into vs2_1/2/3
 5893       vs_ldpq_post(vs2_1, b);
 5894       vs_ldpq_post(vs2_2, b);
 5895       if (i < 2) {
 5896         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5897       }
 5898       // sum 80 or 88 values across vs1 and vs2 into vs1
 5899       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5900       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5901       if (i < 2) {
 5902         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5903       }
 5904       // add constant to all 80 or 88 results
 5905       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5906       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5907       if (i < 2) {
 5908         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5909       }
 5910       // store 80 or 88 values
 5911       vs_stpq_post(vs1_1, result);
 5912       vs_stpq_post(vs1_2, result);
 5913       if (i < 2) {
 5914         __ str(vs1_3, __ Q, __ post(result, 16));
 5915       }
 5916     }
 5917 
 5918     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5919     __ mov(r0, zr); // return 0
 5920     __ ret(lr);
 5921 
 5922     return start;
 5923   }
 5924 
 5925   // Kyber add 3 polynomials.
 5926   // Implements
 5927   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 5928   //
 5929   // result (short[256]) = c_rarg0
 5930   // a (short[256]) = c_rarg1
 5931   // b (short[256]) = c_rarg2
 5932   // c (short[256]) = c_rarg3
 5933   address generate_kyberAddPoly_3() {
 5934 
 5935     __ align(CodeEntryAlignment);
 5936     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 5937     StubCodeMark mark(this, stub_id);
 5938     address start = __ pc();
 5939     __ enter();
 5940 
 5941     const Register result = c_rarg0;
 5942     const Register a = c_rarg1;
 5943     const Register b = c_rarg2;
 5944     const Register c = c_rarg3;
 5945 
 5946     const Register kyberConsts = r11;
 5947 
 5948     // As above we sum 256 sets of values in total i.e. 32 x 8H
 5949     // quadwords.  So, we can load, add and store the data in 3
 5950     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 5951     // of 10 or 11 registers. A further constraint is that the
 5952     // mapping needs to skip callee saves. So, we allocate the
 5953     // register sequences using two 8 sequences, two 2 sequences
 5954     // and two single registers.
 5955     VSeq<8> vs1_1(0);
 5956     VSeq<2> vs1_2(16);
 5957     FloatRegister vs1_3 = v28;
 5958     VSeq<8> vs2_1(18);
 5959     VSeq<2> vs2_2(26);
 5960     FloatRegister vs2_3 = v29;
 5961 
 5962     // two constant vector sequences
 5963     VSeq<8> vc_1(31, 0);
 5964     VSeq<2> vc_2(31, 0);
 5965 
 5966     FloatRegister vc_3 = v31;
 5967 
 5968     __ lea(kyberConsts,
 5969              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5970 
 5971     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5972     for (int i = 0; i < 3; i++) {
 5973       // load 80 or 88 values from a into vs1_1/2/3
 5974       vs_ldpq_post(vs1_1, a);
 5975       vs_ldpq_post(vs1_2, a);
 5976       if (i < 2) {
 5977         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5978       }
 5979       // load 80 or 88 values from b into vs2_1/2/3
 5980       vs_ldpq_post(vs2_1, b);
 5981       vs_ldpq_post(vs2_2, b);
 5982       if (i < 2) {
 5983         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5984       }
 5985       // sum 80 or 88 values across vs1 and vs2 into vs1
 5986       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5987       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5988       if (i < 2) {
 5989         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5990       }
 5991       // load 80 or 88 values from c into vs2_1/2/3
 5992       vs_ldpq_post(vs2_1, c);
 5993       vs_ldpq_post(vs2_2, c);
 5994       if (i < 2) {
 5995         __ ldr(vs2_3, __ Q, __ post(c, 16));
 5996       }
 5997       // sum 80 or 88 values across vs1 and vs2 into vs1
 5998       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5999       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6000       if (i < 2) {
 6001         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6002       }
 6003       // add constant to all 80 or 88 results
 6004       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6005       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6006       if (i < 2) {
 6007         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6008       }
 6009       // store 80 or 88 values
 6010       vs_stpq_post(vs1_1, result);
 6011       vs_stpq_post(vs1_2, result);
 6012       if (i < 2) {
 6013         __ str(vs1_3, __ Q, __ post(result, 16));
 6014       }
 6015     }
 6016 
 6017     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6018     __ mov(r0, zr); // return 0
 6019     __ ret(lr);
 6020 
 6021     return start;
 6022   }
 6023 
 6024   // Kyber parse XOF output to polynomial coefficient candidates
 6025   // or decodePoly(12, ...).
 6026   // Implements
 6027   // static int implKyber12To16(
 6028   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6029   //
 6030   // (parsedLength or (parsedLength - 48) must be divisible by 64.)
 6031   //
 6032   // condensed (byte[]) = c_rarg0
 6033   // condensedIndex = c_rarg1
 6034   // parsed (short[112 or 256]) = c_rarg2
 6035   // parsedLength (112 or 256) = c_rarg3
 6036   address generate_kyber12To16() {
 6037     Label L_F00, L_loop, L_end;
 6038 
 6039     __ BIND(L_F00);
 6040     __ emit_int64(0x0f000f000f000f00);
 6041     __ emit_int64(0x0f000f000f000f00);
 6042 
 6043     __ align(CodeEntryAlignment);
 6044     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6045     StubCodeMark mark(this, stub_id);
 6046     address start = __ pc();
 6047     __ enter();
 6048 
 6049     const Register condensed = c_rarg0;
 6050     const Register condensedOffs = c_rarg1;
 6051     const Register parsed = c_rarg2;
 6052     const Register parsedLength = c_rarg3;
 6053 
 6054     const Register tmpAddr = r11;
 6055 
 6056     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6057     // quadwords so we need a 6 vector sequence for the inputs.
 6058     // Parsing produces 64 shorts, employing two 8 vector
 6059     // sequences to store and combine the intermediate data.
 6060     VSeq<6> vin(24);
 6061     VSeq<8> va(0), vb(16);
 6062 
 6063     __ adr(tmpAddr, L_F00);
 6064     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6065     __ add(condensed, condensed, condensedOffs);
 6066 
 6067     __ BIND(L_loop);
 6068     // load 96 (6 x 16B) byte values
 6069     vs_ld3_post(vin, __ T16B, condensed);
 6070 
 6071     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6072     // holds 48 (16x3) contiguous bytes from memory striped
 6073     // horizontally across each of the 16 byte lanes. Equivalently,
 6074     // that is 16 pairs of 12-bit integers. Likewise the back half
 6075     // holds the next 48 bytes in the same arrangement.
 6076 
 6077     // Each vector in the front half can also be viewed as a vertical
 6078     // strip across the 16 pairs of 12 bit integers. Each byte in
 6079     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6080     // byte in vin[1] stores the high 4 bits of the first int and the
 6081     // low 4 bits of the second int. Each byte in vin[2] stores the
 6082     // high 8 bits of the second int. Likewise the vectors in second
 6083     // half.
 6084 
 6085     // Converting the data to 16-bit shorts requires first of all
 6086     // expanding each of the 6 x 16B vectors into 6 corresponding
 6087     // pairs of 8H vectors. Mask, shift and add operations on the
 6088     // resulting vector pairs can be used to combine 4 and 8 bit
 6089     // parts of related 8H vector elements.
 6090     //
 6091     // The middle vectors (vin[2] and vin[5]) are actually expanded
 6092     // twice, one copy manipulated to provide the lower 4 bits
 6093     // belonging to the first short in a pair and another copy
 6094     // manipulated to provide the higher 4 bits belonging to the
 6095     // second short in a pair. This is why the vector sequences va
 6096     // and vb used to hold the expanded 8H elements are of length 8.
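          //
          // As an illustrative scalar sketch (not the generated code), each
          // group of three input bytes b0, b1, b2 yields two 12-bit values
          //
          //   s0 = b0 | ((b1 & 0x0f) << 8)
          //   s1 = (b1 >> 4) | (b2 << 4)
          //
          // The vector code below computes 64 such outputs per loop pass.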
 6097 
 6098     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6099     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6100     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6101     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6102     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6103     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6104     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6105     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6106 
 6107     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6108     // and vb[4:5]
 6109     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6110     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6111     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6112     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6113     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6114     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6115 
 6116     // shift lo byte of copy 1 of the middle stripe into the high byte
 6117     __ shl(va[2], __ T8H, va[2], 8);
 6118     __ shl(va[3], __ T8H, va[3], 8);
 6119     __ shl(vb[2], __ T8H, vb[2], 8);
 6120     __ shl(vb[3], __ T8H, vb[3], 8);
 6121 
 6122     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6123     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6124     // are in bit positions [4..11].
 6125     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6126     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6127     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6128     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6129 
 6130     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6131     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6132     // copy2
 6133     __ andr(va[2], __ T16B, va[2], v31);
 6134     __ andr(va[3], __ T16B, va[3], v31);
 6135     __ ushr(va[4], __ T8H, va[4], 4);
 6136     __ ushr(va[5], __ T8H, va[5], 4);
 6137     __ andr(vb[2], __ T16B, vb[2], v31);
 6138     __ andr(vb[3], __ T16B, vb[3], v31);
 6139     __ ushr(vb[4], __ T8H, vb[4], 4);
 6140     __ ushr(vb[5], __ T8H, vb[5], 4);
 6141 
 6142     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6143     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6144     // n.b. the ordering ensures: i) inputs are consumed before they
 6145     // are overwritten ii) the order of 16-bit results across successive
 6146     // pairs of vectors in va and then vb reflects the order of the
 6147     // corresponding 12-bit inputs
 6148     __ addv(va[0], __ T8H, va[0], va[2]);
 6149     __ addv(va[2], __ T8H, va[1], va[3]);
 6150     __ addv(va[1], __ T8H, va[4], va[6]);
 6151     __ addv(va[3], __ T8H, va[5], va[7]);
 6152     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6153     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6154     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6155     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6156 
 6157     // store 64 results interleaved as shorts
 6158     vs_st2_post(vs_front(va), __ T8H, parsed);
 6159     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6160 
 6161     __ sub(parsedLength, parsedLength, 64);
 6162     __ cmp(parsedLength, (u1)64);
 6163     __ br(Assembler::GE, L_loop);
 6164     __ cbz(parsedLength, L_end);
 6165 
 6166     // If anything is left it should be a final 72 bytes of input,
 6167     // i.e. a final 48 12-bit values. So we handle this by loading
 6168     // 48 bytes into all 16B lanes of front(vin) and only 24
 6169     // bytes into the lower 8B lanes of back(vin).
 6170     vs_ld3_post(vs_front(vin), __ T16B, condensed);
 6171     vs_ld3(vs_back(vin), __ T8B, condensed);
 6172 
 6173     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6174     // n.b. target elements 2 and 3 of va duplicate elements 4 and
 6175     // 5 and target element 2 of vb duplicates element 4.
 6176     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6177     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6178     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6179     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6180     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6181     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6182 
 6183     // This time expand just the lower 8 lanes
 6184     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6185     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6186     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6187 
 6188     // shift lo byte of copy 1 of the middle stripe into the high byte
 6189     __ shl(va[2], __ T8H, va[2], 8);
 6190     __ shl(va[3], __ T8H, va[3], 8);
 6191     __ shl(vb[2], __ T8H, vb[2], 8);
 6192 
 6193     // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
 6194     // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
 6195     // int are in bit positions [4..11].
 6196     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6197     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6198     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6199 
 6200     // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
 6201     // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
 6202     // copy2
 6203     __ andr(va[2], __ T16B, va[2], v31);
 6204     __ andr(va[3], __ T16B, va[3], v31);
 6205     __ ushr(va[4], __ T8H, va[4], 4);
 6206     __ ushr(va[5], __ T8H, va[5], 4);
 6207     __ andr(vb[2], __ T16B, vb[2], v31);
 6208     __ ushr(vb[4], __ T8H, vb[4], 4);
 6209 
 6212     // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
 6213     // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
 6215     // n.b. ordering ensures: i) inputs are consumed before they are
 6216     // overwritten ii) order of 16-bit results across successive
 6217     // pairs of vectors in va and then lower half of vb reflects order
 6218     // of corresponding 12-bit inputs
 6219     __ addv(va[0], __ T8H, va[0], va[2]);
 6220     __ addv(va[2], __ T8H, va[1], va[3]);
 6221     __ addv(va[1], __ T8H, va[4], va[6]);
 6222     __ addv(va[3], __ T8H, va[5], va[7]);
 6223     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6224     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6225 
 6226     // store 48 results interleaved as shorts
 6227     vs_st2_post(vs_front(va), __ T8H, parsed);
 6228     vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
 6229 
 6230     __ BIND(L_end);
 6231 
 6232     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6233     __ mov(r0, zr); // return 0
 6234     __ ret(lr);
 6235 
 6236     return start;
 6237   }
 6238 
 6239   // Kyber Barrett reduce function.
 6240   // Implements
 6241   // static int implKyberBarrettReduce(short[] coeffs) {}
 6242   //
 6243   // coeffs (short[256]) = c_rarg0
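        //
        // As an illustrative scalar sketch (not the generated code), each
        // coefficient c is replaced by its Barrett remainder
        //
        //   c = c - ((c * kyberBarrettMultiplier) >> 26) * kyber_q
        //
        // The loop below applies this to all 256 coefficients, 80 or 88 at
        // a time.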
 6244   address generate_kyberBarrettReduce() {
 6245 
 6246     __ align(CodeEntryAlignment);
 6247     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6248     StubCodeMark mark(this, stub_id);
 6249     address start = __ pc();
 6250     __ enter();
 6251 
 6252     const Register coeffs = c_rarg0;
 6253 
 6254     const Register kyberConsts = r10;
 6255     const Register result = r11;
 6256 
 6257     // As above we process 256 sets of values in total i.e. 32 x
 6258     // 8H quadwords. So, we can load, add and store the data in 3
 6259     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6260     // of 10 or 11 registers. A further constraint is that the
 6261     // mapping needs to skip callee saves. So, we allocate the
 6262     // register sequences using two 8 sequences, two 2 sequences
 6263     // and two single registers.
 6264     VSeq<8> vs1_1(0);
 6265     VSeq<2> vs1_2(16);
 6266     FloatRegister vs1_3 = v28;
 6267     VSeq<8> vs2_1(18);
 6268     VSeq<2> vs2_2(26);
 6269     FloatRegister vs2_3 = v29;
 6270 
 6271     // we also need a pair of corresponding constant sequences
 6272 
 6273     VSeq<8> vc1_1(30, 0);
 6274     VSeq<2> vc1_2(30, 0);
 6275     FloatRegister vc1_3 = v30; // for kyber_q
 6276 
 6277     VSeq<8> vc2_1(31, 0);
 6278     VSeq<2> vc2_2(31, 0);
 6279     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6280 
 6281     __ add(result, coeffs, 0);
 6282     __ lea(kyberConsts,
 6283              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6284 
 6285     // load q and the multiplier for the Barrett reduction
 6286     __ add(kyberConsts, kyberConsts, 16);
 6287     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6288 
 6289     for (int i = 0; i < 3; i++) {
 6290       // load 80 or 88 coefficients
 6291       vs_ldpq_post(vs1_1, coeffs);
 6292       vs_ldpq_post(vs1_2, coeffs);
 6293       if (i < 2) {
 6294         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6295       }
 6296 
 6297       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6298       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6299       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6300       if (i < 2) {
 6301         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6302       }
 6303 
 6304       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6305       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6306       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6307       if (i < 2) {
 6308         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6309       }
 6310 
 6311       // vs1 <- vs1 - vs2 * kyber_q
 6312       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6313       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6314       if (i < 2) {
 6315         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6316       }
 6317 
 6318       vs_stpq_post(vs1_1, result);
 6319       vs_stpq_post(vs1_2, result);
 6320       if (i < 2) {
 6321         __ str(vs1_3, __ Q, __ post(result, 16));
 6322       }
 6323     }
 6324 
 6325     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6326     __ mov(r0, zr); // return 0
 6327     __ ret(lr);
 6328 
 6329     return start;
 6330   }
 6331 
 6332 
 6333   // Dilithium-specific montmul helper routines that generate parallel
 6334   // code for, respectively, a single 4x4s vector sequence montmul or
 6335   // two such multiplies in a row.
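        //
        // Here montmul denotes a Montgomery multiplication, i.e. the product
        // a * b * R^-1 mod q computed without a division (with R assumed to
        // be 2^32, matching the Java ML_DSA implementation).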
 6336 
 6337   // Perform 16 32-bit Montgomery multiplications in parallel
 6338   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6339                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6340     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6341     // It will assert that the register use is valid
 6342     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6343   }
 6344 
 6345   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6346   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6347                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6348     // Schedule two successive 4x4S multiplies via the montmul helper
 6349     // on the front and back halves of va, vb and vc. The helper will
 6350     // assert that the register use has no overlap conflicts on each
 6351     // individual call but we also need to ensure that the necessary
 6352     // disjoint/equality constraints are met across both calls.
 6353 
 6354     // vb, vc, vtmp and vq must be disjoint. va must either be
 6355     // disjoint from all other registers or equal vc
 6356 
 6357     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6358     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6359     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6360 
 6361     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6362     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6363 
 6364     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6365 
 6366     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6367     assert(vs_disjoint(va, vb), "va and vb overlap");
 6368     assert(vs_disjoint(va, vq), "va and vq overlap");
 6369     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6370 
 6371     // We multiply the front and back halves of each sequence 4 at a
 6372     // time because
 6373     //
 6374     // 1) we are currently only able to get 4-way instruction
 6375     // parallelism at best
 6376     //
 6377     // 2) we need registers for the constants in vq and temporary
 6378     // scratch registers to hold intermediate results so vtmp can only
 6379     // be a VSeq<4> which means we only have 4 scratch slots.
 6380 
 6381     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6382     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6383   }
 6384 
 6385   // Perform combined montmul then add/sub on 4x4S vectors.
 6386   void dilithium_montmul16_sub_add(
 6387           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6388           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6389     // compute a = montmul(a1, c)
 6390     dilithium_montmul16(vc, va1, vc, vtmp, vq);
 6391     // output a1 = a0 - a
 6392     vs_subv(va1, __ T4S, va0, vc);
 6393     //    and a0 = a0 + a
 6394     vs_addv(va0, __ T4S, va0, vc);
 6395   }
 6396 
 6397   // Perform combined add/sub then montmul on 4x4S vectors.
 6398   void dilithium_sub_add_montmul16(
 6399           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6400           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6401     // compute c = a0 - a1
 6402     vs_subv(vtmp1, __ T4S, va0, va1);
 6403     // output a0 = a0 + a1
 6404     vs_addv(va0, __ T4S, va0, va1);
 6405     // output a1 = b montmul c
 6406     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6407   }
 6408 
 6409   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6410   // in the Java implementation come in sequences of at least 8, so we
 6411   // can use ldpq to collect the corresponding data into pairs of vector
 6412   // registers.
 6413   // We collect the coefficients corresponding to the 'j+l' indexes into
 6414   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6415   // then we do the (Montgomery) multiplications by the zetas in parallel
 6416   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6417   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6418   // v0-v7 and finally save the results back to the coeffs array.
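        //
        // As an illustrative scalar sketch, one butterfly at these levels
        // computes, for a pair of indexes j and j + l,
        //
        //   t = montmul(zeta, coeffs[j + l]);
        //   coeffs[j + l] = coeffs[j] - t;
        //   coeffs[j]     = coeffs[j] + t;
        //
        // The loop below performs 32 such butterflies at a time using 8x4S
        // vector operations.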
 6419   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6420     const Register coeffs, const Register zetas) {
 6421     int c1 = 0;
 6422     int c2 = 512;
 6423     int startIncr;
 6424     // don't use callee save registers v8 - v15
 6425     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6426     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6427     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6428     int offsets[4] = { 0, 32, 64, 96 };
 6429 
 6430     for (int level = 0; level < 5; level++) {
 6431       int c1Start = c1;
 6432       int c2Start = c2;
 6433       if (level == 3) {
 6434         offsets[1] = 32;
 6435         offsets[2] = 128;
 6436         offsets[3] = 160;
 6437       } else if (level == 4) {
 6438         offsets[1] = 64;
 6439         offsets[2] = 128;
 6440         offsets[3] = 192;
 6441       }
 6442 
 6443       // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
 6444       // time at 4 different offsets and multiply them in order by the
 6445       // next set of input values. So we employ indexed load and store
 6446       // pair instructions with arrangement 4S.
 6447       for (int i = 0; i < 4; i++) {
 6448         // reload q and qinv
 6449         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6450         // load 8x4S coefficients via second start pos == c2
 6451         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6452         // load next 8x4S inputs == b
 6453         vs_ldpq_post(vs2, zetas);
 6454         // compute a == c2 * b mod MONT_Q
 6455         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6456         // load 8x4s coefficients via first start pos == c1
 6457         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6458         // compute a1 =  c1 + a
 6459         vs_addv(vs3, __ T4S, vs1, vs2);
 6460         // compute a2 =  c1 - a
 6461         vs_subv(vs1, __ T4S, vs1, vs2);
 6462         // output a1 and a2
 6463         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6464         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6465 
 6466         int k = 4 * level + i;
 6467 
 6468         if (k > 7) {
 6469           startIncr = 256;
 6470         } else if (k == 5) {
 6471           startIncr = 384;
 6472         } else {
 6473           startIncr = 128;
 6474         }
 6475 
 6476         c1Start += startIncr;
 6477         c2Start += startIncr;
 6478       }
 6479 
 6480       c2 /= 2;
 6481     }
 6482   }
 6483 
 6484   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6485   // Implements the method
 6486   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
 6487   // of the Java class sun.security.provider.ML_DSA.
 6488   //
 6489   // coeffs (int[256]) = c_rarg0
 6490   // zetas (int[256]) = c_rarg1
 6491   address generate_dilithiumAlmostNtt() {
 6492 
 6493     __ align(CodeEntryAlignment);
 6494     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 6495     StubCodeMark mark(this, stub_id);
 6496     address start = __ pc();
 6497     __ enter();
 6498 
 6499     const Register coeffs = c_rarg0;
 6500     const Register zetas = c_rarg1;
 6501 
 6502     const Register tmpAddr = r9;
 6503     const Register dilithiumConsts = r10;
 6504     const Register result = r11;
 6505     // don't use callee save registers v8 - v15
 6506     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6507     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6508     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6509     int offsets[4] = { 0, 32, 64, 96};
 6510     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6511     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6512     __ add(result, coeffs, 0);
 6513     __ lea(dilithiumConsts,
 6514              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6515 
 6516     // Each level represents one iteration of the outer for loop of the Java version.
 6517 
 6518     // level 0-4
 6519     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6520 
 6521     // level 5
 6522 
 6523     // At level 5 the coefficients we need to combine with the zetas
 6524     // are grouped in memory in blocks of size 4. So, for both sets of
 6525     // coefficients we load 4 adjacent values at 8 different offsets
 6526     // using an indexed ldr with register variant Q and multiply them
 6527     // in sequence order by the next set of inputs. Likewise we store
 6528     // the results using an indexed str with register variant Q.
 6529     for (int i = 0; i < 1024; i += 256) {
 6530       // reload constants q, qinv each iteration as they get clobbered later
 6531       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6532       // load 32 (8x4S) coefficients via first offsets = c1
 6533       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6534       // load next 32 (8x4S) inputs = b
 6535       vs_ldpq_post(vs2, zetas);
 6536       // a = b montmul c1
 6537       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6538       // load 32 (8x4S) coefficients via second offsets = c2
 6539       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6540       // add/sub with result of multiply
 6541       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
 6542       vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 6543       // write back new coefficients using same offsets
 6544       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6545       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6546     }
 6547 
 6548     // level 6
 6549     // At level 6 the coefficients we need to combine with the zetas
 6550     // are grouped in memory in pairs, the first two being montmul
 6551     // inputs and the second add/sub inputs. We can still implement
 6552     // the montmul+sub+add using 4-way parallelism but only if we
 6553     // combine the coefficients with the zetas 16 at a time. We load 8
 6554     // adjacent values at 4 different offsets using an ld2 load with
 6555     // arrangement 2D. That interleaves the lower and upper halves of
 6556     // each pair of quadwords into successive vector registers. We
 6557     // then need to montmul the 4 even elements of the coefficients
 6558     // register sequence by the zetas in order and then add/sub the 4
 6559     // odd elements of the coefficients register sequence. We use an
 6560     // equivalent st2 operation to store the results back into memory
 6561     // de-interleaved.
 6562     for (int i = 0; i < 1024; i += 128) {
 6563       // reload constants q, qinv each iteration as they get clobbered later
 6564       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6565       // load interleaved 16 (4x2D) coefficients via offsets
 6566       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6567       // load next 16 (4x4S) inputs
 6568       vs_ldpq_post(vs_front(vs2), zetas);
 6569       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6570       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6571                                   vs_front(vs2), vtmp, vq);
 6572       // store interleaved 16 (4x2D) coefficients via offsets
 6573       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6574     }
 6575 
 6576     // level 7
 6577     // At level 7 the coefficients we need to combine with the zetas
 6578     // occur singly with montmul inputs alternating with add/sub
 6579     // inputs. Once again we can use 4-way parallelism to combine 16
 6580     // zetas at a time. However, we have to load 8 adjacent values at
 6581     // 4 different offsets using an ld2 load with arrangement 4S. That
 6582     // interleaves the odd words of each pair into one
 6583     // coefficients vector register and the even words of the pair
 6584     // into the next register. We then need to montmul the 4 even
 6585     // elements of the coefficients register sequence by the zetas in
 6586     // order and then add/sub the 4 odd elements of the coefficients
 6587     // register sequence. We use an equivalent st2 operation to store
 6588     // the results back into memory de-interleaved.
 6589 
 6590     for (int i = 0; i < 1024; i += 128) {
 6591       // reload constants q, qinv each iteration as they get clobbered later
 6592       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6593       // load interleaved 16 (4x4S) coefficients via offsets
 6594       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6595       // load next 16 (4x4S) inputs
 6596       vs_ldpq_post(vs_front(vs2), zetas);
 6597       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6598       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6599                                   vs_front(vs2), vtmp, vq);
 6600       // store interleaved 16 (4x4S) coefficients via offsets
 6601       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6602     }
 6603     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6604     __ mov(r0, zr); // return 0
 6605     __ ret(lr);
 6606 
 6607     return start;
 6608   }
 6609 
 6610   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6611   // in the Java implementation come in sequences of at least 8, so we
 6612   // can use ldpq to collect the corresponding data into pairs of vector
 6613   // registers
 6614   // We collect the coefficients that correspond to the 'j's into vs1
 6615   // the coefficients that correspond to the 'j+l's into vs2 then
 6616   // do the additions into vs3 and the subtractions into vs1 then
 6617   // save the result of the additions, load the zetas into vs2
 6618   // do the (Montgomery) multiplications by zeta in parallel into vs2
 6619   // finally save the results back to the coeffs array
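        //
        // As an illustrative scalar sketch, one inverse butterfly at these
        // levels computes, for a pair of indexes j and j + l,
        //
        //   t = coeffs[j];
        //   coeffs[j]     = t + coeffs[j + l];
        //   coeffs[j + l] = montmul(t - coeffs[j + l], zeta);
        //
        // with the loop below performing 32 such butterflies at a time.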
 6620   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6621     const Register coeffs, const Register zetas) {
 6622     int c1 = 0;
 6623     int c2 = 32;
 6624     int startIncr;
 6625     int offsets[4];
 6626     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6627     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6628     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6629 
 6630     offsets[0] = 0;
 6631 
 6632     for (int level = 3; level < 8; level++) {
 6633       int c1Start = c1;
 6634       int c2Start = c2;
 6635       if (level == 3) {
 6636         offsets[1] = 64;
 6637         offsets[2] = 128;
 6638         offsets[3] = 192;
 6639       } else if (level == 4) {
 6640         offsets[1] = 32;
 6641         offsets[2] = 128;
 6642         offsets[3] = 160;
 6643       } else {
 6644         offsets[1] = 32;
 6645         offsets[2] = 64;
 6646         offsets[3] = 96;
 6647       }
 6648 
 6649       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6650       // time at 4 different offsets and multiply them in order by the
 6651       // next set of input values. So we employ indexed load and store
 6652       // pair instructions with arrangement 4S.
 6653       for (int i = 0; i < 4; i++) {
 6654         // load v1 32 (8x4S) coefficients relative to first start index
 6655         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6656         // load v2 32 (8x4S) coefficients relative to second start index
 6657         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 6658         // a0 = v1 + v2 -- n.b. clobbers vq
 6659         vs_addv(vs3, __ T4S, vs1, vs2);
 6660         // a1 = v1 - v2
 6661         vs_subv(vs1, __ T4S, vs1, vs2);
 6662         // save a1 relative to first start index
 6663         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6664         // load constants q, qinv each iteration as they get clobbered above
 6665         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6666         // load b next 32 (8x4S) inputs
 6667         vs_ldpq_post(vs2, zetas);
 6668         // a = a1 montmul b
 6669         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6670         // save a relative to second start index
 6671         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6672 
 6673         int k = 4 * level + i;
 6674 
 6675         if (k < 24) {
 6676           startIncr = 256;
 6677         } else if (k == 25) {
 6678           startIncr = 384;
 6679         } else {
 6680           startIncr = 128;
 6681         }
 6682 
 6683         c1Start += startIncr;
 6684         c2Start += startIncr;
 6685       }
 6686 
 6687       c2 *= 2;
 6688     }
 6689   }
 6690 
 6691   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6692   // Implements the method
 6693   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6694   // the sun.security.provider.ML_DSA class.
 6695   //
 6696   // coeffs (int[256]) = c_rarg0
 6697   // zetas (int[256]) = c_rarg1
 6698   address generate_dilithiumAlmostInverseNtt() {
 6699 
 6700     __ align(CodeEntryAlignment);
 6701     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 6702     StubCodeMark mark(this, stub_id);
 6703     address start = __ pc();
 6704     __ enter();
 6705 
 6706     const Register coeffs = c_rarg0;
 6707     const Register zetas = c_rarg1;
 6708 
 6709     const Register tmpAddr = r9;
 6710     const Register dilithiumConsts = r10;
 6711     const Register result = r11;
 6712     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6713     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 6714     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6715     int offsets[4] = { 0, 32, 64, 96 };
 6716     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6717     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6718 
 6719     __ add(result, coeffs, 0);
 6720     __ lea(dilithiumConsts,
 6721              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6722 
 6723     // Each level represents one iteration of the outer for loop of the Java version
 6724 
 6725     // level 0
 6726     // At level 0 we need to interleave adjacent quartets of
 6727     // coefficients before we multiply and add/sub by the next 16
 6728     // zetas just as we did for level 7 in the multiply code. So we
 6729     // load and store the values using an ld2/st2 with arrangement 4S.
 6730     for (int i = 0; i < 1024; i += 128) {
 6731       // load constants q, qinv
 6732       // n.b. this can be moved out of the loop as they do not get
 6733       // clobbered by first two loops
 6734       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6735       // a0/a1 load interleaved 32 (8x4S) coefficients
 6736       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6737       // b load next 32 (8x4S) inputs
 6738       vs_ldpq_post(vs_front(vs2), zetas);
 6739       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6740       // n.b. second half of vs2 provides temporary register storage
 6741       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6742                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6743       // a0/a1 store interleaved 32 (8x4S) coefficients
 6744       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6745     }
 6746 
 6747     // level 1
 6748     // At level 1 we need to interleave pairs of adjacent pairs of
 6749     // coefficients before we multiply by the next 16 zetas just as we
 6750     // did for level 6 in the multiply code. So we load and store the
    // values using an ld2/st2 with arrangement 2D.
 6752     for (int i = 0; i < 1024; i += 128) {
 6753       // a0/a1 load interleaved 32 (8x2D) coefficients
 6754       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6755       // b load next 16 (4x4S) inputs
 6756       vs_ldpq_post(vs_front(vs2), zetas);
 6757       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6758       // n.b. second half of vs2 provides temporary register storage
 6759       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6760                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6761       // a0/a1 store interleaved 32 (8x2D) coefficients
 6762       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6763     }
 6764 
 6765     // level 2
 6766     // At level 2 coefficients come in blocks of 4. So, we load 4
 6767     // adjacent coefficients at 8 distinct offsets for both the first
 6768     // and second coefficient sequences, using an ldr with register
 6769     // variant Q then combine them with next set of 32 zetas. Likewise
 6770     // we store the results using an str with register variant Q.
 6771     for (int i = 0; i < 1024; i += 256) {
 6772       // c0 load 32 (8x4S) coefficients via first offsets
 6773       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6774       // c1 load 32 (8x4S) coefficients via second offsets
      vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6776       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6777       vs_addv(vs3, __ T4S, vs1, vs2);
 6778       // c = c0 - c1
 6779       vs_subv(vs1, __ T4S, vs1, vs2);
 6780       // store a0 32 (8x4S) coefficients via first offsets
 6781       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6782       // b load 32 (8x4S) next inputs
 6783       vs_ldpq_post(vs2, zetas);
 6784       // reload constants q, qinv -- they were clobbered earlier
 6785       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6786       // compute a1 = b montmul c
 6787       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6788       // store a1 32 (8x4S) coefficients via second offsets
 6789       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6790     }
 6791 
 6792     // level 3-7
 6793     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6794 
 6795     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6796     __ mov(r0, zr); // return 0
 6797     __ ret(lr);
 6798 
 6799     return start;
 6800   }
 6801 
 6802   // Dilithium multiply polynomials in the NTT domain.
 6803   // Straightforward implementation of the method
 6804   // static int implDilithiumNttMult(
  //              int[] result, int[] ntta, int[] nttb) {} of
 6806   // the sun.security.provider.ML_DSA class.
 6807   //
 6808   // result (int[256]) = c_rarg0
 6809   // poly1 (int[256]) = c_rarg1
 6810   // poly2 (int[256]) = c_rarg2
 6811   address generate_dilithiumNttMult() {
 6812 
    __ align(CodeEntryAlignment);
 6814     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 6815     StubCodeMark mark(this, stub_id);
 6816     address start = __ pc();
 6817     __ enter();
 6818 
 6819     Label L_loop;
 6820 
 6821     const Register result = c_rarg0;
 6822     const Register poly1 = c_rarg1;
 6823     const Register poly2 = c_rarg2;
 6824 
 6825     const Register dilithiumConsts = r10;
 6826     const Register len = r11;
 6827 
 6828     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6830     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6831     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6832 
 6833     __ lea(dilithiumConsts,
 6834              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6835 
 6836     // load constants q, qinv
 6837     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6838     // load constant rSquare into v29
 6839     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 6840 
 6841     __ mov(len, zr);
 6842     __ add(len, len, 1024);
 6843 
 6844     __ BIND(L_loop);
 6845 
 6846     // b load 32 (8x4S) next inputs from poly1
 6847     vs_ldpq_post(vs1, poly1);
 6848     // c load 32 (8x4S) next inputs from poly2
 6849     vs_ldpq_post(vs2, poly2);
 6850     // compute a = b montmul c
 6851     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6852     // compute a = rsquare montmul a
 6853     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
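    // n.b. each montmul introduces a factor of R^-1 (R = 2^32 here), so
    // the first montmul yields b*c*R^-1 and the second, by
    // RSQUARE = R^2 mod q, cancels both R^-1 factors, leaving the plain
    // product b*c mod q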
 6854     // save a 32 (8x4S) results
 6855     vs_stpq_post(vs2, result);
 6856 
 6857     __ sub(len, len, 128);
 6858     __ cmp(len, (u1)128);
 6859     __ br(Assembler::GE, L_loop);
 6860 
 6861     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6862     __ mov(r0, zr); // return 0
 6863     __ ret(lr);
 6864 
 6865     return start;
 6866   }
 6867 
  // Dilithium Montgomery multiply an array by a constant.
 6869   // A straightforward implementation of the method
 6870   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  // of the sun.security.provider.ML_DSA class
 6872   //
 6873   // coeffs (int[256]) = c_rarg0
 6874   // constant (int) = c_rarg1
 6875   address generate_dilithiumMontMulByConstant() {
 6876 
 6877     __ align(CodeEntryAlignment);
 6878     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 6879     StubCodeMark mark(this, stub_id);
 6880     address start = __ pc();
 6881     __ enter();
 6882 
 6883     Label L_loop;
 6884 
 6885     const Register coeffs = c_rarg0;
 6886     const Register constant = c_rarg1;
 6887 
 6888     const Register dilithiumConsts = r10;
 6889     const Register result = r11;
 6890     const Register len = r12;
 6891 
 6892     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6893     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6894     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6895     VSeq<8> vconst(29, 0);             // for montmul by constant
 6896 
 6897     // results track inputs
 6898     __ add(result, coeffs, 0);
 6899     __ lea(dilithiumConsts,
 6900              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6901 
 6902     // load constants q, qinv -- they do not get clobbered by first two loops
 6903     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6904     // copy caller supplied constant across vconst
 6905     __ dup(vconst[0], __ T4S, constant);
 6906     __ mov(len, zr);
 6907     __ add(len, len, 1024);
 6908 
 6909     __ BIND(L_loop);
 6910 
 6911     // load next 32 inputs
 6912     vs_ldpq_post(vs2, coeffs);
 6913     // mont mul by constant
 6914     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6915     // write next 32 results
 6916     vs_stpq_post(vs2, result);
 6917 
 6918     __ sub(len, len, 128);
 6919     __ cmp(len, (u1)128);
 6920     __ br(Assembler::GE, L_loop);
 6921 
 6922     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6923     __ mov(r0, zr); // return 0
 6924     __ ret(lr);
 6925 
 6926     return start;
 6927   }
 6928 
 6929   // Dilithium decompose poly.
 6930   // Implements the method
  // static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
  //              int[] highPart, int twoGamma2, int multiplier) {}
 6932   // of the sun.security.provider.ML_DSA class
 6933   //
 6934   // input (int[256]) = c_rarg0
 6935   // lowPart (int[256]) = c_rarg1
 6936   // highPart (int[256]) = c_rarg2
 6937   // twoGamma2  (int) = c_rarg3
 6938   // multiplier (int) = c_rarg4
 6939   address generate_dilithiumDecomposePoly() {
 6940 
 6941     __ align(CodeEntryAlignment);
 6942     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 6943     StubCodeMark mark(this, stub_id);
 6944     address start = __ pc();
 6945     Label L_loop;
 6946 
 6947     const Register input = c_rarg0;
 6948     const Register lowPart = c_rarg1;
 6949     const Register highPart = c_rarg2;
 6950     const Register twoGamma2 = c_rarg3;
 6951     const Register multiplier = c_rarg4;
 6952 
 6953     const Register len = r9;
 6954     const Register dilithiumConsts = r10;
 6955     const Register tmp = r11;
 6956 
 6957     // 6 independent sets of 4x4s values
 6958     VSeq<4> vs1(0), vs2(4), vs3(8);
 6959     VSeq<4> vs4(12), vs5(16), vtmp(20);
 6960 
 6961     // 7 constants for cross-multiplying
 6962     VSeq<4> one(25, 0);
 6963     VSeq<4> qminus1(26, 0);
 6964     VSeq<4> g2(27, 0);
 6965     VSeq<4> twog2(28, 0);
 6966     VSeq<4> mult(29, 0);
 6967     VSeq<4> q(30, 0);
 6968     VSeq<4> qadd(31, 0);
 6969 
 6970     __ enter();
 6971 
 6972     __ lea(dilithiumConsts,
 6973              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6974 
 6975     // save callee-saved registers
 6976     __ stpd(v8, v9, __ pre(sp, -64));
 6977     __ stpd(v10, v11, Address(sp, 16));
 6978     __ stpd(v12, v13, Address(sp, 32));
 6979     __ stpd(v14, v15, Address(sp, 48));
 6980 
 6981     // populate constant registers
 6982     __ mov(tmp, zr);
 6983     __ add(tmp, tmp, 1);
 6984     __ dup(one[0], __ T4S, tmp); // 1
 6985     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 6986     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 6987     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 6988     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 6989     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 6990     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 6991 
 6992     __ mov(len, zr);
 6993     __ add(len, len, 1024);
 6994 
 6995     __ BIND(L_loop);
 6996 
 6997     // load next 4x4S inputs interleaved: rplus --> vs1
 6998     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 6999 
 7000     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7001     vs_addv(vtmp, __ T4S, vs1, qadd);
 7002     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7003     vs_mulv(vtmp, __ T4S, vtmp, q);
 7004     vs_subv(vs1, __ T4S, vs1, vtmp);
 7005 
 7006     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7007     vs_sshr(vtmp, __ T4S, vs1, 31);
 7008     vs_andr(vtmp, vtmp, q);
 7009     vs_addv(vs1, __ T4S, vs1, vtmp);
 7010 
 7011     // quotient --> vs2
 7012     // int quotient = (rplus * multiplier) >> 22;
 7013     vs_mulv(vtmp, __ T4S, vs1, mult);
 7014     vs_sshr(vs2, __ T4S, vtmp, 22);
 7015 
 7016     // r0 --> vs3
 7017     // int r0 = rplus - quotient * twoGamma2;
 7018     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7019     vs_subv(vs3, __ T4S, vs1, vtmp);
 7020 
 7021     // mask --> vs4
 7022     // int mask = (twoGamma2 - r0) >> 22;
 7023     vs_subv(vtmp, __ T4S, twog2, vs3);
 7024     vs_sshr(vs4, __ T4S, vtmp, 22);
 7025 
 7026     // r0 -= (mask & twoGamma2);
 7027     vs_andr(vtmp, vs4, twog2);
 7028     vs_subv(vs3, __ T4S, vs3, vtmp);
 7029 
 7030     //  quotient += (mask & 1);
 7031     vs_andr(vtmp, vs4, one);
 7032     vs_addv(vs2, __ T4S, vs2, vtmp);
 7033 
 7034     // mask = (twoGamma2 / 2 - r0) >> 31;
 7035     vs_subv(vtmp, __ T4S, g2, vs3);
 7036     vs_sshr(vs4, __ T4S, vtmp, 31);
 7037 
 7038     // r0 -= (mask & twoGamma2);
 7039     vs_andr(vtmp, vs4, twog2);
 7040     vs_subv(vs3, __ T4S, vs3, vtmp);
 7041 
 7042     // quotient += (mask & 1);
 7043     vs_andr(vtmp, vs4, one);
 7044     vs_addv(vs2, __ T4S, vs2, vtmp);
 7045 
 7046     // r1 --> vs5
 7047     // int r1 = rplus - r0 - (dilithium_q - 1);
 7048     vs_subv(vtmp, __ T4S, vs1, vs3);
 7049     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7050 
 7051     // r1 --> vs1 (overwriting rplus)
 7052     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7053     vs_negr(vtmp, __ T4S, vs5);
 7054     vs_orr(vtmp, vs5, vtmp);
 7055     vs_sshr(vs1, __ T4S, vtmp, 31);
 7056 
 7057     // r0 += ~r1;
 7058     vs_notr(vtmp, vs1);
 7059     vs_addv(vs3, __ T4S, vs3, vtmp);
 7060 
 7061     // r1 = r1 & quotient;
 7062     vs_andr(vs1, vs2, vs1);
 7063 
    // store results interleaved
 7065     // lowPart[m] = r0;
 7066     // highPart[m] = r1;
 7067     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7068     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7069 
 7070     __ sub(len, len, 64);
 7071     __ cmp(len, (u1)64);
 7072     __ br(Assembler::GE, L_loop);
 7073 
 7074     // restore callee-saved vector registers
 7075     __ ldpd(v14, v15, Address(sp, 48));
 7076     __ ldpd(v12, v13, Address(sp, 32));
 7077     __ ldpd(v10, v11, Address(sp, 16));
 7078     __ ldpd(v8, v9, __ post(sp, 64));
 7079 
 7080     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7081     __ mov(r0, zr); // return 0
 7082     __ ret(lr);
 7083 
 7084     return start;
 7085   }
 7086 
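  // Keccak chi step for one row of five lanes, entirely in general
  // registers: a[i] ^= ~a[i+1] & a[i+2] (indices mod 5), with bic
  // supplying the ~x & y terms -- a GPR stand-in for the vector bcax
  // instruction.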
 7087   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7088              Register tmp0, Register tmp1, Register tmp2) {
 7089     __ bic(tmp0, a2, a1); // for a0
 7090     __ bic(tmp1, a3, a2); // for a1
 7091     __ bic(tmp2, a4, a3); // for a2
 7092     __ eor(a2, a2, tmp2);
 7093     __ bic(tmp2, a0, a4); // for a3
 7094     __ eor(a3, a3, tmp2);
 7095     __ bic(tmp2, a1, a0); // for a4
 7096     __ eor(a0, a0, tmp0);
 7097     __ eor(a1, a1, tmp1);
 7098     __ eor(a4, a4, tmp2);
 7099   }
 7100 
 7101   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7102                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7103                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7104                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7105                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7106                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7107                         Register tmp0, Register tmp1, Register tmp2) {
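    // theta: compute the five column parities c0..c4, derive
    // d[i] = c[i-1] ^ rol(c[i+1], 1) and xor d[i] into every lane of
    // column i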
 7108     __ eor3(tmp1, a4, a9, a14);
 7109     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7110     __ eor3(tmp2, a1, a6, a11);
 7111     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7112     __ rax1(tmp2, tmp0, tmp1); // d0
 7113     {
 7114 
 7115       Register tmp3, tmp4;
 7116       if (can_use_fp && can_use_r18) {
 7117         tmp3 = rfp;
 7118         tmp4 = r18_tls;
 7119       } else {
 7120         tmp3 = a4;
 7121         tmp4 = a9;
 7122         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7123       }
 7124 
 7125       __ eor3(tmp3, a0, a5, a10);
 7126       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7127       __ eor(a0, a0, tmp2);
 7128       __ eor(a5, a5, tmp2);
 7129       __ eor(a10, a10, tmp2);
 7130       __ eor(a15, a15, tmp2);
 7131       __ eor(a20, a20, tmp2); // d0(tmp2)
 7132       __ eor3(tmp3, a2, a7, a12);
 7133       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7134       __ rax1(tmp3, tmp4, tmp2); // d1
 7135       __ eor(a1, a1, tmp3);
 7136       __ eor(a6, a6, tmp3);
 7137       __ eor(a11, a11, tmp3);
 7138       __ eor(a16, a16, tmp3);
 7139       __ eor(a21, a21, tmp3); // d1(tmp3)
 7140       __ rax1(tmp3, tmp2, tmp0); // d3
 7141       __ eor3(tmp2, a3, a8, a13);
 7142       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7143       __ eor(a3, a3, tmp3);
 7144       __ eor(a8, a8, tmp3);
 7145       __ eor(a13, a13, tmp3);
 7146       __ eor(a18, a18, tmp3);
 7147       __ eor(a23, a23, tmp3);
 7148       __ rax1(tmp2, tmp1, tmp0); // d2
 7149       __ eor(a2, a2, tmp2);
 7150       __ eor(a7, a7, tmp2);
 7151       __ eor(a12, a12, tmp2);
 7152       __ rax1(tmp0, tmp0, tmp4); // d4
 7153       if (!can_use_fp || !can_use_r18) {
 7154         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7155       }
 7156       __ eor(a17, a17, tmp2);
 7157       __ eor(a22, a22, tmp2);
 7158       __ eor(a4, a4, tmp0);
 7159       __ eor(a9, a9, tmp0);
 7160       __ eor(a14, a14, tmp0);
 7161       __ eor(a19, a19, tmp0);
 7162       __ eor(a24, a24, tmp0);
 7163     }
 7164 
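    // rho and pi: rotate each lane by its fixed rho offset while moving
    // it to its permuted position; the permutation is a single 24-lane
    // cycle, so a10's rotated value is staged in tmp0 and lands in a7
    // at the end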
 7165     __ rol(tmp0, a10, 3);
 7166     __ rol(a10, a1, 1);
 7167     __ rol(a1, a6, 44);
 7168     __ rol(a6, a9, 20);
 7169     __ rol(a9, a22, 61);
 7170     __ rol(a22, a14, 39);
 7171     __ rol(a14, a20, 18);
 7172     __ rol(a20, a2, 62);
 7173     __ rol(a2, a12, 43);
 7174     __ rol(a12, a13, 25);
    __ rol(a13, a19, 8);
 7176     __ rol(a19, a23, 56);
 7177     __ rol(a23, a15, 41);
 7178     __ rol(a15, a4, 27);
 7179     __ rol(a4, a24, 14);
 7180     __ rol(a24, a21, 2);
 7181     __ rol(a21, a8, 55);
 7182     __ rol(a8, a16, 45);
 7183     __ rol(a16, a5, 36);
 7184     __ rol(a5, a3, 28);
 7185     __ rol(a3, a18, 21);
 7186     __ rol(a18, a17, 15);
 7187     __ rol(a17, a11, 10);
 7188     __ rol(a11, a7, 6);
 7189     __ mov(a7, tmp0);
 7190 
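    // chi, applied to each of the five rows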
 7191     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7192     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7193     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7194     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7195     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7196 
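    // iota: xor the next round constant (rc is post-incremented) into lane a0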
 7197     __ ldr(tmp1, __ post(rc, 8));
 7198     __ eor(a0, a0, tmp1);
 7199 
 7200   }
 7201 
 7202   // Arguments:
 7203   //
 7204   // Inputs:
 7205   //   c_rarg0   - byte[]  source+offset
 7206   //   c_rarg1   - byte[]  SHA.state
 7207   //   c_rarg2   - int     block_size
 7208   //   c_rarg3   - int     offset
 7209   //   c_rarg4   - int     limit
 7210   //
 7211   address generate_sha3_implCompress_gpr(StubId stub_id) {
 7212     bool multi_block;
 7213     switch (stub_id) {
 7214     case StubId::stubgen_sha3_implCompress_id:
 7215       multi_block = false;
 7216       break;
 7217     case StubId::stubgen_sha3_implCompressMB_id:
 7218       multi_block = true;
 7219       break;
 7220     default:
 7221       ShouldNotReachHere();
 7222     }
 7223 
 7224     static const uint64_t round_consts[24] = {
 7225       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 7226       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 7227       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 7228       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 7229       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 7230       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 7231       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 7232       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 7233     };
 7234 
 7235     __ align(CodeEntryAlignment);
 7236     StubCodeMark mark(this, stub_id);
 7237     address start = __ pc();
 7238 
 7239     Register buf           = c_rarg0;
 7240     Register state         = c_rarg1;
 7241     Register block_size    = c_rarg2;
 7242     Register ofs           = c_rarg3;
 7243     Register limit         = c_rarg4;
 7244 
    // use r3..r17, r19..r28 to keep a0..a24.
 7246     // a0..a24 are respective locals from SHA3.java
 7247     Register a0 = r25,
 7248              a1 = r26,
 7249              a2 = r27,
 7250              a3 = r3,
 7251              a4 = r4,
 7252              a5 = r5,
 7253              a6 = r6,
 7254              a7 = r7,
 7255              a8 = rscratch1, // r8
 7256              a9 = rscratch2, // r9
 7257              a10 = r10,
 7258              a11 = r11,
 7259              a12 = r12,
 7260              a13 = r13,
 7261              a14 = r14,
 7262              a15 = r15,
 7263              a16 = r16,
 7264              a17 = r17,
 7265              a18 = r28,
 7266              a19 = r19,
 7267              a20 = r20,
 7268              a21 = r21,
 7269              a22 = r22,
 7270              a23 = r23,
 7271              a24 = r24;
 7272 
 7273     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7274 
 7275     Label sha3_loop, rounds24_preloop, loop_body;
 7276     Label sha3_512_or_sha3_384, shake128;
 7277 
 7278     bool can_use_r18 = false;
 7279 #ifndef R18_RESERVED
 7280     can_use_r18 = true;
 7281 #endif
 7282     bool can_use_fp = !PreserveFramePointer;
 7283 
 7284     __ enter();
 7285 
 7286     // save almost all yet unsaved gpr registers on stack
 7287     __ str(block_size, __ pre(sp, -128));
 7288     if (multi_block) {
 7289       __ stpw(ofs, limit, Address(sp, 8));
 7290     }
 7291     // 8 bytes at sp+16 will be used to keep buf
 7292     __ stp(r19, r20, Address(sp, 32));
 7293     __ stp(r21, r22, Address(sp, 48));
 7294     __ stp(r23, r24, Address(sp, 64));
 7295     __ stp(r25, r26, Address(sp, 80));
 7296     __ stp(r27, r28, Address(sp, 96));
 7297     if (can_use_r18 && can_use_fp) {
 7298       __ stp(r18_tls, state, Address(sp, 112));
 7299     } else {
 7300       __ str(state, Address(sp, 112));
 7301     }
 7302 
    // begin sha3 calculations: loading a0..a24 from the state array
 7304     __ ldp(a0, a1, state);
 7305     __ ldp(a2, a3, Address(state, 16));
 7306     __ ldp(a4, a5, Address(state, 32));
 7307     __ ldp(a6, a7, Address(state, 48));
 7308     __ ldp(a8, a9, Address(state, 64));
 7309     __ ldp(a10, a11, Address(state, 80));
 7310     __ ldp(a12, a13, Address(state, 96));
 7311     __ ldp(a14, a15, Address(state, 112));
 7312     __ ldp(a16, a17, Address(state, 128));
 7313     __ ldp(a18, a19, Address(state, 144));
 7314     __ ldp(a20, a21, Address(state, 160));
 7315     __ ldp(a22, a23, Address(state, 176));
 7316     __ ldr(a24, Address(state, 192));
 7317 
 7318     __ BIND(sha3_loop);
 7319 
 7320     // load input
 7321     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7322     __ eor(a0, a0, tmp3);
 7323     __ eor(a1, a1, tmp2);
 7324     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7325     __ eor(a2, a2, tmp3);
 7326     __ eor(a3, a3, tmp2);
 7327     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7328     __ eor(a4, a4, tmp3);
 7329     __ eor(a5, a5, tmp2);
 7330     __ ldr(tmp3, __ post(buf, 8));
 7331     __ eor(a6, a6, tmp3);
 7332 
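    // The first 7 lanes (56 bytes) absorbed above are common to all
    // variants. How much more of the block to absorb depends on the rate:
    // 72 (SHA3-512), 104 (SHA3-384), 136 (SHA3-256/SHAKE256),
    // 144 (SHA3-224) or 168 (SHAKE128) bytes; bit 7 of block_size is
    // clear only for the first two.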
 7333     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7334     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7335 
 7336     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7337     __ eor(a7, a7, tmp3);
 7338     __ eor(a8, a8, tmp2);
 7339     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7340     __ eor(a9, a9, tmp3);
 7341     __ eor(a10, a10, tmp2);
 7342     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7343     __ eor(a11, a11, tmp3);
 7344     __ eor(a12, a12, tmp2);
 7345     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7346     __ eor(a13, a13, tmp3);
 7347     __ eor(a14, a14, tmp2);
 7348     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7349     __ eor(a15, a15, tmp3);
 7350     __ eor(a16, a16, tmp2);
 7351 
 7352     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7353     __ andw(tmp2, block_size, 48);
 7354     __ cbzw(tmp2, rounds24_preloop);
 7355     __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
 7357     __ ldr(tmp3, __ post(buf, 8));
 7358     __ eor(a17, a17, tmp3);
 7359     __ b(rounds24_preloop);
 7360 
 7361     __ BIND(shake128);
 7362     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7363     __ eor(a17, a17, tmp3);
 7364     __ eor(a18, a18, tmp2);
 7365     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7366     __ eor(a19, a19, tmp3);
 7367     __ eor(a20, a20, tmp2);
 7368     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7369 
 7370     __ BIND(sha3_512_or_sha3_384);
 7371     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7372     __ eor(a7, a7, tmp3);
 7373     __ eor(a8, a8, tmp2);
 7374     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7375 
 7376     // SHA3-384
 7377     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7378     __ eor(a9, a9, tmp3);
 7379     __ eor(a10, a10, tmp2);
 7380     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7381     __ eor(a11, a11, tmp3);
 7382     __ eor(a12, a12, tmp2);
 7383 
 7384     __ BIND(rounds24_preloop);
 7385     __ fmovs(v0, 24.0); // float loop counter,
 7386     __ fmovs(v1, 1.0);  // exact representation
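    // n.b. essentially all general registers are taken up by the state
    // and temporaries, so the round counter lives in a float register;
    // 24.0 down to 0.0 in steps of 1.0 are exactly representable, which
    // keeps the fsubs/fcmps loop control exact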
 7387 
 7388     __ str(buf, Address(sp, 16));
 7389     __ lea(tmp3, ExternalAddress((address) round_consts));
 7390 
 7391     __ BIND(loop_body);
 7392     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7393                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7394                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7395                      tmp0, tmp1, tmp2);
 7396     __ fsubs(v0, v0, v1);
 7397     __ fcmps(v0, 0.0);
 7398     __ br(__ NE, loop_body);
 7399 
 7400     if (multi_block) {
 7401       __ ldrw(block_size, sp); // block_size
 7402       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7403       __ addw(tmp2, tmp2, block_size);
 7404       __ cmpw(tmp2, tmp1);
 7405       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7406       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7407       __ br(Assembler::LE, sha3_loop);
 7408       __ movw(c_rarg0, tmp2); // return offset
 7409     }
 7410     if (can_use_fp && can_use_r18) {
 7411       __ ldp(r18_tls, state, Address(sp, 112));
 7412     } else {
 7413       __ ldr(state, Address(sp, 112));
 7414     }
 7415     // save calculated sha3 state
 7416     __ stp(a0, a1, Address(state));
 7417     __ stp(a2, a3, Address(state, 16));
 7418     __ stp(a4, a5, Address(state, 32));
 7419     __ stp(a6, a7, Address(state, 48));
 7420     __ stp(a8, a9, Address(state, 64));
 7421     __ stp(a10, a11, Address(state, 80));
 7422     __ stp(a12, a13, Address(state, 96));
 7423     __ stp(a14, a15, Address(state, 112));
 7424     __ stp(a16, a17, Address(state, 128));
 7425     __ stp(a18, a19, Address(state, 144));
 7426     __ stp(a20, a21, Address(state, 160));
 7427     __ stp(a22, a23, Address(state, 176));
 7428     __ str(a24, Address(state, 192));
 7429 
 7430     // restore required registers from stack
 7431     __ ldp(r19, r20, Address(sp, 32));
 7432     __ ldp(r21, r22, Address(sp, 48));
 7433     __ ldp(r23, r24, Address(sp, 64));
 7434     __ ldp(r25, r26, Address(sp, 80));
 7435     __ ldp(r27, r28, Address(sp, 96));
 7436     if (can_use_fp && can_use_r18) {
 7437       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 7438     } // else no need to recalculate rfp, since it wasn't changed
 7439 
 7440     __ leave();
 7441 
 7442     __ ret(lr);
 7443 
 7444     return start;
 7445   }
 7446 
 7447   /**
 7448    *  Arguments:
 7449    *
 7450    * Inputs:
 7451    *   c_rarg0   - int crc
 7452    *   c_rarg1   - byte* buf
 7453    *   c_rarg2   - int length
 7454    *
 7455    * Output:
   *       r0   - int crc result
 7457    */
 7458   address generate_updateBytesCRC32() {
 7459     assert(UseCRC32Intrinsics, "what are we doing here?");
 7460 
 7461     __ align(CodeEntryAlignment);
 7462     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 7463     StubCodeMark mark(this, stub_id);
 7464 
 7465     address start = __ pc();
 7466 
 7467     const Register crc   = c_rarg0;  // crc
 7468     const Register buf   = c_rarg1;  // source java byte array address
 7469     const Register len   = c_rarg2;  // length
 7470     const Register table0 = c_rarg3; // crc_table address
 7471     const Register table1 = c_rarg4;
 7472     const Register table2 = c_rarg5;
 7473     const Register table3 = c_rarg6;
 7474     const Register tmp3 = c_rarg7;
 7475 
 7476     BLOCK_COMMENT("Entry:");
 7477     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7478 
 7479     __ kernel_crc32(crc, buf, len,
 7480               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7481 
 7482     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7483     __ ret(lr);
 7484 
 7485     return start;
 7486   }
 7487 
 7488   /**
 7489    *  Arguments:
 7490    *
 7491    * Inputs:
 7492    *   c_rarg0   - int crc
 7493    *   c_rarg1   - byte* buf
 7494    *   c_rarg2   - int length
 7495    *   c_rarg3   - int* table
 7496    *
 7497    * Output:
 7498    *       r0   - int crc result
 7499    */
 7500   address generate_updateBytesCRC32C() {
 7501     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7502 
 7503     __ align(CodeEntryAlignment);
 7504     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 7505     StubCodeMark mark(this, stub_id);
 7506 
 7507     address start = __ pc();
 7508 
 7509     const Register crc   = c_rarg0;  // crc
 7510     const Register buf   = c_rarg1;  // source java byte array address
 7511     const Register len   = c_rarg2;  // length
 7512     const Register table0 = c_rarg3; // crc_table address
 7513     const Register table1 = c_rarg4;
 7514     const Register table2 = c_rarg5;
 7515     const Register table3 = c_rarg6;
 7516     const Register tmp3 = c_rarg7;
 7517 
 7518     BLOCK_COMMENT("Entry:");
 7519     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7520 
 7521     __ kernel_crc32c(crc, buf, len,
 7522               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7523 
 7524     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7525     __ ret(lr);
 7526 
 7527     return start;
 7528   }
 7529 
  /**
 7531    *  Arguments:
 7532    *
 7533    *  Inputs:
 7534    *   c_rarg0   - int   adler
 7535    *   c_rarg1   - byte* buff
 7536    *   c_rarg2   - int   len
 7537    *
 7538    * Output:
 7539    *   c_rarg0   - int adler result
 7540    */
 7541   address generate_updateBytesAdler32() {
 7542     __ align(CodeEntryAlignment);
 7543     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 7544     StubCodeMark mark(this, stub_id);
 7545     address start = __ pc();
 7546 
 7547     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7548 
 7549     // Aliases
 7550     Register adler  = c_rarg0;
 7551     Register s1     = c_rarg0;
 7552     Register s2     = c_rarg3;
 7553     Register buff   = c_rarg1;
 7554     Register len    = c_rarg2;
 7555     Register nmax  = r4;
 7556     Register base  = r5;
 7557     Register count = r6;
 7558     Register temp0 = rscratch1;
 7559     Register temp1 = rscratch2;
 7560     FloatRegister vbytes = v0;
 7561     FloatRegister vs1acc = v1;
 7562     FloatRegister vs2acc = v2;
 7563     FloatRegister vtable = v3;
 7564 
 7565     // Max number of bytes we can process before having to take the mod
 7566     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7567     uint64_t BASE = 0xfff1;
 7568     uint64_t NMAX = 0x15B0;
 7569 
 7570     __ mov(base, BASE);
 7571     __ mov(nmax, NMAX);
 7572 
 7573     // Load accumulation coefficients for the upper 16 bits
 7574     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7575     __ ld1(vtable, __ T16B, Address(temp0));
 7576 
 7577     // s1 is initialized to the lower 16 bits of adler
 7578     // s2 is initialized to the upper 16 bits of adler
 7579     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7580     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7581 
 7582     // The pipelined loop needs at least 16 elements for 1 iteration
 7583     // It does check this, but it is more effective to skip to the cleanup loop
 7584     __ cmp(len, (u1)16);
 7585     __ br(Assembler::HS, L_nmax);
 7586     __ cbz(len, L_combine);
 7587 
 7588     __ bind(L_simple_by1_loop);
 7589     __ ldrb(temp0, Address(__ post(buff, 1)));
 7590     __ add(s1, s1, temp0);
 7591     __ add(s2, s2, s1);
 7592     __ subs(len, len, 1);
 7593     __ br(Assembler::HI, L_simple_by1_loop);
 7594 
 7595     // s1 = s1 % BASE
 7596     __ subs(temp0, s1, base);
 7597     __ csel(s1, temp0, s1, Assembler::HS);
 7598 
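    // Reductions mod BASE below use the identity 2^16 == 15 (mod BASE):
    // x is congruent to (x >> 16) * 15 + (x & 0xffff), so one or two
    // folding steps followed by a conditional subtract suffice.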
 7599     // s2 = s2 % BASE
 7600     __ lsr(temp0, s2, 16);
 7601     __ lsl(temp1, temp0, 4);
 7602     __ sub(temp1, temp1, temp0);
 7603     __ add(s2, temp1, s2, ext::uxth);
 7604 
 7605     __ subs(temp0, s2, base);
 7606     __ csel(s2, temp0, s2, Assembler::HS);
 7607 
 7608     __ b(L_combine);
 7609 
 7610     __ bind(L_nmax);
 7611     __ subs(len, len, nmax);
 7612     __ sub(count, nmax, 16);
 7613     __ br(Assembler::LO, L_by16);
 7614 
 7615     __ bind(L_nmax_loop);
 7616 
 7617     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7618                                       vbytes, vs1acc, vs2acc, vtable);
 7619 
 7620     __ subs(count, count, 16);
 7621     __ br(Assembler::HS, L_nmax_loop);
 7622 
 7623     // s1 = s1 % BASE
 7624     __ lsr(temp0, s1, 16);
 7625     __ lsl(temp1, temp0, 4);
 7626     __ sub(temp1, temp1, temp0);
 7627     __ add(temp1, temp1, s1, ext::uxth);
 7628 
 7629     __ lsr(temp0, temp1, 16);
 7630     __ lsl(s1, temp0, 4);
 7631     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 7633 
 7634     __ subs(temp0, s1, base);
 7635     __ csel(s1, temp0, s1, Assembler::HS);
 7636 
 7637     // s2 = s2 % BASE
 7638     __ lsr(temp0, s2, 16);
 7639     __ lsl(temp1, temp0, 4);
 7640     __ sub(temp1, temp1, temp0);
 7641     __ add(temp1, temp1, s2, ext::uxth);
 7642 
 7643     __ lsr(temp0, temp1, 16);
 7644     __ lsl(s2, temp0, 4);
 7645     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 7647 
 7648     __ subs(temp0, s2, base);
 7649     __ csel(s2, temp0, s2, Assembler::HS);
 7650 
 7651     __ subs(len, len, nmax);
 7652     __ sub(count, nmax, 16);
 7653     __ br(Assembler::HS, L_nmax_loop);
 7654 
 7655     __ bind(L_by16);
 7656     __ adds(len, len, count);
 7657     __ br(Assembler::LO, L_by1);
 7658 
 7659     __ bind(L_by16_loop);
 7660 
 7661     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7662                                       vbytes, vs1acc, vs2acc, vtable);
 7663 
 7664     __ subs(len, len, 16);
 7665     __ br(Assembler::HS, L_by16_loop);
 7666 
 7667     __ bind(L_by1);
 7668     __ adds(len, len, 15);
 7669     __ br(Assembler::LO, L_do_mod);
 7670 
 7671     __ bind(L_by1_loop);
 7672     __ ldrb(temp0, Address(__ post(buff, 1)));
 7673     __ add(s1, temp0, s1);
 7674     __ add(s2, s2, s1);
 7675     __ subs(len, len, 1);
 7676     __ br(Assembler::HS, L_by1_loop);
 7677 
 7678     __ bind(L_do_mod);
 7679     // s1 = s1 % BASE
 7680     __ lsr(temp0, s1, 16);
 7681     __ lsl(temp1, temp0, 4);
 7682     __ sub(temp1, temp1, temp0);
 7683     __ add(temp1, temp1, s1, ext::uxth);
 7684 
 7685     __ lsr(temp0, temp1, 16);
 7686     __ lsl(s1, temp0, 4);
 7687     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 7689 
 7690     __ subs(temp0, s1, base);
 7691     __ csel(s1, temp0, s1, Assembler::HS);
 7692 
 7693     // s2 = s2 % BASE
 7694     __ lsr(temp0, s2, 16);
 7695     __ lsl(temp1, temp0, 4);
 7696     __ sub(temp1, temp1, temp0);
 7697     __ add(temp1, temp1, s2, ext::uxth);
 7698 
 7699     __ lsr(temp0, temp1, 16);
 7700     __ lsl(s2, temp0, 4);
 7701     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 7703 
 7704     __ subs(temp0, s2, base);
 7705     __ csel(s2, temp0, s2, Assembler::HS);
 7706 
 7707     // Combine lower bits and higher bits
 7708     __ bind(L_combine);
 7709     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7710 
 7711     __ ret(lr);
 7712 
 7713     return start;
 7714   }
 7715 
 7716   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7717           Register temp0, Register temp1, FloatRegister vbytes,
 7718           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7719     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7720     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7721     // In non-vectorized code, we update s1 and s2 as:
 7722     //   s1 <- s1 + b1
 7723     //   s2 <- s2 + s1
 7724     //   s1 <- s1 + b2
 7725     //   s2 <- s2 + b1
 7726     //   ...
 7727     //   s1 <- s1 + b16
 7728     //   s2 <- s2 + s1
 7729     // Putting above assignments together, we have:
 7730     //   s1_new = s1 + b1 + b2 + ... + b16
 7731     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7732     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7733     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
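    // n.b. vtable (loaded from _adler_table above) supplies the per-byte
    // weights 16, 15, ..., 1 for the dot product computed below.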
 7734     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7735 
 7736     // s2 = s2 + s1 * 16
 7737     __ add(s2, s2, s1, Assembler::LSL, 4);
 7738 
 7739     // vs1acc = b1 + b2 + b3 + ... + b16
 7740     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7741     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7742     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7743     __ uaddlv(vs1acc, __ T16B, vbytes);
 7744     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7745 
 7746     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7747     __ fmovd(temp0, vs1acc);
 7748     __ fmovd(temp1, vs2acc);
 7749     __ add(s1, s1, temp0);
 7750     __ add(s2, s2, temp1);
 7751   }
 7752 
 7753   /**
 7754    *  Arguments:
 7755    *
 7756    *  Input:
 7757    *    c_rarg0   - x address
 7758    *    c_rarg1   - x length
 7759    *    c_rarg2   - y address
 7760    *    c_rarg3   - y length
 7761    *    c_rarg4   - z address
 7762    */
 7763   address generate_multiplyToLen() {
 7764     __ align(CodeEntryAlignment);
 7765     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 7766     StubCodeMark mark(this, stub_id);
 7767 
 7768     address start = __ pc();
 7769     const Register x     = r0;
 7770     const Register xlen  = r1;
 7771     const Register y     = r2;
 7772     const Register ylen  = r3;
 7773     const Register z     = r4;
 7774 
 7775     const Register tmp0  = r5;
 7776     const Register tmp1  = r10;
 7777     const Register tmp2  = r11;
 7778     const Register tmp3  = r12;
 7779     const Register tmp4  = r13;
 7780     const Register tmp5  = r14;
 7781     const Register tmp6  = r15;
 7782     const Register tmp7  = r16;
 7783 
 7784     BLOCK_COMMENT("Entry:");
 7785     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7786     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7787     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7788     __ ret(lr);
 7789 
 7790     return start;
 7791   }
 7792 
 7793   address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127 described in the Java code
    // runs faster than multiply_to_len on some CPUs and slower on others,
    // but multiply_to_len shows slightly better overall results.
 7797     __ align(CodeEntryAlignment);
 7798     StubId stub_id = StubId::stubgen_squareToLen_id;
 7799     StubCodeMark mark(this, stub_id);
 7800     address start = __ pc();
 7801 
 7802     const Register x     = r0;
 7803     const Register xlen  = r1;
 7804     const Register z     = r2;
 7805     const Register y     = r4; // == x
 7806     const Register ylen  = r5; // == xlen
 7807 
 7808     const Register tmp0  = r3;
 7809     const Register tmp1  = r10;
 7810     const Register tmp2  = r11;
 7811     const Register tmp3  = r12;
 7812     const Register tmp4  = r13;
 7813     const Register tmp5  = r14;
 7814     const Register tmp6  = r15;
 7815     const Register tmp7  = r16;
 7816 
 7817     RegSet spilled_regs = RegSet::of(y, ylen);
 7818     BLOCK_COMMENT("Entry:");
 7819     __ enter();
 7820     __ push(spilled_regs, sp);
 7821     __ mov(y, x);
 7822     __ mov(ylen, xlen);
 7823     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7824     __ pop(spilled_regs, sp);
 7825     __ leave();
 7826     __ ret(lr);
 7827     return start;
 7828   }
 7829 
 7830   address generate_mulAdd() {
 7831     __ align(CodeEntryAlignment);
 7832     StubId stub_id = StubId::stubgen_mulAdd_id;
 7833     StubCodeMark mark(this, stub_id);
 7834 
 7835     address start = __ pc();
 7836 
 7837     const Register out     = r0;
 7838     const Register in      = r1;
 7839     const Register offset  = r2;
 7840     const Register len     = r3;
 7841     const Register k       = r4;
 7842 
 7843     BLOCK_COMMENT("Entry:");
 7844     __ enter();
 7845     __ mul_add(out, in, offset, len, k);
 7846     __ leave();
 7847     __ ret(lr);
 7848 
 7849     return start;
 7850   }
 7851 
 7852   // Arguments:
 7853   //
 7854   // Input:
 7855   //   c_rarg0   - newArr address
 7856   //   c_rarg1   - oldArr address
 7857   //   c_rarg2   - newIdx
 7858   //   c_rarg3   - shiftCount
 7859   //   c_rarg4   - numIter
 7860   //
 7861   address generate_bigIntegerRightShift() {
 7862     __ align(CodeEntryAlignment);
 7863     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 7864     StubCodeMark mark(this, stub_id);
 7865     address start = __ pc();
 7866 
 7867     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7868 
 7869     Register newArr        = c_rarg0;
 7870     Register oldArr        = c_rarg1;
 7871     Register newIdx        = c_rarg2;
 7872     Register shiftCount    = c_rarg3;
 7873     Register numIter       = c_rarg4;
 7874     Register idx           = numIter;
 7875 
 7876     Register newArrCur     = rscratch1;
 7877     Register shiftRevCount = rscratch2;
 7878     Register oldArrCur     = r13;
 7879     Register oldArrNext    = r14;
 7880 
 7881     FloatRegister oldElem0        = v0;
 7882     FloatRegister oldElem1        = v1;
 7883     FloatRegister newElem         = v2;
 7884     FloatRegister shiftVCount     = v3;
 7885     FloatRegister shiftVRevCount  = v4;
 7886 
 7887     __ cbz(idx, Exit);
 7888 
 7889     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7890 
 7891     // left shift count
 7892     __ movw(shiftRevCount, 32);
 7893     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7894 
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
 7896     __ cmp(numIter, (u1)4);
 7897     __ br(Assembler::LT, ShiftThree);
 7898 
 7899     __ dup(shiftVCount,    __ T4S, shiftCount);
 7900     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7901     __ negr(shiftVCount,   __ T4S, shiftVCount);
 7902 
 7903     __ BIND(ShiftSIMDLoop);
 7904 
 7905     // Calculate the load addresses
 7906     __ sub(idx, idx, 4);
 7907     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7908     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7909     __ add(oldArrCur,  oldArrNext, 4);
 7910 
 7911     // Load 4 words and process
 7912     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7913     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7914     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7915     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7916     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7917     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7918 
 7919     __ cmp(idx, (u1)4);
 7920     __ br(Assembler::LT, ShiftTwoLoop);
 7921     __ b(ShiftSIMDLoop);
 7922 
 7923     __ BIND(ShiftTwoLoop);
 7924     __ cbz(idx, Exit);
 7925     __ cmp(idx, (u1)1);
 7926     __ br(Assembler::EQ, ShiftOne);
 7927 
 7928     // Calculate the load addresses
 7929     __ sub(idx, idx, 2);
 7930     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7931     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7932     __ add(oldArrCur,  oldArrNext, 4);
 7933 
 7934     // Load 2 words and process
 7935     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 7936     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 7937     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 7938     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 7939     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 7940     __ st1(newElem,   __ T2S, Address(newArrCur));
 7941     __ b(ShiftTwoLoop);
 7942 
 7943     __ BIND(ShiftThree);
 7944     __ tbz(idx, 1, ShiftOne);
 7945     __ tbz(idx, 0, ShiftTwo);
 7946     __ ldrw(r10,  Address(oldArr, 12));
 7947     __ ldrw(r11,  Address(oldArr, 8));
 7948     __ lsrvw(r10, r10, shiftCount);
 7949     __ lslvw(r11, r11, shiftRevCount);
 7950     __ orrw(r12,  r10, r11);
 7951     __ strw(r12,  Address(newArr, 8));
 7952 
 7953     __ BIND(ShiftTwo);
 7954     __ ldrw(r10,  Address(oldArr, 8));
 7955     __ ldrw(r11,  Address(oldArr, 4));
 7956     __ lsrvw(r10, r10, shiftCount);
 7957     __ lslvw(r11, r11, shiftRevCount);
 7958     __ orrw(r12,  r10, r11);
 7959     __ strw(r12,  Address(newArr, 4));
 7960 
 7961     __ BIND(ShiftOne);
 7962     __ ldrw(r10,  Address(oldArr, 4));
 7963     __ ldrw(r11,  Address(oldArr));
 7964     __ lsrvw(r10, r10, shiftCount);
 7965     __ lslvw(r11, r11, shiftRevCount);
 7966     __ orrw(r12,  r10, r11);
 7967     __ strw(r12,  Address(newArr));
 7968 
 7969     __ BIND(Exit);
 7970     __ ret(lr);
 7971 
 7972     return start;
 7973   }
 7974 
 7975   // Arguments:
 7976   //
 7977   // Input:
 7978   //   c_rarg0   - newArr address
 7979   //   c_rarg1   - oldArr address
 7980   //   c_rarg2   - newIdx
 7981   //   c_rarg3   - shiftCount
 7982   //   c_rarg4   - numIter
 7983   //
 7984   address generate_bigIntegerLeftShift() {
 7985     __ align(CodeEntryAlignment);
 7986     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 7987     StubCodeMark mark(this, stub_id);
 7988     address start = __ pc();
 7989 
 7990     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7991 
 7992     Register newArr        = c_rarg0;
 7993     Register oldArr        = c_rarg1;
 7994     Register newIdx        = c_rarg2;
 7995     Register shiftCount    = c_rarg3;
 7996     Register numIter       = c_rarg4;
 7997 
 7998     Register shiftRevCount = rscratch1;
 7999     Register oldArrNext    = rscratch2;
 8000 
 8001     FloatRegister oldElem0        = v0;
 8002     FloatRegister oldElem1        = v1;
 8003     FloatRegister newElem         = v2;
 8004     FloatRegister shiftVCount     = v3;
 8005     FloatRegister shiftVRevCount  = v4;
 8006 
 8007     __ cbz(numIter, Exit);
 8008 
 8009     __ add(oldArrNext, oldArr, 4);
 8010     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8011 
 8012     // right shift count
 8013     __ movw(shiftRevCount, 32);
 8014     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8015 
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
 8017     __ cmp(numIter, (u1)4);
 8018     __ br(Assembler::LT, ShiftThree);
 8019 
 8020     __ dup(shiftVCount,     __ T4S, shiftCount);
 8021     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8022     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8023 
 8024     __ BIND(ShiftSIMDLoop);
 8025 
 8026     // load 4 words and process
 8027     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8028     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8029     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8030     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8031     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8032     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8033     __ sub(numIter,   numIter, 4);
 8034 
 8035     __ cmp(numIter, (u1)4);
 8036     __ br(Assembler::LT, ShiftTwoLoop);
 8037     __ b(ShiftSIMDLoop);
 8038 
 8039     __ BIND(ShiftTwoLoop);
 8040     __ cbz(numIter, Exit);
 8041     __ cmp(numIter, (u1)1);
 8042     __ br(Assembler::EQ, ShiftOne);
 8043 
 8044     // load 2 words and process
 8045     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8046     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8047     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8048     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8049     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8050     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8051     __ sub(numIter,   numIter, 2);
 8052     __ b(ShiftTwoLoop);
 8053 
 8054     __ BIND(ShiftThree);
 8055     __ ldrw(r10,  __ post(oldArr, 4));
 8056     __ ldrw(r11,  __ post(oldArrNext, 4));
 8057     __ lslvw(r10, r10, shiftCount);
 8058     __ lsrvw(r11, r11, shiftRevCount);
 8059     __ orrw(r12,  r10, r11);
 8060     __ strw(r12,  __ post(newArr, 4));
 8061     __ tbz(numIter, 1, Exit);
 8062     __ tbz(numIter, 0, ShiftOne);
 8063 
 8064     __ BIND(ShiftTwo);
 8065     __ ldrw(r10,  __ post(oldArr, 4));
 8066     __ ldrw(r11,  __ post(oldArrNext, 4));
 8067     __ lslvw(r10, r10, shiftCount);
 8068     __ lsrvw(r11, r11, shiftRevCount);
 8069     __ orrw(r12,  r10, r11);
 8070     __ strw(r12,  __ post(newArr, 4));
 8071 
 8072     __ BIND(ShiftOne);
 8073     __ ldrw(r10,  Address(oldArr));
 8074     __ ldrw(r11,  Address(oldArrNext));
 8075     __ lslvw(r10, r10, shiftCount);
 8076     __ lsrvw(r11, r11, shiftRevCount);
 8077     __ orrw(r12,  r10, r11);
 8078     __ strw(r12,  Address(newArr));
 8079 
 8080     __ BIND(Exit);
 8081     __ ret(lr);
 8082 
 8083     return start;
 8084   }
 8085 
 8086   address generate_count_positives(address &count_positives_long) {
 8087     const u1 large_loop_size = 64;
 8088     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
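    // A byte is negative iff its top bit is set, so testing a 64-bit load
    // against this mask detects any negative byte among the eight loaded.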
 8089     int dcache_line = VM_Version::dcache_line_size();
 8090 
 8091     Register ary1 = r1, len = r2, result = r0;
 8092 
 8093     __ align(CodeEntryAlignment);
 8094 
 8095     StubId stub_id = StubId::stubgen_count_positives_id;
 8096     StubCodeMark mark(this, stub_id);
 8097 
 8098     address entry = __ pc();
 8099 
 8100     __ enter();
 8101     // precondition: a copy of len is already in result
 8102     // __ mov(result, len);
 8103 
 8104   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8105         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8106 
 8107   __ cmp(len, (u1)15);
 8108   __ br(Assembler::GT, LEN_OVER_15);
  // The only case when execution falls into this code is when the pointer is
  // near the end of a memory page and we have to avoid reading the next page
 8111   __ add(ary1, ary1, len);
 8112   __ subs(len, len, 8);
 8113   __ br(Assembler::GT, LEN_OVER_8);
 8114   __ ldr(rscratch2, Address(ary1, -8));
 8115   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8116   __ lsrv(rscratch2, rscratch2, rscratch1);
 8117   __ tst(rscratch2, UPPER_BIT_MASK);
 8118   __ csel(result, zr, result, Assembler::NE);
 8119   __ leave();
 8120   __ ret(lr);
 8121   __ bind(LEN_OVER_8);
 8122   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 8123   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
 8124   __ tst(rscratch2, UPPER_BIT_MASK);
 8125   __ br(Assembler::NE, RET_NO_POP);
 8126   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8127   __ lsrv(rscratch1, rscratch1, rscratch2);
 8128   __ tst(rscratch1, UPPER_BIT_MASK);
 8129   __ bind(RET_NO_POP);
 8130   __ csel(result, zr, result, Assembler::NE);
 8131   __ leave();
 8132   __ ret(lr);
 8133 
 8134   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8135   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8136 
 8137   count_positives_long = __ pc(); // 2nd entry point
 8138 
 8139   __ enter();
 8140 
 8141   __ bind(LEN_OVER_15);
 8142     __ push(spilled_regs, sp);
 8143     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8144     __ cbz(rscratch2, ALIGNED);
 8145     __ ldp(tmp6, tmp1, Address(ary1));
 8146     __ mov(tmp5, 16);
 8147     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 8148     __ add(ary1, ary1, rscratch1);
 8149     __ orr(tmp6, tmp6, tmp1);
 8150     __ tst(tmp6, UPPER_BIT_MASK);
 8151     __ br(Assembler::NE, RET_ADJUST);
 8152     __ sub(len, len, rscratch1);
 8153 
 8154   __ bind(ALIGNED);
 8155     __ cmp(len, large_loop_size);
 8156     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop to handle the
    // situation when an initially aligned large array has negative values at
    // its starting bytes, in which case LARGE_LOOP would do 4 reads instead of
    // 1 (in the worst case), which is slower. Cases with negative bytes further
    // ahead won't be affected that much. In fact, they'll be faster due to the
    // early loads, fewer instructions and fewer branches in LARGE_LOOP.
 8163     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8164     __ sub(len, len, 16);
 8165     __ orr(tmp6, tmp6, tmp1);
 8166     __ tst(tmp6, UPPER_BIT_MASK);
 8167     __ br(Assembler::NE, RET_ADJUST_16);
 8168     __ cmp(len, large_loop_size);
 8169     __ br(Assembler::LT, CHECK_16);
 8170 
 8171     if (SoftwarePrefetchHintDistance >= 0
 8172         && SoftwarePrefetchHintDistance >= dcache_line) {
 8173       // initial prefetch
 8174       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8175     }
 8176   __ bind(LARGE_LOOP);
 8177     if (SoftwarePrefetchHintDistance >= 0) {
 8178       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8179     }
 8180     // Issue the load instructions first, since that can save a few CPU/memory
 8181     // cycles. Also, instead of four "orr; test; branch" triples (one per ldp),
 8182     // generate 7 * orr(...) plus a single test-and-branch, which needs fewer
 8183     // instructions and branches per iteration. The trade-off is that this
 8184     // disables the early exit, so all 64 bytes are loaded and checked every time.
 8185     __ ldp(tmp2, tmp3, Address(ary1));
 8186     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8187     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8188     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8189     __ add(ary1, ary1, large_loop_size);
 8190     __ sub(len, len, large_loop_size);
 8191     __ orr(tmp2, tmp2, tmp3);
 8192     __ orr(tmp4, tmp4, tmp5);
 8193     __ orr(rscratch1, rscratch1, rscratch2);
 8194     __ orr(tmp6, tmp6, tmp1);
 8195     __ orr(tmp2, tmp2, tmp4);
 8196     __ orr(rscratch1, rscratch1, tmp6);
 8197     __ orr(tmp2, tmp2, rscratch1);
 8198     __ tst(tmp2, UPPER_BIT_MASK);
 8199     __ br(Assembler::NE, RET_ADJUST_LONG);
 8200     __ cmp(len, large_loop_size);
 8201     __ br(Assembler::GE, LARGE_LOOP);
 8202 
 8203   __ bind(CHECK_16); // small 16-byte load pre-loop
 8204     __ cmp(len, (u1)16);
 8205     __ br(Assembler::LT, POST_LOOP16);
 8206 
 8207   __ bind(LOOP16); // small 16-byte load loop
 8208     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8209     __ sub(len, len, 16);
 8210     __ orr(tmp2, tmp2, tmp3);
 8211     __ tst(tmp2, UPPER_BIT_MASK);
 8212     __ br(Assembler::NE, RET_ADJUST_16);
 8213     __ cmp(len, (u1)16);
 8214     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8215 
 8216   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8217     __ cmp(len, (u1)8);
 8218     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8219     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8220     __ tst(tmp3, UPPER_BIT_MASK);
 8221     __ br(Assembler::NE, RET_ADJUST);
 8222     __ sub(len, len, 8);
 8223 
 8224   __ bind(POST_LOOP16_LOAD_TAIL);
 8225     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8226     __ ldr(tmp1, Address(ary1));
 8227     __ mov(tmp2, 64);
 8228     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8229     __ lslv(tmp1, tmp1, tmp4);
 8230     __ tst(tmp1, UPPER_BIT_MASK);
 8231     __ br(Assembler::NE, RET_ADJUST);
 8232     // Fallthrough
 8233 
 8234   __ bind(RET_LEN);
 8235     __ pop(spilled_regs, sp);
 8236     __ leave();
 8237     __ ret(lr);
 8238 
 8239     // The difference result - len is the count of bytes guaranteed to be
 8240     // positive (i.e. already verified to have the sign bit clear).
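          // For example (illustrative): if a negative byte occurs in a 100-byte array,
          // the stub returns some count c no larger than that byte's index, with bytes
          // [0, c) verified non-negative; if no byte is negative it returns the full length.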
 8241 
 8242   __ bind(RET_ADJUST_LONG);
 8243     __ add(len, len, (u1)(large_loop_size - 16));
 8244   __ bind(RET_ADJUST_16);
 8245     __ add(len, len, 16);
 8246   __ bind(RET_ADJUST);
 8247     __ pop(spilled_regs, sp);
 8248     __ leave();
 8249     __ sub(result, result, len);
 8250     __ ret(lr);
 8251 
 8252     return entry;
 8253   }
 8254 
 8255   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8256         bool usePrefetch, Label &NOT_EQUAL) {
 8257     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8258         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8259         tmp7 = r12, tmp8 = r13;
 8260     Label LOOP;
 8261 
 8262     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8263     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8264     __ bind(LOOP);
 8265     if (usePrefetch) {
 8266       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8267       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8268     }
 8269     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8270     __ eor(tmp1, tmp1, tmp2);
 8271     __ eor(tmp3, tmp3, tmp4);
 8272     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8273     __ orr(tmp1, tmp1, tmp3);
 8274     __ cbnz(tmp1, NOT_EQUAL);
 8275     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8276     __ eor(tmp5, tmp5, tmp6);
 8277     __ eor(tmp7, tmp7, tmp8);
 8278     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8279     __ orr(tmp5, tmp5, tmp7);
 8280     __ cbnz(tmp5, NOT_EQUAL);
 8281     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8282     __ eor(tmp1, tmp1, tmp2);
 8283     __ eor(tmp3, tmp3, tmp4);
 8284     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8285     __ orr(tmp1, tmp1, tmp3);
 8286     __ cbnz(tmp1, NOT_EQUAL);
 8287     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8288     __ eor(tmp5, tmp5, tmp6);
 8289     __ sub(cnt1, cnt1, 8 * wordSize);
 8290     __ eor(tmp7, tmp7, tmp8);
 8291     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8292     // tmp6 is not used. MacroAssembler::subs is used here (rather than
 8293     // cmp) because subs accepts an unlimited range of immediate operands.
 8294     __ subs(tmp6, cnt1, loopThreshold);
 8295     __ orr(tmp5, tmp5, tmp7);
 8296     __ cbnz(tmp5, NOT_EQUAL);
 8297     __ br(__ GE, LOOP);
 8298     // post-loop
 8299     __ eor(tmp1, tmp1, tmp2);
 8300     __ eor(tmp3, tmp3, tmp4);
 8301     __ orr(tmp1, tmp1, tmp3);
 8302     __ sub(cnt1, cnt1, 2 * wordSize);
 8303     __ cbnz(tmp1, NOT_EQUAL);
 8304   }
 8305 
 8306   void generate_large_array_equals_loop_simd(int loopThreshold,
 8307         bool usePrefetch, Label &NOT_EQUAL) {
 8308     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8309         tmp2 = rscratch2;
 8310     Label LOOP;
 8311 
 8312     __ bind(LOOP);
 8313     if (usePrefetch) {
 8314       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8315       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8316     }
 8317     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8318     __ sub(cnt1, cnt1, 8 * wordSize);
 8319     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 8320     __ subs(tmp1, cnt1, loopThreshold);
 8321     __ eor(v0, __ T16B, v0, v4);
 8322     __ eor(v1, __ T16B, v1, v5);
 8323     __ eor(v2, __ T16B, v2, v6);
 8324     __ eor(v3, __ T16B, v3, v7);
 8325     __ orr(v0, __ T16B, v0, v1);
 8326     __ orr(v1, __ T16B, v2, v3);
 8327     __ orr(v0, __ T16B, v0, v1);
 8328     __ umov(tmp1, v0, __ D, 0);
 8329     __ umov(tmp2, v0, __ D, 1);
 8330     __ orr(tmp1, tmp1, tmp2);
 8331     __ cbnz(tmp1, NOT_EQUAL);
 8332     __ br(__ GE, LOOP);
 8333   }
 8334 
 8335   // a1 = r1 - array1 address
 8336   // a2 = r2 - array2 address
 8337   // result = r0 - return value. Already contains "false"
 8338   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 8339   // r3-r5 are reserved temporary registers
 8340   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
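        //
        // Conceptually, the comparison below is an xor-accumulate scheme; a rough
        // scalar sketch (illustrative only, not the exact unrolling used):
        //   uint64_t diff = 0;
        //   for (size_t i = 0; i < words; i++) {
        //     diff |= a1[i] ^ a2[i];   // accumulate differences branch-free
        //     // the generated code only tests/branches on diff every 2 (SIMD: 8) words
        //   }
        //   return diff == 0;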
 8341   address generate_large_array_equals() {
 8342     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8343         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8344         tmp7 = r12, tmp8 = r13;
 8345     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 8346         SMALL_LOOP, POST_LOOP;
 8347     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 8348     // thresholds chosen so that at least 32 of the prefetched bytes are used
 8349     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 8350     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
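          // With these thresholds the prefetching variant is only entered while more
          // than SoftwarePrefetchHintDistance + 32 bytes remain, so (per the comment
          // above) at least 32 of the prefetched bytes are actually compared.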
 8351     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 8352     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 8353         tmp5, tmp6, tmp7, tmp8);
 8354 
 8355     __ align(CodeEntryAlignment);
 8356 
 8357     StubId stub_id = StubId::stubgen_large_array_equals_id;
 8358     StubCodeMark mark(this, stub_id);
 8359 
 8360     address entry = __ pc();
 8361     __ enter();
 8362     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 8363     // also advance pointers to use post-increment instead of pre-increment
 8364     __ add(a1, a1, wordSize);
 8365     __ add(a2, a2, wordSize);
 8366     if (AvoidUnalignedAccesses) {
 8367       // Both implementations (SIMD/non-SIMD) use relatively large load
 8368       // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
 8369       // time) on some CPUs when the address is not at least 16-byte aligned.
 8370       // Arrays are currently 8-byte aligned, so, if needed, do one extra 8-byte
 8371       // load from each array to make the first array's address 16-byte aligned.
 8372       Label ALIGNED16;
 8373       __ tbz(a1, 3, ALIGNED16);
 8374       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8375       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8376       __ sub(cnt1, cnt1, wordSize);
 8377       __ eor(tmp1, tmp1, tmp2);
 8378       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 8379       __ bind(ALIGNED16);
 8380     }
 8381     if (UseSIMDForArrayEquals) {
 8382       if (SoftwarePrefetchHintDistance >= 0) {
 8383         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8384         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8385         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 8386             /* prfm = */ true, NOT_EQUAL);
 8387         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8388         __ br(__ LT, TAIL);
 8389       }
 8390       __ bind(NO_PREFETCH_LARGE_LOOP);
 8391       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 8392           /* prfm = */ false, NOT_EQUAL);
 8393     } else {
 8394       __ push(spilled_regs, sp);
 8395       if (SoftwarePrefetchHintDistance >= 0) {
 8396         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8397         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8398         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 8399             /* prfm = */ true, NOT_EQUAL);
 8400         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8401         __ br(__ LT, TAIL);
 8402       }
 8403       __ bind(NO_PREFETCH_LARGE_LOOP);
 8404       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 8405           /* prfm = */ false, NOT_EQUAL);
 8406     }
 8407     __ bind(TAIL);
 8408       __ cbz(cnt1, EQUAL);
 8409       __ subs(cnt1, cnt1, wordSize);
 8410       __ br(__ LE, POST_LOOP);
 8411     __ bind(SMALL_LOOP);
 8412       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8413       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8414       __ subs(cnt1, cnt1, wordSize);
 8415       __ eor(tmp1, tmp1, tmp2);
 8416       __ cbnz(tmp1, NOT_EQUAL);
 8417       __ br(__ GT, SMALL_LOOP);
 8418     __ bind(POST_LOOP);
 8419       __ ldr(tmp1, Address(a1, cnt1));
 8420       __ ldr(tmp2, Address(a2, cnt1));
 8421       __ eor(tmp1, tmp1, tmp2);
 8422       __ cbnz(tmp1, NOT_EQUAL);
 8423     __ bind(EQUAL);
 8424       __ mov(result, true);
 8425     __ bind(NOT_EQUAL);
 8426       if (!UseSIMDForArrayEquals) {
 8427         __ pop(spilled_regs, sp);
 8428       }
 8429     __ bind(NOT_EQUAL_NO_POP);
 8430     __ leave();
 8431     __ ret(lr);
 8432     return entry;
 8433   }
 8434 
 8435   // result = r0 - return value. Contains initial hashcode value on entry.
 8436   // ary = r1 - array address
 8437   // cnt = r2 - elements count
 8438   // Clobbers: v0-v13, rscratch1, rscratch2
 8439   address generate_large_arrays_hashcode(BasicType eltype) {
 8440     const Register result = r0, ary = r1, cnt = r2;
 8441     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 8442     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 8443     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 8444     const FloatRegister vpowm = v13;
 8445 
 8446     ARRAYS_HASHCODE_REGISTERS;
 8447 
 8448     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 8449 
 8450     unsigned int vf; // vectorization factor
 8451     bool multiply_by_halves;
 8452     Assembler::SIMD_Arrangement load_arrangement;
 8453     switch (eltype) {
 8454     case T_BOOLEAN:
 8455     case T_BYTE:
 8456       load_arrangement = Assembler::T8B;
 8457       multiply_by_halves = true;
 8458       vf = 8;
 8459       break;
 8460     case T_CHAR:
 8461     case T_SHORT:
 8462       load_arrangement = Assembler::T8H;
 8463       multiply_by_halves = true;
 8464       vf = 8;
 8465       break;
 8466     case T_INT:
 8467       load_arrangement = Assembler::T4S;
 8468       multiply_by_halves = false;
 8469       vf = 4;
 8470       break;
 8471     default:
 8472       ShouldNotReachHere();
 8473     }
 8474 
 8475     // Unroll factor
 8476     const unsigned uf = 4;
 8477 
 8478     // Effective vectorization factor
 8479     const unsigned evf = vf * uf;
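          // A sketch of the decomposition used below (illustrative; shown for the T_INT
          // case, vf == 4, with the single accumulator of the SMALL loop). The scalar
          // hash h = sum_{i < n} a[i] * 31^(n-1-i) is split across 4 lanes by i mod 4;
          // each iteration performs, per lane l,
          //   acc[l] = acc[l] * 31^vf + a[vf*k + l]
          // and the epilogue recombines the lanes with the weights <31^3 .. 31^0> kept
          // in vpow. Subword types (vf == 8) apply the same idea to the lower and upper
          // halves of each vector using 31^(vf/2) per half, and the LARGE loop runs
          // uf == 4 such accumulators in parallel.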
 8480 
 8481     __ align(CodeEntryAlignment);
 8482 
 8483     StubId stub_id;
 8484     switch (eltype) {
 8485     case T_BOOLEAN:
 8486       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 8487       break;
 8488     case T_BYTE:
 8489       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 8490       break;
 8491     case T_CHAR:
 8492       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 8493       break;
 8494     case T_SHORT:
 8495       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 8496       break;
 8497     case T_INT:
 8498       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 8499       break;
 8500     default:
 8501       stub_id = StubId::NO_STUBID;
 8502       ShouldNotReachHere();
 8503     };
 8504 
 8505     StubCodeMark mark(this, stub_id);
 8506 
 8507     address entry = __ pc();
 8508     __ enter();
 8509 
 8510     // Put the 0th..3rd powers of 31 together into a single SIMD register. The register is
 8511     // used in the SMALL and LARGE loops' epilogues. The initialization is hoisted here
 8512     // because the register's value does not change throughout either loop.
 8513     __ movw(rscratch1, intpow(31U, 3));
 8514     __ mov(vpow, Assembler::S, 0, rscratch1);
 8515     __ movw(rscratch1, intpow(31U, 2));
 8516     __ mov(vpow, Assembler::S, 1, rscratch1);
 8517     __ movw(rscratch1, intpow(31U, 1));
 8518     __ mov(vpow, Assembler::S, 2, rscratch1);
 8519     __ movw(rscratch1, intpow(31U, 0));
 8520     __ mov(vpow, Assembler::S, 3, rscratch1);
 8521 
 8522     __ mov(vmul0, Assembler::T16B, 0);
 8523     __ mov(vmul0, Assembler::S, 3, result);
 8524 
 8525     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8526     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8527 
 8528     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8529     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8530 
 8531     // SMALL LOOP
 8532     __ bind(SMALL_LOOP);
 8533 
 8534     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8535     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8536     __ subsw(rscratch2, rscratch2, vf);
 8537 
 8538     if (load_arrangement == Assembler::T8B) {
 8539       // Extend 8B to 8H to be able to use vector multiply
 8540       // instructions
 8541       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8542       if (is_signed_subword_type(eltype)) {
 8543         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8544       } else {
 8545         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8546       }
 8547     }
 8548 
 8549     switch (load_arrangement) {
 8550     case Assembler::T4S:
 8551       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8552       break;
 8553     case Assembler::T8B:
 8554     case Assembler::T8H:
 8555       assert(is_subword_type(eltype), "subword type expected");
 8556       if (is_signed_subword_type(eltype)) {
 8557         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8558       } else {
 8559         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8560       }
 8561       break;
 8562     default:
 8563       __ should_not_reach_here();
 8564     }
 8565 
 8566     // Process the upper half of a vector
 8567     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8568       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8569       if (is_signed_subword_type(eltype)) {
 8570         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8571       } else {
 8572         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8573       }
 8574     }
 8575 
 8576     __ br(Assembler::HI, SMALL_LOOP);
 8577 
 8578     // SMALL LOOP'S EPILOGUE
 8579     __ lsr(rscratch2, cnt, exact_log2(evf));
 8580     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8581 
 8582     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8583     __ addv(vmul0, Assembler::T4S, vmul0);
 8584     __ umov(result, vmul0, Assembler::S, 0);
 8585 
 8586     // TAIL
 8587     __ bind(TAIL);
 8588 
 8589     // The andr computes cnt % vf. The shifted subtract then moves the branch target
 8590     // back by cnt % vf load + madd pairs from BR_BASE, so exactly cnt % vf pairs execute.
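          // Example: with vf == 8 and cnt % vf == 3, the branch lands 3 load + madd
          // pairs before BR_BASE, so exactly 3 tail elements are folded into result.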
 8591     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 8592     __ andr(rscratch2, cnt, vf - 1);
 8593     __ bind(TAIL_SHORTCUT);
 8594     __ adr(rscratch1, BR_BASE);
 8595     // For Cortex-A53 the shift is 4 because 2 nops are generated per iteration (see below).
 8596     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 8597     __ movw(rscratch2, 0x1f);
 8598     __ br(rscratch1);
 8599 
 8600     for (size_t i = 0; i < vf - 1; ++i) {
 8601       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8602                                    eltype);
 8603       __ maddw(result, result, rscratch2, rscratch1);
 8604       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 8605       // Generate 2nd nop to have 4 instructions per iteration.
 8606       if (VM_Version::supports_a53mac()) {
 8607         __ nop();
 8608       }
 8609     }
 8610     __ bind(BR_BASE);
 8611 
 8612     __ leave();
 8613     __ ret(lr);
 8614 
 8615     // LARGE LOOP
 8616     __ bind(LARGE_LOOP_PREHEADER);
 8617 
 8618     __ lsr(rscratch2, cnt, exact_log2(evf));
 8619 
 8620     if (multiply_by_halves) {
 8621       // 31^4 - multiplier between lower and upper parts of a register
 8622       __ movw(rscratch1, intpow(31U, vf / 2));
 8623       __ mov(vpowm, Assembler::S, 1, rscratch1);
 8624       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 8625       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8626       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8627     } else {
 8628       // 31^16
 8629       __ movw(rscratch1, intpow(31U, evf));
 8630       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8631     }
 8632 
 8633     __ mov(vmul3, Assembler::T16B, 0);
 8634     __ mov(vmul2, Assembler::T16B, 0);
 8635     __ mov(vmul1, Assembler::T16B, 0);
 8636 
 8637     __ bind(LARGE_LOOP);
 8638 
 8639     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8640     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8641     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8642     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8643 
 8644     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8645            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8646 
 8647     if (load_arrangement == Assembler::T8B) {
 8648       // Extend 8B to 8H to be able to use vector multiply
 8649       // instructions
 8650       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8651       if (is_signed_subword_type(eltype)) {
 8652         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8653         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8654         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8655         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8656       } else {
 8657         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8658         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8659         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8660         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8661       }
 8662     }
 8663 
 8664     switch (load_arrangement) {
 8665     case Assembler::T4S:
 8666       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8667       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8668       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8669       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8670       break;
 8671     case Assembler::T8B:
 8672     case Assembler::T8H:
 8673       assert(is_subword_type(eltype), "subword type expected");
 8674       if (is_signed_subword_type(eltype)) {
 8675         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8676         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8677         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8678         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8679       } else {
 8680         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8681         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8682         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8683         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8684       }
 8685       break;
 8686     default:
 8687       __ should_not_reach_here();
 8688     }
 8689 
 8690     // Process the upper half of a vector
 8691     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8692       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8693       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8694       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8695       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8696       if (is_signed_subword_type(eltype)) {
 8697         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8698         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8699         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8700         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8701       } else {
 8702         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8703         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8704         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8705         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8706       }
 8707     }
 8708 
 8709     __ subsw(rscratch2, rscratch2, 1);
 8710     __ br(Assembler::HI, LARGE_LOOP);
 8711 
 8712     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8713     __ addv(vmul3, Assembler::T4S, vmul3);
 8714     __ umov(result, vmul3, Assembler::S, 0);
 8715 
 8716     __ mov(rscratch2, intpow(31U, vf));
 8717 
 8718     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8719     __ addv(vmul2, Assembler::T4S, vmul2);
 8720     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8721     __ maddw(result, result, rscratch2, rscratch1);
 8722 
 8723     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8724     __ addv(vmul1, Assembler::T4S, vmul1);
 8725     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8726     __ maddw(result, result, rscratch2, rscratch1);
 8727 
 8728     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8729     __ addv(vmul0, Assembler::T4S, vmul0);
 8730     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8731     __ maddw(result, result, rscratch2, rscratch1);
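          // At this point the four accumulators have been combined Horner-style:
          //   result = ((h3 * 31^vf + h2) * 31^vf + h1) * 31^vf + h0
          // where hj is the <31^3 .. 31^0>-weighted lane reduction of vmulj.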
 8732 
 8733     __ andr(rscratch2, cnt, vf - 1);
 8734     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8735 
 8736     __ leave();
 8737     __ ret(lr);
 8738 
 8739     return entry;
 8740   }
 8741 
 8742   address generate_dsin_dcos(bool isCos) {
 8743     __ align(CodeEntryAlignment);
 8744     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 8745     StubCodeMark mark(this, stub_id);
 8746     address start = __ pc();
 8747     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8748         (address)StubRoutines::aarch64::_two_over_pi,
 8749         (address)StubRoutines::aarch64::_pio2,
 8750         (address)StubRoutines::aarch64::_dsin_coef,
 8751         (address)StubRoutines::aarch64::_dcos_coef);
 8752     return start;
 8753   }
 8754 
 8755   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
 8756   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8757       Label &DIFF2) {
 8758     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8759     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8760 
 8761     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8762     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8763     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8764     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
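          // zip1 interleaves the 16 Latin1 bytes in vtmp with zero bytes from vtmpZ,
          // yielding the lower 8 characters widened to UTF-16; zip2 (further below)
          // does the same for the upper 8.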
 8765 
 8766     __ fmovd(tmpL, vtmp3);
 8767     __ eor(rscratch2, tmp3, tmpL);
 8768     __ cbnz(rscratch2, DIFF2);
 8769 
 8770     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8771     __ umov(tmpL, vtmp3, __ D, 1);
 8772     __ eor(rscratch2, tmpU, tmpL);
 8773     __ cbnz(rscratch2, DIFF1);
 8774 
 8775     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8776     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8777     __ fmovd(tmpL, vtmp);
 8778     __ eor(rscratch2, tmp3, tmpL);
 8779     __ cbnz(rscratch2, DIFF2);
 8780 
 8781     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8782     __ umov(tmpL, vtmp, __ D, 1);
 8783     __ eor(rscratch2, tmpU, tmpL);
 8784     __ cbnz(rscratch2, DIFF1);
 8785   }
 8786 
 8787   // r0  = result
 8788   // r1  = str1
 8789   // r2  = cnt1
 8790   // r3  = str2
 8791   // r4  = cnt2
 8792   // r10 = tmp1
 8793   // r11 = tmp2
 8794   address generate_compare_long_string_different_encoding(bool isLU) {
 8795     __ align(CodeEntryAlignment);
 8796     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 8797     StubCodeMark mark(this, stub_id);
 8798     address entry = __ pc();
 8799     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8800         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8801         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8802     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8803         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8804     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8805     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8806 
 8807     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8808 
 8809     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
 8810     // cnt2 == number of characters left to compare
 8811     // Check the already-loaded first 4 characters (vtmp and tmp2 (LU) / tmp1 (UL))
 8812     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8813     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8814     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8815     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 8816     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
 8817     __ eor(rscratch2, tmp1, tmp2);
 8818     __ mov(rscratch1, tmp2);
 8819     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8820     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8821              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8822     __ push(spilled_regs, sp);
 8823     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8824     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8825 
 8826     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8827 
 8828     if (SoftwarePrefetchHintDistance >= 0) {
 8829       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8830       __ br(__ LT, NO_PREFETCH);
 8831       __ bind(LARGE_LOOP_PREFETCH);
 8832         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8833         __ mov(tmp4, 2);
 8834         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8835         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8836           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8837           __ subs(tmp4, tmp4, 1);
 8838           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8839           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8840           __ mov(tmp4, 2);
 8841         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8842           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8843           __ subs(tmp4, tmp4, 1);
 8844           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8845           __ sub(cnt2, cnt2, 64);
 8846           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8847           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8848     }
 8849     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8850     __ bind(NO_PREFETCH);
 8851     __ subs(cnt2, cnt2, 16);
 8852     __ br(__ LT, TAIL);
 8853     __ align(OptoLoopAlignment);
 8854     __ bind(SMALL_LOOP); // smaller loop
 8855       __ subs(cnt2, cnt2, 16);
 8856       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8857       __ br(__ GE, SMALL_LOOP);
 8858       __ cmn(cnt2, (u1)16);
 8859       __ br(__ EQ, LOAD_LAST);
 8860     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8861       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8862       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8863       __ ldr(tmp3, Address(cnt1, -8));
 8864       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8865       __ b(LOAD_LAST);
 8866     __ bind(DIFF2);
 8867       __ mov(tmpU, tmp3);
 8868     __ bind(DIFF1);
 8869       __ pop(spilled_regs, sp);
 8870       __ b(CALCULATE_DIFFERENCE);
 8871     __ bind(LOAD_LAST);
 8872       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
 8873       // No need to load them again.
 8874       __ mov(tmpU, tmp3);
 8875       __ pop(spilled_regs, sp);
 8876 
 8877       // tmp2 points to the address of the last 4 Latin1 characters right now
 8878       __ ldrs(vtmp, Address(tmp2));
 8879       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8880       __ fmovd(tmpL, vtmp);
 8881 
 8882       __ eor(rscratch2, tmpU, tmpL);
 8883       __ cbz(rscratch2, DONE);
 8884 
 8885     // Find the first different characters in the longwords and
 8886     // compute their difference.
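          // rscratch2 holds the XOR of the two chunks being compared. rev byte-reverses
          // it so that clz effectively counts from the low-address end, and the andr
          // with -16 rounds the resulting bit index down to a UTF-16 character boundary;
          // shifting both operands right by that amount brings the first differing
          // characters into their low 16 bits.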
 8887     __ bind(CALCULATE_DIFFERENCE);
 8888       __ rev(rscratch2, rscratch2);
 8889       __ clz(rscratch2, rscratch2);
 8890       __ andr(rscratch2, rscratch2, -16);
 8891       __ lsrv(tmp1, tmp1, rscratch2);
 8892       __ uxthw(tmp1, tmp1);
 8893       __ lsrv(rscratch1, rscratch1, rscratch2);
 8894       __ uxthw(rscratch1, rscratch1);
 8895       __ subw(result, tmp1, rscratch1);
 8896     __ bind(DONE);
 8897       __ ret(lr);
 8898     return entry;
 8899   }
 8900 
 8901   // r0 = input (float16)
 8902   // v0 = result (float)
 8903   // v1 = temporary float register
 8904   address generate_float16ToFloat() {
 8905     __ align(CodeEntryAlignment);
 8906     StubId stub_id = StubId::stubgen_hf2f_id;
 8907     StubCodeMark mark(this, stub_id);
 8908     address entry = __ pc();
 8909     BLOCK_COMMENT("Entry:");
 8910     __ flt16_to_flt(v0, r0, v1);
 8911     __ ret(lr);
 8912     return entry;
 8913   }
 8914 
 8915   // v0 = input (float)
 8916   // r0 = result (float16)
 8917   // v1 = temporary float register
 8918   address generate_floatToFloat16() {
 8919     __ align(CodeEntryAlignment);
 8920     StubId stub_id = StubId::stubgen_f2hf_id;
 8921     StubCodeMark mark(this, stub_id);
 8922     address entry = __ pc();
 8923     BLOCK_COMMENT("Entry:");
 8924     __ flt_to_flt16(r0, v0, v1);
 8925     __ ret(lr);
 8926     return entry;
 8927   }
 8928 
 8929   address generate_method_entry_barrier() {
 8930     __ align(CodeEntryAlignment);
 8931     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 8932     StubCodeMark mark(this, stub_id);
 8933 
 8934     Label deoptimize_label;
 8935 
 8936     address start = __ pc();
 8937 
 8938     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 8939 
 8940     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 8941       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 8942       // We can get here despite the nmethod being good, if we have not
 8943       // yet applied our cross modification fence (or data fence).
 8944       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 8945       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 8946       __ ldrw(rscratch2, rscratch2);
 8947       __ strw(rscratch2, thread_epoch_addr);
 8948       __ isb();
 8949       __ membar(__ LoadLoad);
 8950     }
 8951 
 8952     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 8953 
 8954     __ enter();
 8955     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 8956 
 8957     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 8958 
 8959     __ push_call_clobbered_registers();
 8960 
 8961     __ mov(c_rarg0, rscratch2);
 8962     __ call_VM_leaf
 8963          (CAST_FROM_FN_PTR
 8964           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 8965 
 8966     __ reset_last_Java_frame(true);
 8967 
 8968     __ mov(rscratch1, r0);
 8969 
 8970     __ pop_call_clobbered_registers();
 8971 
 8972     __ cbnz(rscratch1, deoptimize_label);
 8973 
 8974     __ leave();
 8975     __ ret(lr);
 8976 
 8977     __ BIND(deoptimize_label);
 8978 
 8979     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 8980     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 8981 
 8982     __ mov(sp, rscratch1);
 8983     __ br(rscratch2);
 8984 
 8985     return start;
 8986   }
 8987 
 8988   // r0  = result
 8989   // r1  = str1
 8990   // r2  = cnt1
 8991   // r3  = str2
 8992   // r4  = cnt2
 8993   // r10 = tmp1
 8994   // r11 = tmp2
 8995   address generate_compare_long_string_same_encoding(bool isLL) {
 8996     __ align(CodeEntryAlignment);
 8997     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 8998     StubCodeMark mark(this, stub_id);
 8999     address entry = __ pc();
 9000     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9001         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9002 
 9003     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9004 
 9005     // exit from the large loop when fewer than 64 bytes are left to read, or when
 9006     // we would otherwise prefetch memory beyond the array boundary
 9007     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
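          // Note: cnt2 counts characters, so the byte-based bound MAX2(64,
          // SoftwarePrefetchHintDistance) is divided by the character size
          // (1 for Latin1, 2 for UTF-16).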
 9008 
 9009     // The caller pre-loads 8 bytes before jumping to the stub, so compare them directly
 9010     __ eor(rscratch2, tmp1, tmp2);
 9011     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9012 
 9013     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9014     // update pointers, because of previous read
 9015     __ add(str1, str1, wordSize);
 9016     __ add(str2, str2, wordSize);
 9017     if (SoftwarePrefetchHintDistance >= 0) {
 9018       __ align(OptoLoopAlignment);
 9019       __ bind(LARGE_LOOP_PREFETCH);
 9020         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9021         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9022 
 9023         for (int i = 0; i < 4; i++) {
 9024           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9025           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9026           __ cmp(tmp1, tmp2);
 9027           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9028           __ br(Assembler::NE, DIFF);
 9029         }
 9030         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9031         __ add(str1, str1, 64);
 9032         __ add(str2, str2, 64);
 9033         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9034         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9035         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9036     }
 9037 
 9038     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9039     __ br(Assembler::LE, LESS16);
 9040     __ align(OptoLoopAlignment);
 9041     __ bind(LOOP_COMPARE16);
 9042       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9043       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9044       __ cmp(tmp1, tmp2);
 9045       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9046       __ br(Assembler::NE, DIFF);
 9047       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9048       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9049       __ br(Assembler::LT, LESS16);
 9050 
 9051       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9052       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9053       __ cmp(tmp1, tmp2);
 9054       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9055       __ br(Assembler::NE, DIFF);
 9056       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9057       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9058       __ br(Assembler::GE, LOOP_COMPARE16);
 9059       __ cbz(cnt2, LENGTH_DIFF);
 9060 
 9061     __ bind(LESS16);
 9062       // each 8 compare
 9063       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9064       __ br(Assembler::LE, LESS8);
 9065       __ ldr(tmp1, Address(__ post(str1, 8)));
 9066       __ ldr(tmp2, Address(__ post(str2, 8)));
 9067       __ eor(rscratch2, tmp1, tmp2);
 9068       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9069       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9070 
 9071     __ bind(LESS8); // directly load last 8 bytes
 9072       if (!isLL) {
 9073         __ add(cnt2, cnt2, cnt2);
 9074       }
 9075       __ ldr(tmp1, Address(str1, cnt2));
 9076       __ ldr(tmp2, Address(str2, cnt2));
 9077       __ eor(rscratch2, tmp1, tmp2);
 9078       __ cbz(rscratch2, LENGTH_DIFF);
 9079       __ b(CAL_DIFFERENCE);
 9080 
 9081     __ bind(DIFF);
 9082       __ cmp(tmp1, tmp2);
 9083       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9084       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9085       // reuse rscratch2 register for the result of eor instruction
 9086       __ eor(rscratch2, tmp1, tmp2);
 9087 
 9088     __ bind(CAL_DIFFERENCE);
 9089       __ rev(rscratch2, rscratch2);
 9090       __ clz(rscratch2, rscratch2);
 9091       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9092       __ lsrv(tmp1, tmp1, rscratch2);
 9093       __ lsrv(tmp2, tmp2, rscratch2);
 9094       if (isLL) {
 9095         __ uxtbw(tmp1, tmp1);
 9096         __ uxtbw(tmp2, tmp2);
 9097       } else {
 9098         __ uxthw(tmp1, tmp1);
 9099         __ uxthw(tmp2, tmp2);
 9100       }
 9101       __ subw(result, tmp1, tmp2);
 9102 
 9103     __ bind(LENGTH_DIFF);
 9104       __ ret(lr);
 9105     return entry;
 9106   }
 9107 
 9108   enum string_compare_mode {
 9109     LL,
 9110     LU,
 9111     UL,
 9112     UU,
 9113   };
 9114 
 9115   // The following registers are declared in aarch64.ad
 9116   // r0  = result
 9117   // r1  = str1
 9118   // r2  = cnt1
 9119   // r3  = str2
 9120   // r4  = cnt2
 9121   // r10 = tmp1
 9122   // r11 = tmp2
 9123   // z0  = ztmp1
 9124   // z1  = ztmp2
 9125   // p0  = pgtmp1
 9126   // p1  = pgtmp2
 9127   address generate_compare_long_string_sve(string_compare_mode mode) {
 9128     StubId stub_id;
 9129     switch (mode) {
 9130       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9131       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9132       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9133       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9134       default: ShouldNotReachHere();
 9135     }
 9136 
 9137     __ align(CodeEntryAlignment);
 9138     address entry = __ pc();
 9139     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9140              tmp1 = r10, tmp2 = r11;
 9141 
 9142     Label LOOP, DONE, MISMATCH;
 9143     Register vec_len = tmp1;
 9144     Register idx = tmp2;
 9145     // The minimum of the string lengths has been stored in cnt2.
 9146     Register cnt = cnt2;
 9147     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9148     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9149 
 9150 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9151     switch (mode) {                                                            \
 9152       case LL:                                                                 \
 9153         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9154         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9155         break;                                                                 \
 9156       case LU:                                                                 \
 9157         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9158         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9159         break;                                                                 \
 9160       case UL:                                                                 \
 9161         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9162         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9163         break;                                                                 \
 9164       case UU:                                                                 \
 9165         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9166         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9167         break;                                                                 \
 9168       default:                                                                 \
 9169         ShouldNotReachHere();                                                  \
 9170     }
 9171 
 9172     StubCodeMark mark(this, stub_id);
 9173 
 9174     __ mov(idx, 0);
 9175     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9176 
 9177     if (mode == LL) {
 9178       __ sve_cntb(vec_len);
 9179     } else {
 9180       __ sve_cnth(vec_len);
 9181     }
 9182 
 9183     __ sub(rscratch1, cnt, vec_len);
 9184 
 9185     __ bind(LOOP);
 9186 
 9187       // main loop
 9188       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9189       __ add(idx, idx, vec_len);
 9190       // Compare strings.
 9191       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9192       __ br(__ NE, MISMATCH);
 9193       __ cmp(idx, rscratch1);
 9194       __ br(__ LT, LOOP);
 9195 
 9196     // post loop, last iteration
 9197     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9198 
 9199     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9200     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9201     __ br(__ EQ, DONE);
 9202 
 9203     __ bind(MISMATCH);
 9204 
 9205     // Crop the vector to find its location.
 9206     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9207     // Extract the first different characters of each string.
 9208     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9209     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9210 
 9211     // Compute the difference of the first different characters.
 9212     __ sub(result, rscratch1, rscratch2);
 9213 
 9214     __ bind(DONE);
 9215     __ ret(lr);
 9216 #undef LOAD_PAIR
 9217     return entry;
 9218   }
 9219 
 9220   void generate_compare_long_strings() {
 9221     if (UseSVE == 0) {
 9222       StubRoutines::aarch64::_compare_long_string_LL
 9223           = generate_compare_long_string_same_encoding(true);
 9224       StubRoutines::aarch64::_compare_long_string_UU
 9225           = generate_compare_long_string_same_encoding(false);
 9226       StubRoutines::aarch64::_compare_long_string_LU
 9227           = generate_compare_long_string_different_encoding(true);
 9228       StubRoutines::aarch64::_compare_long_string_UL
 9229           = generate_compare_long_string_different_encoding(false);
 9230     } else {
 9231       StubRoutines::aarch64::_compare_long_string_LL
 9232           = generate_compare_long_string_sve(LL);
 9233       StubRoutines::aarch64::_compare_long_string_UU
 9234           = generate_compare_long_string_sve(UU);
 9235       StubRoutines::aarch64::_compare_long_string_LU
 9236           = generate_compare_long_string_sve(LU);
 9237       StubRoutines::aarch64::_compare_long_string_UL
 9238           = generate_compare_long_string_sve(UL);
 9239     }
 9240   }
 9241 
 9242   // R0 = result
 9243   // R1 = str2
 9244   // R2 = cnt1
 9245   // R3 = str1
 9246   // R4 = cnt2
 9247   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 9248   //
 9249   // This generic linear code uses a few additional ideas which make it faster:
 9250   // 1) we can safely keep at least the 1st register of the pattern (since its
 9251   // length >= 8) in order to skip the initial loading (helpful on systems with
 9252   // a single load pipeline)
 9253   // 2) we can use a "fast" algorithm for finding the first character: one
 9254   // branch per loaded register instead of one branch per character. This is
 9255   // where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f and
 9256   // 0x7fff7fff...7fff come from (see the worked example below)
 9257   // 3) after loading and analyzing the 1st register of the source string, it
 9258   // can be reused to search for every occurrence of the 1st character, saving
 9259   // a few loads compared with a simpler-but-slower implementation
 9260   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
 9261   // re-initializes and compresses register values, which makes the code larger
 9262   // and a bit less readable; however, most of the extra operations are issued during loads or branches, so the penalty is minimal
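        //
        // Worked example of the first-character scan (idea 2), assuming Latin1 and an
        // illustrative first pattern character 'b' (0x62): after xor-ing a loaded source
        // word with 0x6262626262626262, a byte is zero exactly where 'b' occurs. Zero
        // bytes are then flagged with the usual SWAR test
        //   (x - 0x0101010101010101) & ~x & 0x8080808080808080
        // which the code expresses as bics(tmp2, x - 0x0101..., x | 0x7f7f...): the
        // result has the top bit set at (at least) every matching byte position, and
        // each candidate is then verified against the full pattern. For UTF-16 the same
        // scheme is used with 0x0001...0001 / 0x7fff...7fff at 16-bit granularity.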
 9263   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 9264     StubId stub_id;
 9265     if (str1_isL) {
 9266       if (str2_isL) {
 9267         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
 9268       } else {
 9269         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
 9270       }
 9271     } else {
 9272       if (str2_isL) {
 9273         ShouldNotReachHere();
 9274       } else {
 9275         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
 9276       }
 9277     }
 9278     __ align(CodeEntryAlignment);
 9279     StubCodeMark mark(this, stub_id);
 9280     address entry = __ pc();
 9281 
 9282     int str1_chr_size = str1_isL ? 1 : 2;
 9283     int str2_chr_size = str2_isL ? 1 : 2;
 9284     int str1_chr_shift = str1_isL ? 0 : 1;
 9285     int str2_chr_shift = str2_isL ? 0 : 1;
 9286     bool isL = str1_isL && str2_isL;
 9287    // parameters
 9288     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 9289     // temporary registers
 9290     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 9291     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 9292     // redefinitions
 9293     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 9294 
 9295     __ push(spilled_regs, sp);
 9296     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 9297         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 9298         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 9299         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 9300         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 9301         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 9302     // Read whole register from str1. It is safe, because length >=8 here
 9303     __ ldr(ch1, Address(str1));
 9304     // Read whole register from str2. It is safe, because length >=8 here
 9305     __ ldr(ch2, Address(str2));
 9306     __ sub(cnt2, cnt2, cnt1);
 9307     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 9308     if (str1_isL != str2_isL) {
 9309       __ eor(v0, __ T16B, v0, v0);
 9310     }
 9311     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 9312     __ mul(first, first, tmp1);
 9313     // check if we have less than 1 register to check
 9314     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 9315     if (str1_isL != str2_isL) {
 9316       __ fmovd(v1, ch1);
 9317     }
 9318     __ br(__ LE, L_SMALL);
 9319     __ eor(ch2, first, ch2);
 9320     if (str1_isL != str2_isL) {
 9321       __ zip1(v1, __ T16B, v1, v0);
 9322     }
 9323     __ sub(tmp2, ch2, tmp1);
 9324     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9325     __ bics(tmp2, tmp2, ch2);
 9326     if (str1_isL != str2_isL) {
 9327       __ fmovd(ch1, v1);
 9328     }
 9329     __ br(__ NE, L_HAS_ZERO);
 9330     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9331     __ add(result, result, wordSize/str2_chr_size);
 9332     __ add(str2, str2, wordSize);
 9333     __ br(__ LT, L_POST_LOOP);
 9334     __ BIND(L_LOOP);
 9335       __ ldr(ch2, Address(str2));
 9336       __ eor(ch2, first, ch2);
 9337       __ sub(tmp2, ch2, tmp1);
 9338       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9339       __ bics(tmp2, tmp2, ch2);
 9340       __ br(__ NE, L_HAS_ZERO);
 9341     __ BIND(L_LOOP_PROCEED);
 9342       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9343       __ add(str2, str2, wordSize);
 9344       __ add(result, result, wordSize/str2_chr_size);
 9345       __ br(__ GE, L_LOOP);
 9346     __ BIND(L_POST_LOOP);
 9347       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 9348       __ br(__ LE, NOMATCH);
 9349       __ ldr(ch2, Address(str2));
 9350       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9351       __ eor(ch2, first, ch2);
 9352       __ sub(tmp2, ch2, tmp1);
 9353       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9354       __ mov(tmp4, -1); // all bits set
 9355       __ b(L_SMALL_PROCEED);
 9356     __ align(OptoLoopAlignment);
 9357     __ BIND(L_SMALL);
 9358       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9359       __ eor(ch2, first, ch2);
 9360       if (str1_isL != str2_isL) {
 9361         __ zip1(v1, __ T16B, v1, v0);
 9362       }
 9363       __ sub(tmp2, ch2, tmp1);
 9364       __ mov(tmp4, -1); // all bits set
 9365       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9366       if (str1_isL != str2_isL) {
 9367         __ fmovd(ch1, v1); // move converted 4 symbols
 9368       }
 9369     __ BIND(L_SMALL_PROCEED);
 9370       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 9371       __ bic(tmp2, tmp2, ch2);
 9372       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9373       __ rbit(tmp2, tmp2);
 9374       __ br(__ EQ, NOMATCH);
 9375     __ BIND(L_SMALL_HAS_ZERO_LOOP);
 9376       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
 9377       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9378       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9379       if (str2_isL) { // LL
 9380         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9381         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9382         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9383         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9384         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9385       } else {
 9386         __ mov(ch2, 0xE); // all bits in byte set except last one
 9387         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9388         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9389         __ lslv(tmp2, tmp2, tmp4);
 9390         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9391         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9392         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9393         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9394       }
 9395       __ cmp(ch1, ch2);
 9396       __ mov(tmp4, wordSize/str2_chr_size);
 9397       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9398     __ BIND(L_SMALL_CMP_LOOP);
 9399       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9400                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9401       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9402                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9403       __ add(tmp4, tmp4, 1);
 9404       __ cmp(tmp4, cnt1);
 9405       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9406       __ cmp(first, ch2);
 9407       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9408     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9409       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9410       __ clz(tmp4, tmp2);
 9411       __ add(result, result, 1); // advance index
 9412       __ add(str2, str2, str2_chr_size); // advance pointer
 9413       __ b(L_SMALL_HAS_ZERO_LOOP);
 9414     __ align(OptoLoopAlignment);
 9415     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9416       __ cmp(first, ch2);
 9417       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9418       __ b(DONE);
 9419     __ align(OptoLoopAlignment);
 9420     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9421       if (str2_isL) { // LL
 9422         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9423         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9424         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9425         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9426         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9427       } else {
 9428         __ mov(ch2, 0xE); // all bits in byte set except last one
 9429         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9430         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9431         __ lslv(tmp2, tmp2, tmp4);
 9432         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9433         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9434         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9435         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9436       }
 9437       __ cmp(ch1, ch2);
 9438       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9439       __ b(DONE);
 9440     __ align(OptoLoopAlignment);
 9441     __ BIND(L_HAS_ZERO);
 9442       __ rbit(tmp2, tmp2);
 9443       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
 9444       // Now compress the counters (cnt2 and cnt1) into one register. This is fine
 9445       // because both counters are 32-bit and are not changed in this loop; they are
 9446       // restored on exit, so cnt1 can be re-used within the loop.
 9447       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 9448       __ sub(result, result, 1);
 9449     __ BIND(L_HAS_ZERO_LOOP);
 9450       __ mov(cnt1, wordSize/str2_chr_size);
 9451       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9452       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9453       if (str2_isL) {
 9454         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9455         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9456         __ lslv(tmp2, tmp2, tmp4);
 9457         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9458         __ add(tmp4, tmp4, 1);
 9459         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9460         __ lsl(tmp2, tmp2, 1);
 9461         __ mov(tmp4, wordSize/str2_chr_size);
 9462       } else {
 9463         __ mov(ch2, 0xE);
 9464         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9465         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9466         __ lslv(tmp2, tmp2, tmp4);
 9467         __ add(tmp4, tmp4, 1);
 9468         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9469         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9470         __ lsl(tmp2, tmp2, 1);
 9471         __ mov(tmp4, wordSize/str2_chr_size);
 9472         __ sub(str2, str2, str2_chr_size);
 9473       }
 9474       __ cmp(ch1, ch2);
 9475       __ mov(tmp4, wordSize/str2_chr_size);
 9476       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9477     __ BIND(L_CMP_LOOP);
 9478       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9479                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9480       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9481                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9482       __ add(tmp4, tmp4, 1);
 9483       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9484       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9485       __ cmp(cnt1, ch2);
 9486       __ br(__ EQ, L_CMP_LOOP);
 9487     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this position
 9489       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9490       __ clz(tmp4, tmp2);
 9491       __ add(str2, str2, str2_chr_size); // advance pointer
 9492       __ b(L_HAS_ZERO_LOOP);
 9493     __ align(OptoLoopAlignment);
 9494     __ BIND(L_CMP_LOOP_LAST_CMP);
 9495       __ cmp(cnt1, ch2);
 9496       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9497       __ b(DONE);
 9498     __ align(OptoLoopAlignment);
 9499     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9500       if (str2_isL) {
 9501         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9502         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9503         __ lslv(tmp2, tmp2, tmp4);
 9504         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9505         __ add(tmp4, tmp4, 1);
 9506         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9507         __ lsl(tmp2, tmp2, 1);
 9508       } else {
 9509         __ mov(ch2, 0xE);
 9510         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9511         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9512         __ lslv(tmp2, tmp2, tmp4);
 9513         __ add(tmp4, tmp4, 1);
 9514         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9515         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9516         __ lsl(tmp2, tmp2, 1);
 9517         __ sub(str2, str2, str2_chr_size);
 9518       }
 9519       __ cmp(ch1, ch2);
 9520       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9521       __ b(DONE);
 9522     __ align(OptoLoopAlignment);
 9523     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. The byte octet was
      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the respective high bits were not
      // changed. L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can simply reset the lower bits of result here:
      // clear 2 lower bits for UU/UL and 3 lower bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
      // index of the last analyzed substring inside the current octet, so str2
      // is at the respective start address; advance it to the next octet.
      __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
      __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); // restore cnt1 from the high half of cnt2
      __ bfm(result, zr, 0, 2 - str2_chr_shift); // clear low bits of result (3 for LL, 2 for UU/UL)
      __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
      __ movw(cnt2, cnt2); // restore cnt2: keep only its low 32 bits
 9539       __ b(L_LOOP_PROCEED);
 9540     __ align(OptoLoopAlignment);
 9541     __ BIND(NOMATCH);
 9542       __ mov(result, -1);
 9543     __ BIND(DONE);
 9544       __ pop(spilled_regs, sp);
 9545       __ ret(lr);
 9546     return entry;
 9547   }
 9548 
 9549   void generate_string_indexof_stubs() {
 9550     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9551     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9552     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9553   }
 9554 
 9555   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9556       FloatRegister src1, FloatRegister src2) {
 9557     Register dst = r1;
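    // A brief note: v0 is expected to hold zero here (see the register notes on
    // generate_large_byte_array_inflate below). zip1/zip2 with v0 interleave each
    // source byte with a zero byte, widening Latin-1 bytes into little-endian
    // UTF-16 chars.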
 9558     __ zip1(v1, __ T16B, src1, v0);
 9559     __ zip2(v2, __ T16B, src1, v0);
 9560     if (generatePrfm) {
 9561       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9562     }
 9563     __ zip1(v3, __ T16B, src2, v0);
 9564     __ zip2(v4, __ T16B, src2, v0);
 9565     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9566   }
 9567 
 9568   // R0 = src
 9569   // R1 = dst
 9570   // R2 = len
 9571   // R3 = len >> 3
 9572   // V0 = 0
 9573   // v1 = loaded 8 bytes
 9574   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 9575   address generate_large_byte_array_inflate() {
 9576     __ align(CodeEntryAlignment);
 9577     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
 9578     StubCodeMark mark(this, stub_id);
 9579     address entry = __ pc();
 9580     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9581     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9582     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9583 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
 9586     __ ldrd(v2, __ post(src, 8));
 9587     __ sub(octetCounter, octetCounter, 2);
 9588     __ zip1(v1, __ T16B, v1, v0);
 9589     __ zip1(v2, __ T16B, v2, v0);
 9590     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9591     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9592     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9593     __ br(__ LE, LOOP_START);
 9594     __ b(LOOP_PRFM_START);
 9595     __ bind(LOOP_PRFM);
 9596       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9597     __ bind(LOOP_PRFM_START);
 9598       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9599       __ sub(octetCounter, octetCounter, 8);
 9600       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9601       inflate_and_store_2_fp_registers(true, v3, v4);
 9602       inflate_and_store_2_fp_registers(true, v5, v6);
 9603       __ br(__ GT, LOOP_PRFM);
 9604       __ cmp(octetCounter, (u1)8);
 9605       __ br(__ LT, DONE);
 9606     __ bind(LOOP);
 9607       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9608       __ bind(LOOP_START);
 9609       __ sub(octetCounter, octetCounter, 8);
 9610       __ cmp(octetCounter, (u1)8);
 9611       inflate_and_store_2_fp_registers(false, v3, v4);
 9612       inflate_and_store_2_fp_registers(false, v5, v6);
 9613       __ br(__ GE, LOOP);
 9614     __ bind(DONE);
 9615       __ ret(lr);
 9616     return entry;
 9617   }
 9618 
 9619   /**
 9620    *  Arguments:
 9621    *
 9622    *  Input:
 9623    *  c_rarg0   - current state address
 9624    *  c_rarg1   - H key address
 9625    *  c_rarg2   - data address
 9626    *  c_rarg3   - number of blocks
 9627    *
 9628    *  Output:
 9629    *  Updated state at c_rarg0
 9630    */
 9631   address generate_ghash_processBlocks() {
 9632     // Bafflingly, GCM uses little-endian for the byte order, but
 9633     // big-endian for the bit order.  For example, the polynomial 1 is
 9634     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9635     //
 9636     // So, we must either reverse the bytes in each word and do
 9637     // everything big-endian or reverse the bits in each byte and do
 9638     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9639     // the bits in each byte (we have an instruction, RBIT, to do
 9640     // that) and keep the data in little-endian bit order through the
 9641     // calculation, bit-reversing the inputs and outputs.
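    // For example, the GCM encoding of the polynomial 1 above (80 00 ... 00)
    // becomes 01 00 ... 00 after RBIT of each byte, i.e. the ordinary
    // little-endian integer 1 (a brief illustration of the convention).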
 9642 
 9643     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
 9644     StubCodeMark mark(this, stub_id);
 9645     __ align(wordSize * 2);
 9646     address p = __ pc();
 9647     __ emit_int64(0x87);  // The low-order bits of the field
 9648                           // polynomial (i.e. p = z^7+z^2+z+1)
 9649                           // repeated in the low and high parts of a
 9650                           // 128-bit vector
 9651     __ emit_int64(0x87);
 9652 
 9653     __ align(CodeEntryAlignment);
 9654     address start = __ pc();
 9655 
 9656     Register state   = c_rarg0;
 9657     Register subkeyH = c_rarg1;
 9658     Register data    = c_rarg2;
 9659     Register blocks  = c_rarg3;
 9660 
 9661     FloatRegister vzr = v30;
 9662     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9663 
 9664     __ ldrq(v24, p);    // The field polynomial
 9665 
 9666     __ ldrq(v0, Address(state));
 9667     __ ldrq(v1, Address(subkeyH));
 9668 
 9669     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9670     __ rbit(v0, __ T16B, v0);
 9671     __ rev64(v1, __ T16B, v1);
 9672     __ rbit(v1, __ T16B, v1);
 9673 
    __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
 9675     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 9676 
 9677     {
 9678       Label L_ghash_loop;
 9679       __ bind(L_ghash_loop);
 9680 
 9681       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9682                                                  // reversing each byte
 9683       __ rbit(v2, __ T16B, v2);
 9684       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9685 
 9686       // Multiply state in v2 by subkey in v1
 9687       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9688                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9689                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9690       // Reduce v7:v5 by the field polynomial
 9691       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9692 
 9693       __ sub(blocks, blocks, 1);
 9694       __ cbnz(blocks, L_ghash_loop);
 9695     }
 9696 
 9697     // The bit-reversed result is at this point in v0
 9698     __ rev64(v0, __ T16B, v0);
 9699     __ rbit(v0, __ T16B, v0);
 9700 
 9701     __ st1(v0, __ T16B, state);
 9702     __ ret(lr);
 9703 
 9704     return start;
 9705   }
 9706 
 9707   address generate_ghash_processBlocks_wide() {
 9708     address small = generate_ghash_processBlocks();
 9709 
 9710     StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
 9711     StubCodeMark mark(this, stub_id);
 9712     __ align(wordSize * 2);
 9713     address p = __ pc();
 9714     __ emit_int64(0x87);  // The low-order bits of the field
 9715                           // polynomial (i.e. p = z^7+z^2+z+1)
 9716                           // repeated in the low and high parts of a
 9717                           // 128-bit vector
 9718     __ emit_int64(0x87);
 9719 
 9720     __ align(CodeEntryAlignment);
 9721     address start = __ pc();
 9722 
 9723     Register state   = c_rarg0;
 9724     Register subkeyH = c_rarg1;
 9725     Register data    = c_rarg2;
 9726     Register blocks  = c_rarg3;
 9727 
 9728     const int unroll = 4;
 9729 
 9730     __ cmp(blocks, (unsigned char)(unroll * 2));
 9731     __ br(__ LT, small);
 9732 
 9733     if (unroll > 1) {
      // Save the callee-saved SIMD registers (v8..v15) before entering the routine
 9735       __ sub(sp, sp, 4 * 16);
 9736       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9737       __ sub(sp, sp, 4 * 16);
 9738       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9739     }
 9740 
 9741     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
 9742 
 9743     if (unroll > 1) {
 9744       // And restore state
 9745       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9746       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9747     }
 9748 
 9749     __ cmp(blocks, (unsigned char)0);
 9750     __ br(__ GT, small);
 9751 
 9752     __ ret(lr);
 9753 
 9754     return start;
 9755   }
 9756 
 9757   void generate_base64_encode_simdround(Register src, Register dst,
 9758         FloatRegister codec, u8 size) {
 9759 
 9760     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9761     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9762     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 9763 
 9764     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9765 
 9766     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
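    // A sketch of the per-lane index computation below, for input bytes b0, b1, b2:
    //   ind0 = b0 >> 2
    //   ind1 = ((b0 & 0x3) << 4) | (b1 >> 4)
    //   ind2 = ((b1 & 0xf) << 2) | (b2 >> 6)
    //   ind3 = b2 & 0x3f
    // i.e. the standard Base64 split of 3 bytes into four 6-bit indices.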
 9767 
 9768     __ ushr(ind0, arrangement, in0,  2);
 9769 
 9770     __ ushr(ind1, arrangement, in1,  2);
 9771     __ shl(in0,   arrangement, in0,  6);
 9772     __ orr(ind1,  arrangement, ind1, in0);
 9773     __ ushr(ind1, arrangement, ind1, 2);
 9774 
 9775     __ ushr(ind2, arrangement, in2,  4);
 9776     __ shl(in1,   arrangement, in1,  4);
 9777     __ orr(ind2,  arrangement, in1,  ind2);
 9778     __ ushr(ind2, arrangement, ind2, 2);
 9779 
 9780     __ shl(ind3,  arrangement, in2,  2);
 9781     __ ushr(ind3, arrangement, ind3, 2);
 9782 
 9783     __ tbl(out0,  arrangement, codec,  4, ind0);
 9784     __ tbl(out1,  arrangement, codec,  4, ind1);
 9785     __ tbl(out2,  arrangement, codec,  4, ind2);
 9786     __ tbl(out3,  arrangement, codec,  4, ind3);
 9787 
 9788     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9789   }
 9790 
 9791    /**
 9792    *  Arguments:
 9793    *
 9794    *  Input:
 9795    *  c_rarg0   - src_start
 9796    *  c_rarg1   - src_offset
 9797    *  c_rarg2   - src_length
 9798    *  c_rarg3   - dest_start
 9799    *  c_rarg4   - dest_offset
 9800    *  c_rarg5   - isURL
 9801    *
 9802    */
 9803   address generate_base64_encodeBlock() {
 9804 
 9805     static const char toBase64[64] = {
 9806       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9807       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9808       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9809       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9810       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9811     };
 9812 
 9813     static const char toBase64URL[64] = {
 9814       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9815       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9816       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9817       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9818       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9819     };
 9820 
 9821     __ align(CodeEntryAlignment);
 9822     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
 9823     StubCodeMark mark(this, stub_id);
 9824     address start = __ pc();
 9825 
 9826     Register src   = c_rarg0;  // source array
 9827     Register soff  = c_rarg1;  // source start offset
 9828     Register send  = c_rarg2;  // source end offset
 9829     Register dst   = c_rarg3;  // dest array
 9830     Register doff  = c_rarg4;  // position for writing to dest array
 9831     Register isURL = c_rarg5;  // Base64 or URL character set
 9832 
 9833     // c_rarg6 and c_rarg7 are free to use as temps
 9834     Register codec  = c_rarg6;
 9835     Register length = c_rarg7;
 9836 
 9837     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9838 
 9839     __ add(src, src, soff);
 9840     __ add(dst, dst, doff);
 9841     __ sub(length, send, soff);
 9842 
 9843     // load the codec base address
 9844     __ lea(codec, ExternalAddress((address) toBase64));
 9845     __ cbz(isURL, ProcessData);
 9846     __ lea(codec, ExternalAddress((address) toBase64URL));
 9847 
 9848     __ BIND(ProcessData);
 9849 
    // too short to form a SIMD loop; fall back to byte-at-a-time processing
 9851     __ cmp(length, (u1)24);
 9852     __ br(Assembler::LT, Process3B);
 9853 
 9854     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9855 
 9856     __ BIND(Process48B);
 9857     __ cmp(length, (u1)48);
 9858     __ br(Assembler::LT, Process24B);
 9859     generate_base64_encode_simdround(src, dst, v0, 16);
 9860     __ sub(length, length, 48);
 9861     __ b(Process48B);
 9862 
 9863     __ BIND(Process24B);
 9864     __ cmp(length, (u1)24);
 9865     __ br(Assembler::LT, SIMDExit);
 9866     generate_base64_encode_simdround(src, dst, v0, 8);
 9867     __ sub(length, length, 24);
 9868 
 9869     __ BIND(SIMDExit);
 9870     __ cbz(length, Exit);
 9871 
 9872     __ BIND(Process3B);
 9873     //  3 src bytes, 24 bits
 9874     __ ldrb(r10, __ post(src, 1));
 9875     __ ldrb(r11, __ post(src, 1));
 9876     __ ldrb(r12, __ post(src, 1));
 9877     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9878     __ orrw(r12, r12, r11, Assembler::LSL, 8);
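    // r12 now holds b0 << 16 | b1 << 8 | b2; the four 6-bit groups live in
    // bits [23:18], [17:12], [11:6] and [5:0], extracted below.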
 9879     // codec index
 9880     __ ubfmw(r15, r12, 18, 23);
 9881     __ ubfmw(r14, r12, 12, 17);
 9882     __ ubfmw(r13, r12, 6,  11);
 9883     __ andw(r12,  r12, 63);
 9884     // get the code based on the codec
 9885     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9886     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9887     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9888     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9889     __ strb(r15, __ post(dst, 1));
 9890     __ strb(r14, __ post(dst, 1));
 9891     __ strb(r13, __ post(dst, 1));
 9892     __ strb(r12, __ post(dst, 1));
 9893     __ sub(length, length, 3);
 9894     __ cbnz(length, Process3B);
 9895 
 9896     __ BIND(Exit);
 9897     __ ret(lr);
 9898 
 9899     return start;
 9900   }
 9901 
 9902   void generate_base64_decode_simdround(Register src, Register dst,
 9903         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9904 
 9905     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9906     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9907 
 9908     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9909     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9910 
 9911     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9912 
 9913     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9914 
 9915     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9916 
    // We need an unsigned saturating subtract here so that every input value
    // in the range [0, 63] yields 0 as the index for the higher-half lookup.
 9919     __ uqsubv(decH0, __ T16B, in0, v27);
 9920     __ uqsubv(decH1, __ T16B, in1, v27);
 9921     __ uqsubv(decH2, __ T16B, in2, v27);
 9922     __ uqsubv(decH3, __ T16B, in3, v27);
 9923 
 9924     // lower half lookup
 9925     __ tbl(decL0, arrangement, codecL, 4, in0);
 9926     __ tbl(decL1, arrangement, codecL, 4, in1);
 9927     __ tbl(decL2, arrangement, codecL, 4, in2);
 9928     __ tbl(decL3, arrangement, codecL, 4, in3);
 9929 
 9930     // higher half lookup
 9931     __ tbx(decH0, arrangement, codecH, 4, decH0);
 9932     __ tbx(decH1, arrangement, codecH, 4, decH1);
 9933     __ tbx(decH2, arrangement, codecH, 4, decH2);
 9934     __ tbx(decH3, arrangement, codecH, 4, decH3);
 9935 
 9936     // combine lower and higher
 9937     __ orr(decL0, arrangement, decL0, decH0);
 9938     __ orr(decL1, arrangement, decL1, decH1);
 9939     __ orr(decL2, arrangement, decL2, decH2);
 9940     __ orr(decL3, arrangement, decL3, decH3);
 9941 
    // check for illegal inputs: any value larger than 63 (the maximum for 6 bits)
 9943     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 9944     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 9945     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 9946     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 9947     __ orr(in0, arrangement, decH0, decH1);
 9948     __ orr(in1, arrangement, decH2, decH3);
 9949     __ orr(in2, arrangement, in0,   in1);
 9950     __ umaxv(in3, arrangement, in2);
 9951     __ umov(rscratch2, in3, __ B, 0);
 9952 
 9953     // get the data to output
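    // (a sketch per lane, for decoded 6-bit values d0..d3:
    //    out0 = (d0 << 2) | (d1 >> 4)
    //    out1 = (d1 << 4) | (d2 >> 2)
    //    out2 = (d2 << 6) |  d3
    //  which reassembles the three original bytes)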
 9954     __ shl(out0,  arrangement, decL0, 2);
 9955     __ ushr(out1, arrangement, decL1, 4);
 9956     __ orr(out0,  arrangement, out0,  out1);
 9957     __ shl(out1,  arrangement, decL1, 4);
 9958     __ ushr(out2, arrangement, decL2, 2);
 9959     __ orr(out1,  arrangement, out1,  out2);
 9960     __ shl(out2,  arrangement, decL2, 6);
 9961     __ orr(out2,  arrangement, out2,  decL3);
 9962 
 9963     __ cbz(rscratch2, NoIllegalData);
 9964 
 9965     // handle illegal input
 9966     __ umov(r10, in2, __ D, 0);
 9967     if (size == 16) {
 9968       __ cbnz(r10, ErrorInLowerHalf);
 9969 
 9970       // illegal input is in higher half, store the lower half now.
 9971       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 9972 
 9973       __ umov(r10, in2,  __ D, 1);
 9974       __ umov(r11, out0, __ D, 1);
 9975       __ umov(r12, out1, __ D, 1);
 9976       __ umov(r13, out2, __ D, 1);
 9977       __ b(StoreLegalData);
 9978 
 9979       __ BIND(ErrorInLowerHalf);
 9980     }
 9981     __ umov(r11, out0, __ D, 0);
 9982     __ umov(r12, out1, __ D, 0);
 9983     __ umov(r13, out2, __ D, 0);
 9984 
 9985     __ BIND(StoreLegalData);
 9986     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
 9987     __ strb(r11, __ post(dst, 1));
 9988     __ strb(r12, __ post(dst, 1));
 9989     __ strb(r13, __ post(dst, 1));
 9990     __ lsr(r10, r10, 8);
 9991     __ lsr(r11, r11, 8);
 9992     __ lsr(r12, r12, 8);
 9993     __ lsr(r13, r13, 8);
 9994     __ b(StoreLegalData);
 9995 
 9996     __ BIND(NoIllegalData);
 9997     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
 9998   }
 9999 
10000 
10001    /**
10002    *  Arguments:
10003    *
10004    *  Input:
10005    *  c_rarg0   - src_start
10006    *  c_rarg1   - src_offset
10007    *  c_rarg2   - src_length
10008    *  c_rarg3   - dest_start
10009    *  c_rarg4   - dest_offset
10010    *  c_rarg5   - isURL
10011    *  c_rarg6   - isMIME
10012    *
10013    */
10014   address generate_base64_decodeBlock() {
10015 
10016     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10017     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10018     // titled "Base64 decoding".
10019 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic: java.util.Base64.fromBase64['='] = -2, while
    // fromBase(URL)64ForNoSIMD['='] = 255 here.
10023     static const uint8_t fromBase64ForNoSIMD[256] = {
10024       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10025       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10026       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10027        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10028       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10029        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
10030       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10031        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10032       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10033       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10034       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10035       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10036       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10037       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10038       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10039       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10040     };
10041 
10042     static const uint8_t fromBase64URLForNoSIMD[256] = {
10043       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10044       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10045       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10046        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10047       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10048        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
10049       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10050        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10051       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10052       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10053       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10054       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10055       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10056       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10057       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10058       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10059     };
10060 
    // A legal Base64 code value is in the range [0, 127]. We need two table
    // lookups with tbl/tbx and combine the results to get the decoded data.
    // The 1st lookup uses tbl, so out-of-range indices produce 0 in the
    // destination. The 2nd lookup uses tbx, so out-of-range indices leave the
    // destination unchanged. Inputs in [64, 126] map to entries [65, 127] in
    // the 2nd lookup. Entry 64 is set to 0, so inputs already decoded by the
    // 1st lookup (whose saturated index is 0) contribute nothing here.
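    // For example, 'a' (0x61) is out of range for the 1st lookup (result 0);
    // the 2nd lookup uses index usat(0x61 - 63) = 34, i.e. table entry
    // 64 + 34 = 98, which holds 26, the decoded value of 'a'. For '0' (0x30)
    // the 1st lookup already yields 52 and the 2nd lookup contributes 0.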
10068     static const uint8_t fromBase64ForSIMD[128] = {
10069       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10070       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10071       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10072        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10073         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10074        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10075       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10076        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10077     };
10078 
10079     static const uint8_t fromBase64URLForSIMD[128] = {
10080       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10081       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10082       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10083        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10084         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10085        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10086        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10087        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10088     };
10089 
10090     __ align(CodeEntryAlignment);
10091     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10092     StubCodeMark mark(this, stub_id);
10093     address start = __ pc();
10094 
10095     Register src    = c_rarg0;  // source array
10096     Register soff   = c_rarg1;  // source start offset
10097     Register send   = c_rarg2;  // source end offset
10098     Register dst    = c_rarg3;  // dest array
10099     Register doff   = c_rarg4;  // position for writing to dest array
10100     Register isURL  = c_rarg5;  // Base64 or URL character set
10101     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10102 
10103     Register length = send;    // reuse send as length of source data to process
10104 
10105     Register simd_codec   = c_rarg6;
10106     Register nosimd_codec = c_rarg7;
10107 
10108     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10109 
10110     __ enter();
10111 
10112     __ add(src, src, soff);
10113     __ add(dst, dst, doff);
10114 
10115     __ mov(doff, dst);
10116 
10117     __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1);   // clear the two low bits of length (round down to a multiple of 4)
10119 
10120     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10121     __ cbz(isURL, ProcessData);
10122     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10123 
10124     __ BIND(ProcessData);
10125     __ mov(rscratch1, length);
10126     __ cmp(length, (u1)144); // 144 = 80 + 64
10127     __ br(Assembler::LT, Process4B);
10128 
10129     // In the MIME case, the line length cannot be more than 76
10130     // bytes (see RFC 2045). This is too short a block for SIMD
10131     // to be worthwhile, so we use non-SIMD here.
10132     __ movw(rscratch1, 79);
10133 
10134     __ BIND(Process4B);
10135     __ ldrw(r14, __ post(src, 4));
10136     __ ubfxw(r10, r14, 0,  8);
10137     __ ubfxw(r11, r14, 8,  8);
10138     __ ubfxw(r12, r14, 16, 8);
10139     __ ubfxw(r13, r14, 24, 8);
10140     // get the de-code
10141     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10142     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10143     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10144     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10145     // error detection, 255u indicates an illegal input
10146     __ orrw(r14, r10, r11);
10147     __ orrw(r15, r12, r13);
10148     __ orrw(r14, r14, r15);
10149     __ tbnz(r14, 7, Exit);
10150     // recover the data
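    // (a sketch: with decoded 6-bit values d0..d3, r14 = d0 << 10 | d1 << 4 | d2 >> 2
    //  holds the first two output bytes in big-endian order, rev16 swaps them for the
    //  little-endian strh below, and r13 = (d2 & 3) << 6 | d3 is the third output byte)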
10151     __ lslw(r14, r10, 10);
10152     __ bfiw(r14, r11, 4, 6);
10153     __ bfmw(r14, r12, 2, 5);
10154     __ rev16w(r14, r14);
10155     __ bfiw(r13, r12, 6, 2);
10156     __ strh(r14, __ post(dst, 2));
10157     __ strb(r13, __ post(dst, 1));
10158     // non-simd loop
10159     __ subsw(rscratch1, rscratch1, 4);
10160     __ br(Assembler::GT, Process4B);
10161 
    // If we exit the 4-byte loop after the 80-byte pre-processing above
    // (rscratch1 started at 79), rscratch1 == -1 here; otherwise rscratch1 == 0.
10164     __ cbzw(rscratch1, Exit);
10165     __ sub(length, length, 80);
10166 
10167     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10168     __ cbz(isURL, SIMDEnter);
10169     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10170 
10171     __ BIND(SIMDEnter);
10172     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10173     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10174     __ mov(rscratch1, 63);
10175     __ dup(v27, __ T16B, rscratch1);
10176 
10177     __ BIND(Process64B);
10178     __ cmp(length, (u1)64);
10179     __ br(Assembler::LT, Process32B);
10180     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10181     __ sub(length, length, 64);
10182     __ b(Process64B);
10183 
10184     __ BIND(Process32B);
10185     __ cmp(length, (u1)32);
10186     __ br(Assembler::LT, SIMDExit);
10187     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10188     __ sub(length, length, 32);
10189     __ b(Process32B);
10190 
10191     __ BIND(SIMDExit);
10192     __ cbz(length, Exit);
10193     __ movw(rscratch1, length);
10194     __ b(Process4B);
10195 
10196     __ BIND(Exit);
10197     __ sub(c_rarg0, dst, doff);
10198 
10199     __ leave();
10200     __ ret(lr);
10201 
10202     return start;
10203   }
10204 
10205   // Support for spin waits.
10206   address generate_spin_wait() {
10207     __ align(CodeEntryAlignment);
10208     StubId stub_id = StubId::stubgen_spin_wait_id;
10209     StubCodeMark mark(this, stub_id);
10210     address start = __ pc();
10211 
10212     __ spin_wait();
10213     __ ret(lr);
10214 
10215     return start;
10216   }
10217 
10218   void generate_lookup_secondary_supers_table_stub() {
10219     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10220     StubCodeMark mark(this, stub_id);
10221 
10222     const Register
10223       r_super_klass  = r0,
10224       r_array_base   = r1,
10225       r_array_length = r2,
10226       r_array_index  = r3,
10227       r_sub_klass    = r4,
10228       r_bitmap       = rscratch2,
10229       result         = r5;
10230     const FloatRegister
10231       vtemp          = v0;
10232 
10233     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10234       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10235       Label L_success;
10236       __ enter();
10237       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10238                                              r_array_base, r_array_length, r_array_index,
10239                                              vtemp, result, slot,
10240                                              /*stub_is_near*/true);
10241       __ leave();
10242       __ ret(lr);
10243     }
10244   }
10245 
10246   // Slow path implementation for UseSecondarySupersTable.
10247   address generate_lookup_secondary_supers_table_slow_path_stub() {
10248     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10249     StubCodeMark mark(this, stub_id);
10250 
10251     address start = __ pc();
10252     const Register
10253       r_super_klass  = r0,        // argument
10254       r_array_base   = r1,        // argument
10255       temp1          = r2,        // temp
10256       r_array_index  = r3,        // argument
10257       r_bitmap       = rscratch2, // argument
10258       result         = r5;        // argument
10259 
10260     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10261     __ ret(lr);
10262 
10263     return start;
10264   }
10265 
10266 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10267 
10268   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
10269   //
10270   // If LSE is in use, generate LSE versions of all the stubs. The
10271   // non-LSE versions are in atomic_aarch64.S.
10272 
10273   // class AtomicStubMark records the entry point of a stub and the
10274   // stub pointer which will point to it. The stub pointer is set to
10275   // the entry point when ~AtomicStubMark() is called, which must be
10276   // after ICache::invalidate_range. This ensures safe publication of
10277   // the generated code.
10278   class AtomicStubMark {
10279     address _entry_point;
10280     aarch64_atomic_stub_t *_stub;
10281     MacroAssembler *_masm;
10282   public:
10283     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10284       _masm = masm;
10285       __ align(32);
10286       _entry_point = __ pc();
10287       _stub = stub;
10288     }
10289     ~AtomicStubMark() {
10290       *_stub = (aarch64_atomic_stub_t)_entry_point;
10291     }
10292   };
10293 
10294   // NB: For memory_order_conservative we need a trailing membar after
10295   // LSE atomic operations but not a leading membar.
10296   //
10297   // We don't need a leading membar because a clause in the Arm ARM
10298   // says:
10299   //
10300   //   Barrier-ordered-before
10301   //
10302   //   Barrier instructions order prior Memory effects before subsequent
10303   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10307   //   instruction with both Acquire and Release semantics.
10308   //
10309   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10310   // and Release semantics, therefore we don't need a leading
10311   // barrier. However, there is no corresponding Barrier-ordered-after
10312   // relationship, therefore we need a trailing membar to prevent a
10313   // later store or load from being reordered with the store in an
10314   // atomic instruction.
10315   //
10316   // This was checked by using the herd7 consistency model simulator
10317   // (http://diy.inria.fr/) with this test case:
10318   //
10319   // AArch64 LseCas
10320   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10321   // P0 | P1;
10322   // LDR W4, [X2] | MOV W3, #0;
10323   // DMB LD       | MOV W4, #1;
10324   // LDR W3, [X1] | CASAL W3, W4, [X1];
10325   //              | DMB ISH;
10326   //              | STR W4, [X2];
10327   // exists
10328   // (0:X3=0 /\ 0:X4=1)
10329   //
10330   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10331   // with the store to x in P1. Without the DMB in P1 this may happen.
10332   //
10333   // At the time of writing we don't know of any AArch64 hardware that
10334   // reorders stores in this way, but the Reference Manual permits it.
10335 
10336   void gen_cas_entry(Assembler::operand_size size,
10337                      atomic_memory_order order) {
10338     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10339       exchange_val = c_rarg2;
10340     bool acquire, release;
10341     switch (order) {
10342       case memory_order_relaxed:
10343         acquire = false;
10344         release = false;
10345         break;
10346       case memory_order_release:
10347         acquire = false;
10348         release = true;
10349         break;
10350       default:
10351         acquire = true;
10352         release = true;
10353         break;
10354     }
10355     __ mov(prev, compare_val);
10356     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10357     if (order == memory_order_conservative) {
10358       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10359     }
10360     if (size == Assembler::xword) {
10361       __ mov(r0, prev);
10362     } else {
10363       __ movw(r0, prev);
10364     }
10365     __ ret(lr);
10366   }
10367 
10368   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10369     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10370     // If not relaxed, then default to conservative.  Relaxed is the only
10371     // case we use enough to be worth specializing.
10372     if (order == memory_order_relaxed) {
10373       __ ldadd(size, incr, prev, addr);
10374     } else {
10375       __ ldaddal(size, incr, prev, addr);
10376       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10377     }
10378     if (size == Assembler::xword) {
10379       __ mov(r0, prev);
10380     } else {
10381       __ movw(r0, prev);
10382     }
10383     __ ret(lr);
10384   }
10385 
10386   void gen_swpal_entry(Assembler::operand_size size) {
10387     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10388     __ swpal(size, incr, prev, addr);
10389     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10390     if (size == Assembler::xword) {
10391       __ mov(r0, prev);
10392     } else {
10393       __ movw(r0, prev);
10394     }
10395     __ ret(lr);
10396   }
10397 
10398   void generate_atomic_entry_points() {
10399     if (! UseLSE) {
10400       return;
10401     }
10402     __ align(CodeEntryAlignment);
10403     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10404     StubCodeMark mark(this, stub_id);
10405     address first_entry = __ pc();
10406 
10407     // ADD, memory_order_conservative
10408     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10409     gen_ldadd_entry(Assembler::word, memory_order_conservative);
10410     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10411     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10412 
10413     // ADD, memory_order_relaxed
10414     AtomicStubMark mark_fetch_add_4_relaxed
10415       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10416     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10417     AtomicStubMark mark_fetch_add_8_relaxed
10418       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10419     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10420 
10421     // XCHG, memory_order_conservative
10422     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10423     gen_swpal_entry(Assembler::word);
10424     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10425     gen_swpal_entry(Assembler::xword);
10426 
10427     // CAS, memory_order_conservative
10428     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10429     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10430     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10431     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10432     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10433     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10434 
10435     // CAS, memory_order_relaxed
10436     AtomicStubMark mark_cmpxchg_1_relaxed
10437       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10438     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10439     AtomicStubMark mark_cmpxchg_4_relaxed
10440       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10441     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10442     AtomicStubMark mark_cmpxchg_8_relaxed
10443       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10444     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10445 
10446     AtomicStubMark mark_cmpxchg_4_release
10447       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10448     gen_cas_entry(MacroAssembler::word, memory_order_release);
10449     AtomicStubMark mark_cmpxchg_8_release
10450       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10451     gen_cas_entry(MacroAssembler::xword, memory_order_release);
10452 
10453     AtomicStubMark mark_cmpxchg_4_seq_cst
10454       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10455     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10456     AtomicStubMark mark_cmpxchg_8_seq_cst
10457       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10458     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10459 
10460     ICache::invalidate_range(first_entry, __ pc() - first_entry);
10461   }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
10463 
10464   address generate_cont_thaw(Continuation::thaw_kind kind) {
10465     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10466     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10467 
10468     address start = __ pc();
10469 
10470     if (return_barrier) {
10471       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10472       __ mov(sp, rscratch1);
10473     }
10474     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10475 
10476     if (return_barrier) {
10477       // preserve possible return value from a method returning to the return barrier
10478       __ fmovd(rscratch1, v0);
10479       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10480     }
10481 
10482     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10483     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10484     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10485 
10486     if (return_barrier) {
10487       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10488       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10489       __ fmovd(v0, rscratch1);
10490     }
10491     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10492 
10493 
10494     Label thaw_success;
10495     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10496     __ cbnz(rscratch2, thaw_success);
10497     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10498     __ br(rscratch1);
10499     __ bind(thaw_success);
10500 
10501     // make room for the thawed frames
10502     __ sub(rscratch1, sp, rscratch2);
10503     __ andr(rscratch1, rscratch1, -16); // align
10504     __ mov(sp, rscratch1);
10505 
10506     if (return_barrier) {
10507       // save original return value -- again
10508       __ fmovd(rscratch1, v0);
10509       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10510     }
10511 
10512     // If we want, we can templatize thaw by kind, and have three different entries
10513     __ movw(c_rarg1, (uint32_t)kind);
10514 
10515     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10516     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10517 
10518     if (return_barrier) {
10519       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10520       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10521       __ fmovd(v0, rscratch1);
10522     } else {
10523       __ mov(r0, zr); // return 0 (success) from doYield
10524     }
10525 
    // we're now on the yield frame (which is at a higher address than us because sp has been pushed down)
10527     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10528     __ mov(rfp, sp);
10529 
10530     if (return_barrier_exception) {
10531       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10532       __ authenticate_return_address(c_rarg1);
10533       __ verify_oop(r0);
10534       // save return value containing the exception oop in callee-saved R19
10535       __ mov(r19, r0);
10536 
10537       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10538 
10539       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10540       // __ reinitialize_ptrue();
10541 
10542       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10543 
10544       __ mov(r1, r0); // the exception handler
10545       __ mov(r0, r19); // restore return value containing the exception oop
10546       __ verify_oop(r0);
10547 
10548       __ leave();
10549       __ mov(r3, lr);
10550       __ br(r1); // the exception handler
10551     } else {
10552       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10553       __ leave();
10554       __ ret(lr);
10555     }
10556 
10557     return start;
10558   }
10559 
10560   address generate_cont_thaw() {
10561     if (!Continuations::enabled()) return nullptr;
10562 
10563     StubId stub_id = StubId::stubgen_cont_thaw_id;
10564     StubCodeMark mark(this, stub_id);
10565     address start = __ pc();
10566     generate_cont_thaw(Continuation::thaw_top);
10567     return start;
10568   }
10569 
10570   address generate_cont_returnBarrier() {
10571     if (!Continuations::enabled()) return nullptr;
10572 
10573     // TODO: will probably need multiple return barriers depending on return type
10574     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10575     StubCodeMark mark(this, stub_id);
10576     address start = __ pc();
10577 
10578     generate_cont_thaw(Continuation::thaw_return_barrier);
10579 
10580     return start;
10581   }
10582 
10583   address generate_cont_returnBarrier_exception() {
10584     if (!Continuations::enabled()) return nullptr;
10585 
10586     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10587     StubCodeMark mark(this, stub_id);
10588     address start = __ pc();
10589 
10590     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10591 
10592     return start;
10593   }
10594 
10595   address generate_cont_preempt_stub() {
10596     if (!Continuations::enabled()) return nullptr;
10597     StubId stub_id = StubId::stubgen_cont_preempt_id;
10598     StubCodeMark mark(this, stub_id);
10599     address start = __ pc();
10600 
10601     __ reset_last_Java_frame(true);
10602 
10603     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10604     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10605     __ mov(sp, rscratch2);
10606 
10607     Label preemption_cancelled;
10608     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10609     __ cbnz(rscratch1, preemption_cancelled);
10610 
10611     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10612     SharedRuntime::continuation_enter_cleanup(_masm);
10613     __ leave();
10614     __ ret(lr);
10615 
10616     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10617     __ bind(preemption_cancelled);
10618     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10619     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10620     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10621     __ ldr(rscratch1, Address(rscratch1));
10622     __ br(rscratch1);
10623 
10624     return start;
10625   }
10626 
10627   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10628   // are represented as long[5], with BITS_PER_LIMB = 26.
10629   // Pack five 26-bit limbs into three 64-bit registers.
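  // Resulting layout (a sketch, with 26-bit limbs l0..l4 taken from src[0..4]):
  //   dest0 = l0 | l1 << 26 | (l2 & 0xfff) << 52
  //   dest1 = l2 >> 12 | l3 << 14 | (l4 & 0xffffff) << 40
  //   dest2 = l4 >> 24   (at most 2 bits, since each limb is below 2^26)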
10630   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10631     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10632     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10633     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10634     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10635 
10636     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10637     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10638     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10639     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10640 
10641     if (dest2->is_valid()) {
10642       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10643     } else {
10644 #ifdef ASSERT
10645       Label OK;
10646       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10647       __ br(__ EQ, OK);
10648       __ stop("high bits of Poly1305 integer should be zero");
10649       __ should_not_reach_here();
10650       __ bind(OK);
10651 #endif
10652     }
10653   }
10654 
10655   // As above, but return only a 128-bit integer, packed into two
10656   // 64-bit registers.
10657   void pack_26(Register dest0, Register dest1, Register src) {
10658     pack_26(dest0, dest1, noreg, src);
10659   }
10660 
10661   // Multiply and multiply-accumulate unsigned 64-bit registers.
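  // wide_mul:  prod_hi:prod_lo  = n * m  (full 128-bit product via mul/umulh)
  // wide_madd: sum_hi:sum_lo   += n * m  (the adds/adc pair propagates the
  //            carry from the low half into the high half)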
10662   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10663     __ mul(prod_lo, n, m);
10664     __ umulh(prod_hi, n, m);
10665   }
10666   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10667     wide_mul(rscratch1, rscratch2, n, m);
10668     __ adds(sum_lo, sum_lo, rscratch1);
10669     __ adc(sum_hi, sum_hi, rscratch2);
10670   }
10671 
10672   // Poly1305, RFC 7539
10673 
10674   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10675   // description of the tricks used to simplify and accelerate this
10676   // computation.
10677 
10678   address generate_poly1305_processBlocks() {
10679     __ align(CodeEntryAlignment);
10680     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10681     StubCodeMark mark(this, stub_id);
10682     address start = __ pc();
10683     Label here;
10684     __ enter();
10685     RegSet callee_saved = RegSet::range(r19, r28);
10686     __ push(callee_saved, sp);
10687 
10688     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10689 
10690     // Arguments
10691     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10692 
10693     // R_n is the 128-bit randomly-generated key, packed into two
10694     // registers.  The caller passes this key to us as long[5], with
10695     // BITS_PER_LIMB = 26.
10696     const Register R_0 = *++regs, R_1 = *++regs;
10697     pack_26(R_0, R_1, r_start);
10698 
10699     // RR_n is (R_n >> 2) * 5
10700     const Register RR_0 = *++regs, RR_1 = *++regs;
10701     __ lsr(RR_0, R_0, 2);
10702     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10703     __ lsr(RR_1, R_1, 2);
10704     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
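    // Why (R_n >> 2) * 5 (an informal sketch): the accumulator is kept as
    // limbs at 2^0, 2^64 and 2^128, and 2^130 is congruent to 5 modulo
    // 2^130 - 5. A product term that lands at 2^128 or above can therefore be
    // folded back down by a factor of 5 once the spare factor of 4 is
    // accounted for; Poly1305 key clamping (done by the caller) makes R_1 a
    // multiple of 4, and the leftover R_0 & 3 part is handled separately
    // inside the loop below. See the poly1305-design link above for the full
    // argument.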
10705 
10706     // U_n is the current checksum
10707     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10708     pack_26(U_0, U_1, U_2, acc_start);
10709 
10710     static constexpr int BLOCK_LENGTH = 16;
10711     Label DONE, LOOP;
10712 
10713     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10714     __ br(Assembler::LT, DONE); {
10715       __ bind(LOOP);
10716 
10717       // S_n is to be the sum of U_n and the next block of data
10718       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10719       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10720       __ adds(S_0, U_0, S_0);
10721       __ adcs(S_1, U_1, S_1);
10722       __ adc(S_2, U_2, zr);
10723       __ add(S_2, S_2, 1);
10724 
10725       const Register U_0HI = *++regs, U_1HI = *++regs;
10726 
10727       // NB: this logic depends on some of the special properties of
10728       // Poly1305 keys. In particular, because we know that the top
10729       // four bits of R_0 and R_1 are zero, we can add together
10730       // partial products without any risk of needing to propagate a
10731       // carry out.
10732       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10733       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10734       __ andr(U_2, R_0, 3);
10735       __ mul(U_2, S_2, U_2);
10736 
10737       // Recycle registers S_0, S_1, S_2
10738       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10739 
10740       // Partial reduction mod 2**130 - 5
10741       __ adds(U_1, U_0HI, U_1);
10742       __ adc(U_2, U_1HI, U_2);
10743       // Sum now in U_2:U_1:U_0.
10744       // Dead: U_0HI, U_1HI.
10745       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10746 
10747       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
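      // (U_2 holds bits 128 and up, so U_2 >> 2 is the part of the sum at
      // or above 2^130; since 2^130 == 5 (mod 2^130 - 5) it can be folded
      // back in multiplied by 5, leaving only bits 128..129 in U_2.)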
10748 
10749       // First, U_2:U_1:U_0 += (U_2 >> 2)
10750       __ lsr(rscratch1, U_2, 2);
10751       __ andr(U_2, U_2, (u8)3);
10752       __ adds(U_0, U_0, rscratch1);
10753       __ adcs(U_1, U_1, zr);
10754       __ adc(U_2, U_2, zr);
10755       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10756       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10757       __ adcs(U_1, U_1, zr);
10758       __ adc(U_2, U_2, zr);
10759 
10760       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10761       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10762       __ br(~ Assembler::LT, LOOP);
10763     }
10764 
10765     // Further reduce modulo 2^130 - 5
10766     __ lsr(rscratch1, U_2, 2);
10767     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10768     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10769     __ adcs(U_1, U_1, zr);
10770     __ andr(U_2, U_2, (u1)3);
10771     __ adc(U_2, U_2, zr);
10772 
10773     // Unpack the sum into five 26-bit limbs and write to memory.
10774     __ ubfiz(rscratch1, U_0, 0, 26);
10775     __ ubfx(rscratch2, U_0, 26, 26);
10776     __ stp(rscratch1, rscratch2, Address(acc_start));
10777     __ ubfx(rscratch1, U_0, 52, 12);
10778     __ bfi(rscratch1, U_1, 12, 14);
10779     __ ubfx(rscratch2, U_1, 14, 26);
10780     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10781     __ ubfx(rscratch1, U_1, 40, 24);
10782     __ bfi(rscratch1, U_2, 24, 3);
10783     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10784 
10785     __ bind(DONE);
10786     __ pop(callee_saved, sp);
10787     __ leave();
10788     __ ret(lr);
10789 
10790     return start;
10791   }
10792 
10793   // exception handler for upcall stubs
10794   address generate_upcall_stub_exception_handler() {
10795     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10796     StubCodeMark mark(this, stub_id);
10797     address start = __ pc();
10798 
10799     // Native caller has no idea how to handle exceptions,
10800     // so we just crash here. Up to callee to catch exceptions.
10801     __ verify_oop(r0);
10802     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10803     __ blr(rscratch1);
10804     __ should_not_reach_here();
10805 
10806     return start;
10807   }
10808 
10809   // load Method* target of MethodHandle
10810   // j_rarg0 = jobject receiver
10811   // rmethod = result
10812   address generate_upcall_stub_load_target() {
10813     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10814     StubCodeMark mark(this, stub_id);
10815     address start = __ pc();
10816 
10817     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
    // Load target method from receiver
10819     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10820     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10821     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10822     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10823                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10824                       noreg, noreg);
10825     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10826 
10827     __ ret(lr);
10828 
10829     return start;
10830   }
10831 
10832 #undef __
10833 #define __ masm->
10834 
10835   class MontgomeryMultiplyGenerator : public MacroAssembler {
10836 
10837     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10838       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10839 
10840     RegSet _toSave;
10841     bool _squaring;
10842 
10843   public:
10844     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10845       : MacroAssembler(as->code()), _squaring(squaring) {
10846 
10847       // Register allocation
10848 
10849       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10850       Pa_base = *regs;       // Argument registers
10851       if (squaring)
10852         Pb_base = Pa_base;
10853       else
10854         Pb_base = *++regs;
10855       Pn_base = *++regs;
10856       Rlen= *++regs;
10857       inv = *++regs;
10858       Pm_base = *++regs;
10859 
10860                           // Working registers:
10861       Ra =  *++regs;        // The current digit of a, b, n, and m.
10862       Rb =  *++regs;
10863       Rm =  *++regs;
10864       Rn =  *++regs;
10865 
10866       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10867       Pb =  *++regs;
10868       Pm =  *++regs;
10869       Pn =  *++regs;
10870 
10871       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
10873       t2 =  *++regs;
10874 
10875       Ri =  *++regs;        // Inner and outer loop indexes.
10876       Rj =  *++regs;
10877 
10878       Rhi_ab = *++regs;     // Product registers: low and high parts
10879       Rlo_ab = *++regs;     // of a*b and m*n.
10880       Rhi_mn = *++regs;
10881       Rlo_mn = *++regs;
10882 
10883       // r19 and up are callee-saved.
10884       _toSave = RegSet::range(r19, *regs) + Pm_base;
10885     }
10886 
10887   private:
10888     void save_regs() {
10889       push(_toSave, sp);
10890     }
10891 
10892     void restore_regs() {
10893       pop(_toSave, sp);
10894     }
10895 
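    // Emit |block| twice per loop iteration (a two-way unrolled loop over
    // |count| iterations).  An odd count branches into the middle of the
    // loop body so only one copy runs on the first pass; a zero count
    // skips the loop entirely.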
10896     template <typename T>
10897     void unroll_2(Register count, T block) {
10898       Label loop, end, odd;
10899       tbnz(count, 0, odd);
10900       cbz(count, end);
10901       align(16);
10902       bind(loop);
10903       (this->*block)();
10904       bind(odd);
10905       (this->*block)();
10906       subs(count, count, 2);
10907       br(Assembler::GT, loop);
10908       bind(end);
10909     }
10910 
10911     template <typename T>
10912     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10913       Label loop, end, odd;
10914       tbnz(count, 0, odd);
10915       cbz(count, end);
10916       align(16);
10917       bind(loop);
10918       (this->*block)(d, s, tmp);
10919       bind(odd);
10920       (this->*block)(d, s, tmp);
10921       subs(count, count, 2);
10922       br(Assembler::GT, loop);
10923       bind(end);
10924     }
10925 
10926     void pre1(RegisterOrConstant i) {
10927       block_comment("pre1");
10928       // Pa = Pa_base;
10929       // Pb = Pb_base + i;
10930       // Pm = Pm_base;
10931       // Pn = Pn_base + i;
10932       // Ra = *Pa;
10933       // Rb = *Pb;
10934       // Rm = *Pm;
10935       // Rn = *Pn;
10936       ldr(Ra, Address(Pa_base));
10937       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10938       ldr(Rm, Address(Pm_base));
10939       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10940       lea(Pa, Address(Pa_base));
10941       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10942       lea(Pm, Address(Pm_base));
10943       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10944 
10945       // Zero the m*n result.
10946       mov(Rhi_mn, zr);
10947       mov(Rlo_mn, zr);
10948     }
10949 
10950     // The core multiply-accumulate step of a Montgomery
10951     // multiplication.  The idea is to schedule operations as a
10952     // pipeline so that instructions with long latencies (loads and
10953     // multiplies) have time to complete before their results are
10954     // used.  This most benefits in-order implementations of the
10955     // architecture but out-of-order ones also benefit.
10956     void step() {
10957       block_comment("step");
10958       // MACC(Ra, Rb, t0, t1, t2);
10959       // Ra = *++Pa;
10960       // Rb = *--Pb;
10961       umulh(Rhi_ab, Ra, Rb);
10962       mul(Rlo_ab, Ra, Rb);
10963       ldr(Ra, pre(Pa, wordSize));
10964       ldr(Rb, pre(Pb, -wordSize));
10965       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
10966                                        // previous iteration.
10967       // MACC(Rm, Rn, t0, t1, t2);
10968       // Rm = *++Pm;
10969       // Rn = *--Pn;
10970       umulh(Rhi_mn, Rm, Rn);
10971       mul(Rlo_mn, Rm, Rn);
10972       ldr(Rm, pre(Pm, wordSize));
10973       ldr(Rn, pre(Pn, -wordSize));
10974       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10975     }
10976 
10977     void post1() {
10978       block_comment("post1");
10979 
10980       // MACC(Ra, Rb, t0, t1, t2);
10981       // Ra = *++Pa;
10982       // Rb = *--Pb;
10983       umulh(Rhi_ab, Ra, Rb);
10984       mul(Rlo_ab, Ra, Rb);
10985       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10986       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10987 
10988       // *Pm = Rm = t0 * inv;
10989       mul(Rm, t0, inv);
10990       str(Rm, Address(Pm));
10991 
10992       // MACC(Rm, Rn, t0, t1, t2);
10993       // t0 = t1; t1 = t2; t2 = 0;
10994       umulh(Rhi_mn, Rm, Rn);
10995 
10996 #ifndef PRODUCT
10997       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
10998       {
10999         mul(Rlo_mn, Rm, Rn);
11000         add(Rlo_mn, t0, Rlo_mn);
11001         Label ok;
11002         cbz(Rlo_mn, ok); {
11003           stop("broken Montgomery multiply");
11004         } bind(ok);
11005       }
11006 #endif
11007       // We have very carefully set things up so that
11008       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11009       // the lower half of Rm * Rn because we know the result already:
11010       // it must be -t0.  t0 + (-t0) must generate a carry iff
11011       // t0 != 0.  So, rather than do a mul and an adds we just set
11012       // the carry flag iff t0 is nonzero.
11013       //
11014       // mul(Rlo_mn, Rm, Rn);
11015       // adds(zr, t0, Rlo_mn);
11016       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11017       adcs(t0, t1, Rhi_mn);
11018       adc(t1, t2, zr);
11019       mov(t2, zr);
11020     }
11021 
11022     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11023       block_comment("pre2");
11024       // Pa = Pa_base + i-len;
11025       // Pb = Pb_base + len;
11026       // Pm = Pm_base + i-len;
11027       // Pn = Pn_base + len;
11028 
11029       if (i.is_register()) {
11030         sub(Rj, i.as_register(), len);
11031       } else {
11032         mov(Rj, i.as_constant());
11033         sub(Rj, Rj, len);
11034       }
11035       // Rj == i-len
11036 
11037       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11038       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11039       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11040       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11041 
11042       // Ra = *++Pa;
11043       // Rb = *--Pb;
11044       // Rm = *++Pm;
11045       // Rn = *--Pn;
11046       ldr(Ra, pre(Pa, wordSize));
11047       ldr(Rb, pre(Pb, -wordSize));
11048       ldr(Rm, pre(Pm, wordSize));
11049       ldr(Rn, pre(Pn, -wordSize));
11050 
11051       mov(Rhi_mn, zr);
11052       mov(Rlo_mn, zr);
11053     }
11054 
11055     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11056       block_comment("post2");
11057       if (i.is_constant()) {
11058         mov(Rj, i.as_constant()-len.as_constant());
11059       } else {
11060         sub(Rj, i.as_register(), len);
11061       }
11062 
11063       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11064 
11065       // As soon as we know the least significant digit of our result,
11066       // store it.
11067       // Pm_base[i-len] = t0;
11068       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11069 
11070       // t0 = t1; t1 = t2; t2 = 0;
11071       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11072       adc(t1, t2, zr);
11073       mov(t2, zr);
11074     }
11075 
11076     // A carry in t0 after Montgomery multiplication means that we
11077     // should subtract multiples of n from our result in m.  We'll
11078     // keep doing that until there is no carry.
11079     void normalize(RegisterOrConstant len) {
11080       block_comment("normalize");
11081       // while (t0)
11082       //   t0 = sub(Pm_base, Pn_base, t0, len);
11083       Label loop, post, again;
11084       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11085       cbz(t0, post); {
11086         bind(again); {
11087           mov(i, zr);
11088           mov(cnt, len);
11089           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11090           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11091           subs(zr, zr, zr); // set carry flag, i.e. no borrow
11092           align(16);
11093           bind(loop); {
11094             sbcs(Rm, Rm, Rn);
11095             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11096             add(i, i, 1);
11097             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11098             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11099             sub(cnt, cnt, 1);
11100           } cbnz(cnt, loop);
11101           sbc(t0, t0, zr);
11102         } cbnz(t0, again);
11103       } bind(post);
11104     }
11105 
11106     // Move memory at s to d, reversing words.
11107     //    Increments d to end of copied memory
11108     //    Destroys tmp1, tmp2
11109     //    Preserves len
11110     //    Leaves s pointing to the address which was in d at start
11111     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11112       assert(tmp1->encoding() < r19->encoding(), "register corruption");
11113       assert(tmp2->encoding() < r19->encoding(), "register corruption");
11114 
11115       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11116       mov(tmp1, len);
11117       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11118       sub(s, d, len, ext::uxtw, LogBytesPerWord);
11119     }
11120     // where
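    // Each 64-bit word holds two 32-bit digits, so after the word order is
    // reversed, rotating each word by 32 swaps its halves and completes the
    // reversal of the original int array.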
11121     void reverse1(Register d, Register s, Register tmp) {
11122       ldr(tmp, pre(s, -wordSize));
11123       ror(tmp, tmp, 32);
11124       str(tmp, post(d, wordSize));
11125     }
11126 
11127     void step_squaring() {
11128       // An extra ACC
11129       step();
11130       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11131     }
11132 
11133     void last_squaring(RegisterOrConstant i) {
11134       Label dont;
11135       // if ((i & 1) == 0) {
11136       tbnz(i.as_register(), 0, dont); {
11137         // MACC(Ra, Rb, t0, t1, t2);
11138         // Ra = *++Pa;
11139         // Rb = *--Pb;
11140         umulh(Rhi_ab, Ra, Rb);
11141         mul(Rlo_ab, Ra, Rb);
11142         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11143       } bind(dont);
11144     }
11145 
11146     void extra_step_squaring() {
11147       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11148 
11149       // MACC(Rm, Rn, t0, t1, t2);
11150       // Rm = *++Pm;
11151       // Rn = *--Pn;
11152       umulh(Rhi_mn, Rm, Rn);
11153       mul(Rlo_mn, Rm, Rn);
11154       ldr(Rm, pre(Pm, wordSize));
11155       ldr(Rn, pre(Pn, -wordSize));
11156     }
11157 
11158     void post1_squaring() {
11159       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11160 
11161       // *Pm = Rm = t0 * inv;
11162       mul(Rm, t0, inv);
11163       str(Rm, Address(Pm));
11164 
11165       // MACC(Rm, Rn, t0, t1, t2);
11166       // t0 = t1; t1 = t2; t2 = 0;
11167       umulh(Rhi_mn, Rm, Rn);
11168 
11169 #ifndef PRODUCT
11170       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11171       {
11172         mul(Rlo_mn, Rm, Rn);
11173         add(Rlo_mn, t0, Rlo_mn);
11174         Label ok;
11175         cbz(Rlo_mn, ok); {
11176           stop("broken Montgomery multiply");
11177         } bind(ok);
11178       }
11179 #endif
11180       // We have very carefully set things up so that
11181       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11182       // the lower half of Rm * Rn because we know the result already:
11183       // it must be -t0.  t0 + (-t0) must generate a carry iff
11184       // t0 != 0.  So, rather than do a mul and an adds we just set
11185       // the carry flag iff t0 is nonzero.
11186       //
11187       // mul(Rlo_mn, Rm, Rn);
11188       // adds(zr, t0, Rlo_mn);
11189       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11190       adcs(t0, t1, Rhi_mn);
11191       adc(t1, t2, zr);
11192       mov(t2, zr);
11193     }
11194 
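    // Add the 128-bit value Rhi:Rlo into the triple-precision accumulator
    // t2:t1:t0.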
11195     void acc(Register Rhi, Register Rlo,
11196              Register t0, Register t1, Register t2) {
11197       adds(t0, t0, Rlo);
11198       adcs(t1, t1, Rhi);
11199       adc(t2, t2, zr);
11200     }
11201 
11202   public:
11203     /**
11204      * Fast Montgomery multiplication.  The derivation of the
11205      * algorithm is in A Cryptographic Library for the Motorola
11206      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11207      *
11208      * Arguments:
11209      *
11210      * Inputs for multiplication:
11211      *   c_rarg0   - int array elements a
11212      *   c_rarg1   - int array elements b
11213      *   c_rarg2   - int array elements n (the modulus)
11214      *   c_rarg3   - int length
11215      *   c_rarg4   - int inv
11216      *   c_rarg5   - int array elements m (the result)
11217      *
11218      * Inputs for squaring:
11219      *   c_rarg0   - int array elements a
11220      *   c_rarg1   - int array elements n (the modulus)
11221      *   c_rarg2   - int length
11222      *   c_rarg3   - int inv
11223      *   c_rarg4   - int array elements m (the result)
11224      *
11225      */
11226     address generate_multiply() {
11227       Label argh, nothing;
11228       bind(argh);
11229       stop("MontgomeryMultiply total_allocation must be <= 8192");
11230 
11231       align(CodeEntryAlignment);
11232       address entry = pc();
11233 
11234       cbzw(Rlen, nothing);
11235 
11236       enter();
11237 
11238       // Make room.
11239       cmpw(Rlen, 512);
11240       br(Assembler::HI, argh);
11241       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11242       andr(sp, Ra, -2 * wordSize);
11243 
11244       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11245 
11246       {
11247         // Copy input args, reversing as we go.  We use Ra as a
11248         // temporary variable.
11249         reverse(Ra, Pa_base, Rlen, t0, t1);
11250         if (!_squaring)
11251           reverse(Ra, Pb_base, Rlen, t0, t1);
11252         reverse(Ra, Pn_base, Rlen, t0, t1);
11253       }
11254 
11255       // Push all call-saved registers and also Pm_base which we'll need
11256       // at the end.
11257       save_regs();
11258 
11259 #ifndef PRODUCT
11260       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11261       {
11262         ldr(Rn, Address(Pn_base, 0));
11263         mul(Rlo_mn, Rn, inv);
11264         subs(zr, Rlo_mn, -1);
11265         Label ok;
11266         br(EQ, ok); {
11267           stop("broken inverse in Montgomery multiply");
11268         } bind(ok);
11269       }
11270 #endif
11271 
11272       mov(Pm_base, Ra);
11273 
11274       mov(t0, zr);
11275       mov(t1, zr);
11276       mov(t2, zr);
11277 
11278       block_comment("for (int i = 0; i < len; i++) {");
11279       mov(Ri, zr); {
11280         Label loop, end;
11281         cmpw(Ri, Rlen);
11282         br(Assembler::GE, end);
11283 
11284         bind(loop);
11285         pre1(Ri);
11286 
11287         block_comment("  for (j = i; j; j--) {"); {
11288           movw(Rj, Ri);
11289           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11290         } block_comment("  } // j");
11291 
11292         post1();
11293         addw(Ri, Ri, 1);
11294         cmpw(Ri, Rlen);
11295         br(Assembler::LT, loop);
11296         bind(end);
11297         block_comment("} // i");
11298       }
11299 
11300       block_comment("for (int i = len; i < 2*len; i++) {");
11301       mov(Ri, Rlen); {
11302         Label loop, end;
11303         cmpw(Ri, Rlen, Assembler::LSL, 1);
11304         br(Assembler::GE, end);
11305 
11306         bind(loop);
11307         pre2(Ri, Rlen);
11308 
11309         block_comment("  for (j = len*2-i-1; j; j--) {"); {
11310           lslw(Rj, Rlen, 1);
11311           subw(Rj, Rj, Ri);
11312           subw(Rj, Rj, 1);
11313           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11314         } block_comment("  } // j");
11315 
11316         post2(Ri, Rlen);
11317         addw(Ri, Ri, 1);
11318         cmpw(Ri, Rlen, Assembler::LSL, 1);
11319         br(Assembler::LT, loop);
11320         bind(end);
11321       }
11322       block_comment("} // i");
11323 
11324       normalize(Rlen);
11325 
11326       mov(Ra, Pm_base);  // Save Pm_base in Ra
11327       restore_regs();  // Restore caller's Pm_base
11328 
11329       // Copy our result into caller's Pm_base
11330       reverse(Pm_base, Ra, Rlen, t0, t1);
11331 
11332       leave();
11333       bind(nothing);
11334       ret(lr);
11335 
11336       return entry;
11337     }
11338     // In C, approximately:
11339 
11340     // void
11341     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11342     //                     julong Pn_base[], julong Pm_base[],
11343     //                     julong inv, int len) {
11344     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11345     //   julong *Pa, *Pb, *Pn, *Pm;
11346     //   julong Ra, Rb, Rn, Rm;
11347 
11348     //   int i;
11349 
11350     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11351 
11352     //   for (i = 0; i < len; i++) {
11353     //     int j;
11354 
11355     //     Pa = Pa_base;
11356     //     Pb = Pb_base + i;
11357     //     Pm = Pm_base;
11358     //     Pn = Pn_base + i;
11359 
11360     //     Ra = *Pa;
11361     //     Rb = *Pb;
11362     //     Rm = *Pm;
11363     //     Rn = *Pn;
11364 
11365     //     int iters = i;
11366     //     for (j = 0; iters--; j++) {
11367     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11368     //       MACC(Ra, Rb, t0, t1, t2);
11369     //       Ra = *++Pa;
11370     //       Rb = *--Pb;
11371     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11372     //       MACC(Rm, Rn, t0, t1, t2);
11373     //       Rm = *++Pm;
11374     //       Rn = *--Pn;
11375     //     }
11376 
11377     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11378     //     MACC(Ra, Rb, t0, t1, t2);
11379     //     *Pm = Rm = t0 * inv;
11380     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11381     //     MACC(Rm, Rn, t0, t1, t2);
11382 
11383     //     assert(t0 == 0, "broken Montgomery multiply");
11384 
11385     //     t0 = t1; t1 = t2; t2 = 0;
11386     //   }
11387 
11388     //   for (i = len; i < 2*len; i++) {
11389     //     int j;
11390 
11391     //     Pa = Pa_base + i-len;
11392     //     Pb = Pb_base + len;
11393     //     Pm = Pm_base + i-len;
11394     //     Pn = Pn_base + len;
11395 
11396     //     Ra = *++Pa;
11397     //     Rb = *--Pb;
11398     //     Rm = *++Pm;
11399     //     Rn = *--Pn;
11400 
11401     //     int iters = len*2-i-1;
11402     //     for (j = i-len+1; iters--; j++) {
11403     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11404     //       MACC(Ra, Rb, t0, t1, t2);
11405     //       Ra = *++Pa;
11406     //       Rb = *--Pb;
11407     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11408     //       MACC(Rm, Rn, t0, t1, t2);
11409     //       Rm = *++Pm;
11410     //       Rn = *--Pn;
11411     //     }
11412 
11413     //     Pm_base[i-len] = t0;
11414     //     t0 = t1; t1 = t2; t2 = 0;
11415     //   }
11416 
11417     //   while (t0)
11418     //     t0 = sub(Pm_base, Pn_base, t0, len);
11419     // }
11420 
11421     /**
11422      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
11423      * multiplies than Montgomery multiplication so it should be up to
11424      * 25% faster.  However, its loop control is more complex and it
11425      * may actually run slower on some machines.
11426      *
11427      * Arguments:
11428      *
11429      * Inputs:
11430      *   c_rarg0   - int array elements a
11431      *   c_rarg1   - int array elements n (the modulus)
11432      *   c_rarg2   - int length
11433      *   c_rarg3   - int inv
11434      *   c_rarg4   - int array elements m (the result)
11435      *
11436      */
11437     address generate_square() {
11438       Label argh;
11439       bind(argh);
11440       stop("MontgomeryMultiply total_allocation must be <= 8192");
11441 
11442       align(CodeEntryAlignment);
11443       address entry = pc();
11444 
11445       enter();
11446 
11447       // Make room.
11448       cmpw(Rlen, 512);
11449       br(Assembler::HI, argh);
11450       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11451       andr(sp, Ra, -2 * wordSize);
11452 
11453       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11454 
11455       {
11456         // Copy input args, reversing as we go.  We use Ra as a
11457         // temporary variable.
11458         reverse(Ra, Pa_base, Rlen, t0, t1);
11459         reverse(Ra, Pn_base, Rlen, t0, t1);
11460       }
11461 
11462       // Push all call-saved registers and also Pm_base which we'll need
11463       // at the end.
11464       save_regs();
11465 
11466       mov(Pm_base, Ra);
11467 
11468       mov(t0, zr);
11469       mov(t1, zr);
11470       mov(t2, zr);
11471 
11472       block_comment("for (int i = 0; i < len; i++) {");
11473       mov(Ri, zr); {
11474         Label loop, end;
11475         bind(loop);
11476         cmp(Ri, Rlen);
11477         br(Assembler::GE, end);
11478 
11479         pre1(Ri);
11480 
11481         block_comment("for (j = (i+1)/2; j; j--) {"); {
11482           add(Rj, Ri, 1);
11483           lsr(Rj, Rj, 1);
11484           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11485         } block_comment("  } // j");
11486 
11487         last_squaring(Ri);
11488 
11489         block_comment("  for (j = i/2; j; j--) {"); {
11490           lsr(Rj, Ri, 1);
11491           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11492         } block_comment("  } // j");
11493 
11494         post1_squaring();
11495         add(Ri, Ri, 1);
11496         cmp(Ri, Rlen);
11497         br(Assembler::LT, loop);
11498 
11499         bind(end);
11500         block_comment("} // i");
11501       }
11502 
11503       block_comment("for (int i = len; i < 2*len; i++) {");
11504       mov(Ri, Rlen); {
11505         Label loop, end;
11506         bind(loop);
11507         cmp(Ri, Rlen, Assembler::LSL, 1);
11508         br(Assembler::GE, end);
11509 
11510         pre2(Ri, Rlen);
11511 
11512         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11513           lsl(Rj, Rlen, 1);
11514           sub(Rj, Rj, Ri);
11515           sub(Rj, Rj, 1);
11516           lsr(Rj, Rj, 1);
11517           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11518         } block_comment("  } // j");
11519 
11520         last_squaring(Ri);
11521 
11522         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11523           lsl(Rj, Rlen, 1);
11524           sub(Rj, Rj, Ri);
11525           lsr(Rj, Rj, 1);
11526           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11527         } block_comment("  } // j");
11528 
11529         post2(Ri, Rlen);
11530         add(Ri, Ri, 1);
11531         cmp(Ri, Rlen, Assembler::LSL, 1);
11532 
11533         br(Assembler::LT, loop);
11534         bind(end);
11535         block_comment("} // i");
11536       }
11537 
11538       normalize(Rlen);
11539 
11540       mov(Ra, Pm_base);  // Save Pm_base in Ra
11541       restore_regs();  // Restore caller's Pm_base
11542 
11543       // Copy our result into caller's Pm_base
11544       reverse(Pm_base, Ra, Rlen, t0, t1);
11545 
11546       leave();
11547       ret(lr);
11548 
11549       return entry;
11550     }
11551     // In C, approximately:
11552 
11553     // void
11554     // montgomery_square(julong Pa_base[], julong Pn_base[],
11555     //                   julong Pm_base[], julong inv, int len) {
11556     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11557     //   julong *Pa, *Pb, *Pn, *Pm;
11558     //   julong Ra, Rb, Rn, Rm;
11559 
11560     //   int i;
11561 
11562     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11563 
11564     //   for (i = 0; i < len; i++) {
11565     //     int j;
11566 
11567     //     Pa = Pa_base;
11568     //     Pb = Pa_base + i;
11569     //     Pm = Pm_base;
11570     //     Pn = Pn_base + i;
11571 
11572     //     Ra = *Pa;
11573     //     Rb = *Pb;
11574     //     Rm = *Pm;
11575     //     Rn = *Pn;
11576 
11577     //     int iters = (i+1)/2;
11578     //     for (j = 0; iters--; j++) {
11579     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11580     //       MACC2(Ra, Rb, t0, t1, t2);
11581     //       Ra = *++Pa;
11582     //       Rb = *--Pb;
11583     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11584     //       MACC(Rm, Rn, t0, t1, t2);
11585     //       Rm = *++Pm;
11586     //       Rn = *--Pn;
11587     //     }
11588     //     if ((i & 1) == 0) {
11589     //       assert(Ra == Pa_base[j], "must be");
11590     //       MACC(Ra, Ra, t0, t1, t2);
11591     //     }
11592     //     iters = i/2;
11593     //     assert(iters == i-j, "must be");
11594     //     for (; iters--; j++) {
11595     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11596     //       MACC(Rm, Rn, t0, t1, t2);
11597     //       Rm = *++Pm;
11598     //       Rn = *--Pn;
11599     //     }
11600 
11601     //     *Pm = Rm = t0 * inv;
11602     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11603     //     MACC(Rm, Rn, t0, t1, t2);
11604 
11605     //     assert(t0 == 0, "broken Montgomery multiply");
11606 
11607     //     t0 = t1; t1 = t2; t2 = 0;
11608     //   }
11609 
11610     //   for (i = len; i < 2*len; i++) {
11611     //     int start = i-len+1;
11612     //     int end = start + (len - start)/2;
11613     //     int j;
11614 
11615     //     Pa = Pa_base + i-len;
11616     //     Pb = Pa_base + len;
11617     //     Pm = Pm_base + i-len;
11618     //     Pn = Pn_base + len;
11619 
11620     //     Ra = *++Pa;
11621     //     Rb = *--Pb;
11622     //     Rm = *++Pm;
11623     //     Rn = *--Pn;
11624 
11625     //     int iters = (2*len-i-1)/2;
11626     //     assert(iters == end-start, "must be");
11627     //     for (j = start; iters--; j++) {
11628     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11629     //       MACC2(Ra, Rb, t0, t1, t2);
11630     //       Ra = *++Pa;
11631     //       Rb = *--Pb;
11632     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11633     //       MACC(Rm, Rn, t0, t1, t2);
11634     //       Rm = *++Pm;
11635     //       Rn = *--Pn;
11636     //     }
11637     //     if ((i & 1) == 0) {
11638     //       assert(Ra == Pa_base[j], "must be");
11639     //       MACC(Ra, Ra, t0, t1, t2);
11640     //     }
11641     //     iters =  (2*len-i)/2;
11642     //     assert(iters == len-j, "must be");
11643     //     for (; iters--; j++) {
11644     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11645     //       MACC(Rm, Rn, t0, t1, t2);
11646     //       Rm = *++Pm;
11647     //       Rn = *--Pn;
11648     //     }
11649     //     Pm_base[i-len] = t0;
11650     //     t0 = t1; t1 = t2; t2 = 0;
11651     //   }
11652 
11653     //   while (t0)
11654     //     t0 = sub(Pm_base, Pn_base, t0, len);
11655     // }
11656   };
11657 
11658   // Initialization
11659   void generate_preuniverse_stubs() {
11660     // preuniverse stubs are not needed for aarch64
11661   }
11662 
11663   void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment in
    // stubRoutines.hpp.
11671 
11672     StubRoutines::_forward_exception_entry = generate_forward_exception();
11673 
11674     StubRoutines::_call_stub_entry =
11675       generate_call_stub(StubRoutines::_call_stub_return_address);
11676 
    // Referenced by megamorphic call sites.
11678     StubRoutines::_catch_exception_entry = generate_catch_exception();
11679 
11680     // Initialize table for copy memory (arraycopy) check.
11681     if (UnsafeMemoryAccess::_table == nullptr) {
11682       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11683     }
11684 
11685     if (UseCRC32Intrinsics) {
11686       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11687     }
11688 
11689     if (UseCRC32CIntrinsics) {
11690       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11691     }
11692 
11693     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11694       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11695     }
11696 
11697     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11698       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11699     }
11700 
11701     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11702         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11703       StubRoutines::_hf2f = generate_float16ToFloat();
11704       StubRoutines::_f2hf = generate_floatToFloat16();
11705     }
11706   }
11707 
11708   void generate_continuation_stubs() {
11709     // Continuation stubs:
11710     StubRoutines::_cont_thaw          = generate_cont_thaw();
11711     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11712     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11713     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11714   }
11715 
11716   void generate_final_stubs() {
11717     // support for verify_oop (must happen after universe_init)
11718     if (VerifyOops) {
11719       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11720     }
11721 
11722     // arraycopy stubs used by compilers
11723     generate_arraycopy_stubs();
11724 
11725     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11726 
11727     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11728 
11729     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11730     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11731 
11732 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11733 
11734     generate_atomic_entry_points();
11735 
#endif // LINUX && !__ARM_FEATURE_ATOMICS
11737 
11738 #ifdef COMPILER2
11739     if (UseSecondarySupersTable) {
11740       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11741       if (! InlineSecondarySupersTest) {
11742         generate_lookup_secondary_supers_table_stub();
11743       }
11744     }
11745 #endif
11746 
11747     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11748 
    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11750   }
11751 
11752   void generate_compiler_stubs() {
11753 #if COMPILER2_OR_JVMCI
11754 
11755     if (UseSVE == 0) {
11756       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11757     }
11758 
11759     // array equals stub for large arrays.
11760     if (!UseSimpleArrayEquals) {
11761       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11762     }
11763 
11764     // arrays_hascode stub for large arrays.
11765     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11766     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11767     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11768     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11769     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11770 
11771     // byte_array_inflate stub for large arrays.
11772     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11773 
11774     // countPositives stub for large arrays.
11775     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11776 
11777     generate_compare_long_strings();
11778 
11779     generate_string_indexof_stubs();
11780 
11781 #ifdef COMPILER2
11782     if (UseMultiplyToLenIntrinsic) {
11783       StubRoutines::_multiplyToLen = generate_multiplyToLen();
11784     }
11785 
11786     if (UseSquareToLenIntrinsic) {
11787       StubRoutines::_squareToLen = generate_squareToLen();
11788     }
11789 
11790     if (UseMulAddIntrinsic) {
11791       StubRoutines::_mulAdd = generate_mulAdd();
11792     }
11793 
11794     if (UseSIMDForBigIntegerShiftIntrinsics) {
11795       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11796       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
11797     }
11798 
11799     if (UseMontgomeryMultiplyIntrinsic) {
11800       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
11801       StubCodeMark mark(this, stub_id);
11802       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11803       StubRoutines::_montgomeryMultiply = g.generate_multiply();
11804     }
11805 
11806     if (UseMontgomerySquareIntrinsic) {
11807       StubId stub_id = StubId::stubgen_montgomerySquare_id;
11808       StubCodeMark mark(this, stub_id);
11809       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11810       // We use generate_multiply() rather than generate_square()
11811       // because it's faster for the sizes of modulus we care about.
11812       StubRoutines::_montgomerySquare = g.generate_multiply();
11813     }
11814 
11815 #endif // COMPILER2
11816 
11817     if (UseChaCha20Intrinsics) {
11818       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11819     }
11820 
11821     if (UseKyberIntrinsics) {
11822       StubRoutines::_kyberNtt = generate_kyberNtt();
11823       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11824       StubRoutines::_kyberNttMult = generate_kyberNttMult();
11825       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11826       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11827       StubRoutines::_kyber12To16 = generate_kyber12To16();
11828       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11829     }
11830 
11831     if (UseDilithiumIntrinsics) {
11832       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11833       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11834       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11835       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11836       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11837     }
11838 
11839     if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11842     }
11843 
11844     // data cache line writeback
11845     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11846     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11847 
11848     if (UseAESIntrinsics) {
11849       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11850       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11851       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11852       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11853       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11854     }
11855     if (UseGHASHIntrinsics) {
11856       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11857       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11858     }
11859     if (UseAESIntrinsics && UseGHASHIntrinsics) {
11860       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11861     }
11862 
11863     if (UseMD5Intrinsics) {
11864       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
11865       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
11866     }
11867     if (UseSHA1Intrinsics) {
11868       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
11869       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
11870     }
11871     if (UseSHA256Intrinsics) {
11872       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
11873       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
11874     }
11875     if (UseSHA512Intrinsics) {
11876       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
11877       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
11878     }
    if (UseSHA3Intrinsics) {
11881       StubRoutines::_double_keccak         = generate_double_keccak();
11882       if (UseSIMDForSHA3Intrinsic) {
11883          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
11884          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
11885       } else {
11886          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
11887          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
11888       }
11889     }
11890 
11891     if (UsePoly1305Intrinsics) {
11892       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11893     }
11894 
11895     // generate Adler32 intrinsics code
11896     if (UseAdler32Intrinsics) {
11897       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11898     }
11899 
11900 #endif // COMPILER2_OR_JVMCI
11901   }
11902 
11903  public:
11904   StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
11905     switch(blob_id) {
11906     case BlobId::stubgen_preuniverse_id:
11907       generate_preuniverse_stubs();
11908       break;
11909     case BlobId::stubgen_initial_id:
11910       generate_initial_stubs();
11911       break;
    case BlobId::stubgen_continuation_id:
11913       generate_continuation_stubs();
11914       break;
11915     case BlobId::stubgen_compiler_id:
11916       generate_compiler_stubs();
11917       break;
11918     case BlobId::stubgen_final_id:
11919       generate_final_stubs();
11920       break;
11921     default:
11922       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
11923       break;
11924     };
11925   }
11926 }; // end class declaration
11927 
11928 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
11929   StubGenerator g(code, blob_id);
11930 }
11931 
11932 
11933 #if defined (LINUX)
11934 
11935 // Define pointers to atomic stubs and initialize them to point to the
11936 // code in atomic_aarch64.S.
11937 
11938 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
11939   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
11940     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
11941   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
11942     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
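// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) declares the routine
// aarch64_atomic_fetch_add_4_default_impl (defined in atomic_aarch64.S)
// and defines the stub pointer aarch64_atomic_fetch_add_4_impl, initially
// pointing at it.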
11943 
11944 DEFAULT_ATOMIC_OP(fetch_add, 4, )
11945 DEFAULT_ATOMIC_OP(fetch_add, 8, )
11946 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
11947 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
11948 DEFAULT_ATOMIC_OP(xchg, 4, )
11949 DEFAULT_ATOMIC_OP(xchg, 8, )
11950 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
11951 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
11952 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
11953 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
11954 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
11955 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
11956 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
11957 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
11958 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
11959 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
11960 
11961 #undef DEFAULT_ATOMIC_OP
11962 
11963 #endif // LINUX