1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomic.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
   87 #ifdef PRODUCT
   88 #define inc_counter_np(counter) ((void)0)
   89 #else
   90   void inc_counter_np_(uint& counter) {
   91     __ incrementw(ExternalAddress((address)&counter));
   92   }
   93 #define inc_counter_np(counter) \
   94   BLOCK_COMMENT("inc_counter " #counter); \
   95   inc_counter_np_(counter);
   96 #endif
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
  112   //
  113   // we save r30 (lr) as the return PC at the base of the frame and
  114   // link r29 (fp) below it as the frame pointer installing sp (r31)
  115   // into fp.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  124   // we don't need to save r8 which C uses as an indirect result location
  125   // return register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
  130   // we don't need to save r16-18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
  144   // -28 [ saved Floating-point Control Register ]
  145   // -26 [ saved v15            ] <--- sp_after_call
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
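  // For reference, the C++ caller (JavaCalls::call_helper) reaches this
  // code through the CallStub function pointer published as
  // StubRoutines::call_stub(); a sketch of the call, matching the
  // argument list above:
  //
  //   StubRoutines::call_stub()(call_wrapper_address,  // c_rarg0
  //                             result_address,        // c_rarg1
  //                             result_type,           // c_rarg2
  //                             method,                // c_rarg3
  //                             entry_point,           // c_rarg4
  //                             parameters,            // c_rarg5
  //                             size_of_parameters,    // c_rarg6
  //                             thread);               // c_rarg7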
  173 
  174   // Call stub stack layout word offsets from fp
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
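  // The offsets above are in words relative to rfp; e.g. thread_off = -1
  // names the 64-bit slot at rfp - 1 * wordSize = rfp - 8, which is how
  // the Address constants in generate_call_stub() below are formed.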
  199 
  200   address generate_call_stub(address& return_address) {
  201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  203            "adjust this code");
  204 
  205     StubGenStubId stub_id = StubGenStubId::call_stub_id;
  206     StubCodeMark mark(this, stub_id);
  207     address start = __ pc();
  208 
  209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  210 
  211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  213     const Address result        (rfp, result_off         * wordSize);
  214     const Address result_type   (rfp, result_type_off    * wordSize);
  215     const Address method        (rfp, method_off         * wordSize);
  216     const Address entry_point   (rfp, entry_point_off    * wordSize);
  217     const Address parameter_size(rfp, parameter_size_off * wordSize);
  218 
  219     const Address thread        (rfp, thread_off         * wordSize);
  220 
  221     const Address d15_save      (rfp, d15_off * wordSize);
  222     const Address d13_save      (rfp, d13_off * wordSize);
  223     const Address d11_save      (rfp, d11_off * wordSize);
  224     const Address d9_save       (rfp, d9_off * wordSize);
  225 
  226     const Address r28_save      (rfp, r28_off * wordSize);
  227     const Address r26_save      (rfp, r26_off * wordSize);
  228     const Address r24_save      (rfp, r24_off * wordSize);
  229     const Address r22_save      (rfp, r22_off * wordSize);
  230     const Address r20_save      (rfp, r20_off * wordSize);
  231 
  232     // stub code
  233 
  234     address aarch64_entry = __ pc();
  235 
  236     // set up frame and move sp to end of save area
  237     __ enter();
  238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  239 
  240     // save register parameters and Java scratch/global registers
  241     // n.b. we save thread even though it gets installed in
  242     // rthread because we want to sanity check rthread later
  243     __ str(c_rarg7,  thread);
  244     __ strw(c_rarg6, parameter_size);
  245     __ stp(c_rarg4, c_rarg5,  entry_point);
  246     __ stp(c_rarg2, c_rarg3,  result_type);
  247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  248 
  249     __ stp(r20, r19,   r20_save);
  250     __ stp(r22, r21,   r22_save);
  251     __ stp(r24, r23,   r24_save);
  252     __ stp(r26, r25,   r26_save);
  253     __ stp(r28, r27,   r28_save);
  254 
  255     __ stpd(v9,  v8,   d9_save);
  256     __ stpd(v11, v10,  d11_save);
  257     __ stpd(v13, v12,  d13_save);
  258     __ stpd(v15, v14,  d15_save);
  259 
  260     __ get_fpcr(rscratch1);
  261     __ str(rscratch1, fpcr_save);
  262     // Set FPCR to the state we need. We do want Round to Nearest. We
  263     // don't want non-IEEE rounding modes or floating-point traps.
  264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  266     __ set_fpcr(rscratch1);
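    // Net effect: new_fpcr = old_fpcr & ~0x03C01F00, i.e. bits 25..22
    // (DN, FZ and RMode, where RMode == 0 is round-to-nearest) and the
    // trap-enable bits 12..8 (IXE, UFE, OFE, DZE, IOE) are all zero.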
  267 
  268     // install Java thread in global register now we have saved
  269     // whatever value it held
  270     __ mov(rthread, c_rarg7);
  271     // And method
  272     __ mov(rmethod, c_rarg3);
  273 
  274     // set up the heapbase register
  275     __ reinit_heapbase();
  276 
  277 #ifdef ASSERT
  278     // make sure we have no pending exceptions
  279     {
  280       Label L;
  281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  282       __ cmp(rscratch1, (u1)NULL_WORD);
  283       __ br(Assembler::EQ, L);
  284       __ stop("StubRoutines::call_stub: entered with pending exception");
  285       __ BIND(L);
  286     }
  287 #endif
  288     // pass parameters if any
  289     __ mov(esp, sp);
  290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  291     __ andr(sp, rscratch1, -2 * wordSize);
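    // e.g. with 3 parameter words rscratch1 == sp - 24; and-ing with
    // -2 * wordSize (-16) rounds that down to the next 16-byte boundary,
    // keeping the machine sp 16-byte aligned as AArch64 requires.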
  292 
  293     BLOCK_COMMENT("pass parameters if any");
  294     Label parameters_done;
  295     // parameter count is still in c_rarg6
  296     // and parameter pointer identifying param 1 is in c_rarg5
  297     __ cbzw(c_rarg6, parameters_done);
  298 
  299     address loop = __ pc();
  300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  301     __ subsw(c_rarg6, c_rarg6, 1);
  302     __ push(rscratch1);
  303     __ br(Assembler::GT, loop);
  304 
  305     __ BIND(parameters_done);
  306 
    // call Java entry -- passing Method* and current sp
  308     //      rmethod: Method*
  309     //      r19_sender_sp: sender sp
  310     BLOCK_COMMENT("call Java function");
  311     __ mov(r19_sender_sp, sp);
  312     __ blr(c_rarg4);
  313 
  314     // we do this here because the notify will already have been done
  315     // if we get to the next instruction via an exception
  316     //
  317     // n.b. adding this instruction here affects the calculation of
  318     // whether or not a routine returns to the call stub (used when
  319     // doing stack walks) since the normal test is to check the return
  320     // pc against the address saved below. so we may need to allow for
  321     // this extra instruction in the check.
  322 
  323     // save current address for use by exception handling code
  324 
  325     return_address = __ pc();
  326 
  327     // store result depending on type (everything that is not
  328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  329     // n.b. this assumes Java returns an integral result in r0
  330     // and a floating result in j_farg0
  331     __ ldr(j_rarg2, result);
  332     Label is_long, is_float, is_double, exit;
  333     __ ldr(j_rarg1, result_type);
  334     __ cmp(j_rarg1, (u1)T_OBJECT);
  335     __ br(Assembler::EQ, is_long);
  336     __ cmp(j_rarg1, (u1)T_LONG);
  337     __ br(Assembler::EQ, is_long);
  338     __ cmp(j_rarg1, (u1)T_FLOAT);
  339     __ br(Assembler::EQ, is_float);
  340     __ cmp(j_rarg1, (u1)T_DOUBLE);
  341     __ br(Assembler::EQ, is_double);
  342 
  343     // handle T_INT case
  344     __ strw(r0, Address(j_rarg2));
  345 
  346     __ BIND(exit);
  347 
  348     // pop parameters
  349     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  350 
  351 #ifdef ASSERT
  352     // verify that threads correspond
  353     {
  354       Label L, S;
  355       __ ldr(rscratch1, thread);
  356       __ cmp(rthread, rscratch1);
  357       __ br(Assembler::NE, S);
  358       __ get_thread(rscratch1);
  359       __ cmp(rthread, rscratch1);
  360       __ br(Assembler::EQ, L);
  361       __ BIND(S);
  362       __ stop("StubRoutines::call_stub: threads must correspond");
  363       __ BIND(L);
  364     }
  365 #endif
  366 
  367     __ pop_cont_fastpath(rthread);
  368 
  369     // restore callee-save registers
  370     __ ldpd(v15, v14,  d15_save);
  371     __ ldpd(v13, v12,  d13_save);
  372     __ ldpd(v11, v10,  d11_save);
  373     __ ldpd(v9,  v8,   d9_save);
  374 
  375     __ ldp(r28, r27,   r28_save);
  376     __ ldp(r26, r25,   r26_save);
  377     __ ldp(r24, r23,   r24_save);
  378     __ ldp(r22, r21,   r22_save);
  379     __ ldp(r20, r19,   r20_save);
  380 
  381     // restore fpcr
  382     __ ldr(rscratch1,  fpcr_save);
  383     __ set_fpcr(rscratch1);
  384 
  385     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  386     __ ldrw(c_rarg2, result_type);
  387     __ ldr(c_rarg3,  method);
  388     __ ldp(c_rarg4, c_rarg5,  entry_point);
  389     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  390 
  391     // leave frame and return to caller
  392     __ leave();
  393     __ ret(lr);
  394 
  395     // handle return types different from T_INT
  396 
  397     __ BIND(is_long);
  398     __ str(r0, Address(j_rarg2, 0));
  399     __ br(Assembler::AL, exit);
  400 
  401     __ BIND(is_float);
  402     __ strs(j_farg0, Address(j_rarg2, 0));
  403     __ br(Assembler::AL, exit);
  404 
  405     __ BIND(is_double);
  406     __ strd(j_farg0, Address(j_rarg2, 0));
  407     __ br(Assembler::AL, exit);
  408 
  409     return start;
  410   }
  411 
  412   // Return point for a Java call if there's an exception thrown in
  413   // Java code.  The exception is caught and transformed into a
  414   // pending exception stored in JavaThread that can be tested from
  415   // within the VM.
  416   //
  417   // Note: Usually the parameters are removed by the callee. In case
  418   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
  421   //
  422   // r0: exception oop
  423 
  424   address generate_catch_exception() {
  425     StubGenStubId stub_id = StubGenStubId::catch_exception_id;
  426     StubCodeMark mark(this, stub_id);
  427     address start = __ pc();
  428 
  429     // same as in generate_call_stub():
  430     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  431     const Address thread        (rfp, thread_off         * wordSize);
  432 
  433 #ifdef ASSERT
  434     // verify that threads correspond
  435     {
  436       Label L, S;
  437       __ ldr(rscratch1, thread);
  438       __ cmp(rthread, rscratch1);
  439       __ br(Assembler::NE, S);
  440       __ get_thread(rscratch1);
  441       __ cmp(rthread, rscratch1);
  442       __ br(Assembler::EQ, L);
  443       __ bind(S);
  444       __ stop("StubRoutines::catch_exception: threads must correspond");
  445       __ bind(L);
  446     }
  447 #endif
  448 
  449     // set pending exception
  450     __ verify_oop(r0);
  451 
  452     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  453     __ mov(rscratch1, (address)__FILE__);
  454     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  455     __ movw(rscratch1, (int)__LINE__);
  456     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  457 
  458     // complete return to VM
  459     assert(StubRoutines::_call_stub_return_address != nullptr,
  460            "_call_stub_return_address must have been generated before");
  461     __ b(StubRoutines::_call_stub_return_address);
  462 
  463     return start;
  464   }
  465 
  466   // Continuation point for runtime calls returning with a pending
  467   // exception.  The pending exception check happened in the runtime
  468   // or native call stub.  The pending exception in Thread is
  469   // converted into a Java-level exception.
  470   //
  471   // Contract with Java-level exception handlers:
  472   // r0: exception
  473   // r3: throwing pc
  474   //
  475   // NOTE: At entry of this stub, exception-pc must be in LR !!
  476 
  477   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
  479 
  480   address generate_forward_exception() {
  481     StubGenStubId stub_id = StubGenStubId::forward_exception_id;
  482     StubCodeMark mark(this, stub_id);
  483     address start = __ pc();
  484 
  485     // Upon entry, LR points to the return address returning into
  486     // Java (interpreted or compiled) code; i.e., the return address
  487     // becomes the throwing pc.
  488     //
  489     // Arguments pushed before the runtime call are still on the stack
  490     // but the exception handler will reset the stack pointer ->
  491     // ignore them.  A potential result in registers can be ignored as
  492     // well.
  493 
  494 #ifdef ASSERT
  495     // make sure this code is only executed if there is a pending exception
  496     {
  497       Label L;
  498       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  499       __ cbnz(rscratch1, L);
  500       __ stop("StubRoutines::forward exception: no pending exception (1)");
  501       __ bind(L);
  502     }
  503 #endif
  504 
  505     // compute exception handler into r19
  506 
  507     // call the VM to find the handler address associated with the
  508     // caller address. pass thread in r0 and caller pc (ret address)
  509     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  510     // the stack.
  511     __ mov(c_rarg1, lr);
  512     // lr will be trashed by the VM call so we move it to R19
  513     // (callee-saved) because we also need to pass it to the handler
  514     // returned by this call.
  515     __ mov(r19, lr);
  516     BLOCK_COMMENT("call exception_handler_for_return_address");
  517     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  518                          SharedRuntime::exception_handler_for_return_address),
  519                     rthread, c_rarg1);
  520     // Reinitialize the ptrue predicate register, in case the external runtime
  521     // call clobbers ptrue reg, as we may return to SVE compiled code.
  522     __ reinitialize_ptrue();
  523 
  524     // we should not really care that lr is no longer the callee
  525     // address. we saved the value the handler needs in r19 so we can
  526     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
  528     // the PC for the frame above the handler belongs to a compiled
  529     // Java method. So, we restore lr here to satisfy that assert.
  530     __ mov(lr, r19);
  531     // setup r0 & r3 & clear pending exception
  532     __ mov(r3, r19);
  533     __ mov(r19, r0);
  534     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  535     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  536 
  537 #ifdef ASSERT
  538     // make sure exception is set
  539     {
  540       Label L;
  541       __ cbnz(r0, L);
  542       __ stop("StubRoutines::forward exception: no pending exception (2)");
  543       __ bind(L);
  544     }
  545 #endif
  546 
  547     // continue at exception handler
  548     // r0: exception
  549     // r3: throwing pc
  550     // r19: exception handler
  551     __ verify_oop(r0);
  552     __ br(r19);
  553 
  554     return start;
  555   }
  556 
  557   // Non-destructive plausibility checks for oops
  558   //
  559   // Arguments:
  560   //    r0: oop to verify
  561   //    rscratch1: error message
  562   //
  563   // Stack after saving c_rarg3:
  564   //    [tos + 0]: saved c_rarg3
  565   //    [tos + 1]: saved c_rarg2
  566   //    [tos + 2]: saved lr
  567   //    [tos + 3]: saved rscratch2
  568   //    [tos + 4]: saved r0
  569   //    [tos + 5]: saved rscratch1
  570   address generate_verify_oop() {
  571     StubGenStubId stub_id = StubGenStubId::verify_oop_id;
  572     StubCodeMark mark(this, stub_id);
  573     address start = __ pc();
  574 
  575     Label exit, error;
  576 
  577     // save c_rarg2 and c_rarg3
  578     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  579 
  580     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  581     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  582     __ ldr(c_rarg3, Address(c_rarg2));
  583     __ add(c_rarg3, c_rarg3, 1);
  584     __ str(c_rarg3, Address(c_rarg2));
  585 
  586     // object is in r0
  587     // make sure object is 'reasonable'
  588     __ cbz(r0, exit); // if obj is null it is OK
  589 
  590     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  591     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  592 
  593     // return if everything seems ok
  594     __ bind(exit);
  595 
  596     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  597     __ ret(lr);
  598 
  599     // handle errors
  600     __ bind(error);
  601     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  602 
  603     __ push(RegSet::range(r0, r29), sp);
  604     // debug(char* msg, int64_t pc, int64_t regs[])
  605     __ mov(c_rarg0, rscratch1);      // pass address of error message
  606     __ mov(c_rarg1, lr);             // pass return address
  607     __ mov(c_rarg2, sp);             // pass address of regs on stack
  608 #ifndef PRODUCT
  609     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  610 #endif
  611     BLOCK_COMMENT("call MacroAssembler::debug");
  612     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  613     __ blr(rscratch1);
  614     __ hlt(0);
  615 
  616     return start;
  617   }
  618 
  619   // Generate indices for iota vector.
  620   address generate_iota_indices(StubGenStubId stub_id) {
  621     __ align(CodeEntryAlignment);
  622     StubCodeMark mark(this, stub_id);
  623     address start = __ pc();
  624     // B
  625     __ emit_data64(0x0706050403020100, relocInfo::none);
  626     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  627     // H
  628     __ emit_data64(0x0003000200010000, relocInfo::none);
  629     __ emit_data64(0x0007000600050004, relocInfo::none);
  630     // S
  631     __ emit_data64(0x0000000100000000, relocInfo::none);
  632     __ emit_data64(0x0000000300000002, relocInfo::none);
  633     // D
  634     __ emit_data64(0x0000000000000000, relocInfo::none);
  635     __ emit_data64(0x0000000000000001, relocInfo::none);
  636     // S - FP
  637     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  638     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  639     // D - FP
  640     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  641     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  642     return start;
  643   }
  644 
  645   // The inner part of zero_words().  This is the bulk operation,
  646   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  647   // caller is responsible for zeroing the last few words.
  648   //
  649   // Inputs:
  650   // r10: the HeapWord-aligned base address of an array to zero.
  651   // r11: the count in HeapWords, r11 > 0.
  652   //
  653   // Returns r10 and r11, adjusted for the caller to clear.
  654   // r10: the base address of the tail of words left to clear.
  655   // r11: the number of words in the tail.
  656   //      r11 < MacroAssembler::zero_words_block_size.
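  // A sketch of the intended use (the caller is MacroAssembler::zero_words):
  // move the base address into r10 and the word count into r11, call this
  // stub, then clear the fewer-than-zero_words_block_size words that remain
  // in r10/r11.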
  657 
  658   address generate_zero_blocks() {
  659     Label done;
  660     Label base_aligned;
  661 
  662     Register base = r10, cnt = r11;
  663 
  664     __ align(CodeEntryAlignment);
  665     StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
  666     StubCodeMark mark(this, stub_id);
  667     address start = __ pc();
  668 
  669     if (UseBlockZeroing) {
  670       int zva_length = VM_Version::zva_length();
  671 
  672       // Ensure ZVA length can be divided by 16. This is required by
  673       // the subsequent operations.
  674       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  675 
  676       __ tbz(base, 3, base_aligned);
  677       __ str(zr, Address(__ post(base, 8)));
  678       __ sub(cnt, cnt, 1);
  679       __ bind(base_aligned);
  680 
  681       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  682       // alignment.
  683       Label small;
  684       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
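      // cnt is in words while low_limit is in bytes, hence the >> 3 in
      // the comparison below; e.g. with a 64-byte ZVA line low_limit is
      // at least 128 bytes, i.e. 16 words.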
  685       __ subs(rscratch1, cnt, low_limit >> 3);
  686       __ br(Assembler::LT, small);
  687       __ zero_dcache_blocks(base, cnt);
  688       __ bind(small);
  689     }
  690 
  691     {
  692       // Number of stp instructions we'll unroll
  693       const int unroll =
  694         MacroAssembler::zero_words_block_size / 2;
  695       // Clear the remaining blocks.
  696       Label loop;
  697       __ subs(cnt, cnt, unroll * 2);
  698       __ br(Assembler::LT, done);
  699       __ bind(loop);
  700       for (int i = 0; i < unroll; i++)
  701         __ stp(zr, zr, __ post(base, 16));
  702       __ subs(cnt, cnt, unroll * 2);
  703       __ br(Assembler::GE, loop);
  704       __ bind(done);
  705       __ add(cnt, cnt, unroll * 2);
  706     }
  707 
  708     __ ret(lr);
  709 
  710     return start;
  711   }
  712 
  713 
  714   typedef enum {
  715     copy_forwards = 1,
  716     copy_backwards = -1
  717   } copy_direction;
  718 
  719   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  720   // for arraycopy stubs.
  721   class ArrayCopyBarrierSetHelper : StackObj {
  722     BarrierSetAssembler* _bs_asm;
  723     MacroAssembler* _masm;
  724     DecoratorSet _decorators;
  725     BasicType _type;
  726     Register _gct1;
  727     Register _gct2;
  728     Register _gct3;
  729     FloatRegister _gcvt1;
  730     FloatRegister _gcvt2;
  731     FloatRegister _gcvt3;
  732 
  733   public:
  734     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  735                               DecoratorSet decorators,
  736                               BasicType type,
  737                               Register gct1,
  738                               Register gct2,
  739                               Register gct3,
  740                               FloatRegister gcvt1,
  741                               FloatRegister gcvt2,
  742                               FloatRegister gcvt3)
  743       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  744         _masm(masm),
  745         _decorators(decorators),
  746         _type(type),
  747         _gct1(gct1),
  748         _gct2(gct2),
  749         _gct3(gct3),
  750         _gcvt1(gcvt1),
  751         _gcvt2(gcvt2),
  752         _gcvt3(gcvt3) {
  753     }
  754 
  755     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  756       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  757                             dst1, dst2, src,
  758                             _gct1, _gct2, _gcvt1);
  759     }
  760 
  761     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  762       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  763                              dst, src1, src2,
  764                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  765     }
  766 
  767     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  768       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  769                             dst1, dst2, src,
  770                             _gct1);
  771     }
  772 
  773     void copy_store_at_16(Address dst, Register src1, Register src2) {
  774       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  775                              dst, src1, src2,
  776                              _gct1, _gct2, _gct3);
  777     }
  778 
  779     void copy_load_at_8(Register dst, Address src) {
  780       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  781                             dst, noreg, src,
  782                             _gct1);
  783     }
  784 
  785     void copy_store_at_8(Address dst, Register src) {
  786       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  787                              dst, src, noreg,
  788                              _gct1, _gct2, _gct3);
  789     }
  790   };
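  // The gct*/gcvt* arguments are temporary registers that the GC's
  // barrier-set assembler may clobber while implementing the decorated
  // loads and stores; the copy stubs pass registers that are otherwise
  // unused at the copy site (see generate_copy_longs below).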
  791 
  792   // Bulk copy of blocks of 8 words.
  793   //
  794   // count is a count of words.
  795   //
  796   // Precondition: count >= 8
  797   //
  798   // Postconditions:
  799   //
  800   // The least significant bit of count contains the remaining count
  801   // of words to copy.  The rest of count is trash.
  802   //
  803   // s and d are adjusted to point to the remaining words to copy
  804   //
  805   void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
  806     BasicType type;
  807     copy_direction direction;
  808 
  809     switch (stub_id) {
  810     case copy_byte_f_id:
  811       direction = copy_forwards;
  812       type = T_BYTE;
  813       break;
  814     case copy_byte_b_id:
  815       direction = copy_backwards;
  816       type = T_BYTE;
  817       break;
  818     case copy_oop_f_id:
  819       direction = copy_forwards;
  820       type = T_OBJECT;
  821       break;
  822     case copy_oop_b_id:
  823       direction = copy_backwards;
  824       type = T_OBJECT;
  825       break;
  826     case copy_oop_uninit_f_id:
  827       direction = copy_forwards;
  828       type = T_OBJECT;
  829       break;
  830     case copy_oop_uninit_b_id:
  831       direction = copy_backwards;
  832       type = T_OBJECT;
  833       break;
  834     default:
  835       ShouldNotReachHere();
  836     }
  837 
  838     int unit = wordSize * direction;
  839     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
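    // For forward copies, bias is subtracted from s and d below so that
    // the first access at offset 2 * unit (or 4 * unit with SIMD pairs)
    // addresses the original start of the block.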
  840 
  841     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  842       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  843     const Register stride = r14;
  844     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  845     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  846     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  847 
  848     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  849     assert_different_registers(s, d, count, rscratch1, rscratch2);
  850 
  851     Label again, drain;
  852 
  853     __ align(CodeEntryAlignment);
  854 
  855     StubCodeMark mark(this, stub_id);
  856 
  857     __ bind(start);
  858 
  859     Label unaligned_copy_long;
  860     if (AvoidUnalignedAccesses) {
  861       __ tbnz(d, 3, unaligned_copy_long);
  862     }
  863 
  864     if (direction == copy_forwards) {
  865       __ sub(s, s, bias);
  866       __ sub(d, d, bias);
  867     }
  868 
  869 #ifdef ASSERT
  870     // Make sure we are never given < 8 words
  871     {
  872       Label L;
  873       __ cmp(count, (u1)8);
  874       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
  876       __ bind(L);
  877     }
  878 #endif
  879 
  880     // Fill 8 registers
  881     if (UseSIMDForMemoryOps) {
  882       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  883       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  884     } else {
  885       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  886       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  887       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  888       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  889     }
  890 
  891     __ subs(count, count, 16);
  892     __ br(Assembler::LO, drain);
  893 
  894     int prefetch = PrefetchCopyIntervalInBytes;
  895     bool use_stride = false;
  896     if (direction == copy_backwards) {
  897        use_stride = prefetch > 256;
  898        prefetch = -prefetch;
  899        if (use_stride) __ mov(stride, prefetch);
  900     }
  901 
  902     __ bind(again);
  903 
  904     if (PrefetchCopyIntervalInBytes > 0)
  905       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  906 
  907     if (UseSIMDForMemoryOps) {
  908       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  909       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  910       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  911       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  912     } else {
  913       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  914       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  915       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  916       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  917       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  918       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  919       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  920       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  921     }
  922 
  923     __ subs(count, count, 8);
  924     __ br(Assembler::HS, again);
  925 
  926     // Drain
  927     __ bind(drain);
  928     if (UseSIMDForMemoryOps) {
  929       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  930       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  931     } else {
  932       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  933       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  934       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936     }
  937 
  938     {
  939       Label L1, L2;
  940       __ tbz(count, exact_log2(4), L1);
  941       if (UseSIMDForMemoryOps) {
  942         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  943         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  944       } else {
  945         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  946         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  947         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  948         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  949       }
  950       __ bind(L1);
  951 
  952       if (direction == copy_forwards) {
  953         __ add(s, s, bias);
  954         __ add(d, d, bias);
  955       }
  956 
  957       __ tbz(count, 1, L2);
  958       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  959       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  960       __ bind(L2);
  961     }
  962 
  963     __ ret(lr);
  964 
  965     if (AvoidUnalignedAccesses) {
  966       Label drain, again;
  967       // Register order for storing. Order is different for backward copy.
  968 
  969       __ bind(unaligned_copy_long);
  970 
      // source address is even-word (16 byte) aligned, target is odd-word aligned
  972       //
  973       // when forward copying word pairs we read long pairs at offsets
  974       // {0, 2, 4, 6} (in long words). when backwards copying we read
  975       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  976       // address by -2 in the forwards case so we can compute the
  977       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  978       // or -1.
  979       //
  980       // when forward copying we need to store 1 word, 3 pairs and
  981       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 (one word), which
      // means we have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
  984       //
      // When backwards copying we need to store 1 word, 3 pairs and
  986       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
  987       // offsets {1, 3, 5, 7, 8} * unit.
  988 
  989       if (direction == copy_forwards) {
  990         __ sub(s, s, 16);
  991         __ sub(d, d, 8);
  992       }
  993 
  994       // Fill 8 registers
  995       //
  996       // for forwards copy s was offset by -16 from the original input
  997       // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
  999       // and so on for each successive 64 byte block when s is updated
 1000       //
 1001       // t0 at offset 0,  t1 at offset 8
 1002       // t2 at offset 16, t3 at offset 24
 1003       // t4 at offset 32, t5 at offset 40
 1004       // t6 at offset 48, t7 at offset 56
 1005 
 1006       // for backwards copy s was not offset so the register contents
 1007       // are at these offsets into the preceding 64 byte block
 1008       // relative to that original input and so on for each successive
 1009       // preceding 64 byte block when s is updated. this explains the
 1010       // slightly counter-intuitive looking pattern of register usage
 1011       // in the stp instructions for backwards copy.
 1012       //
 1013       // t0 at offset -16, t1 at offset -8
 1014       // t2 at offset -32, t3 at offset -24
 1015       // t4 at offset -48, t5 at offset -40
 1016       // t6 at offset -64, t7 at offset -56
 1017 
 1018       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1019       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1020       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1021       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1022 
 1023       __ subs(count, count, 16);
 1024       __ br(Assembler::LO, drain);
 1025 
 1026       int prefetch = PrefetchCopyIntervalInBytes;
 1027       bool use_stride = false;
 1028       if (direction == copy_backwards) {
 1029          use_stride = prefetch > 256;
 1030          prefetch = -prefetch;
 1031          if (use_stride) __ mov(stride, prefetch);
 1032       }
 1033 
 1034       __ bind(again);
 1035 
 1036       if (PrefetchCopyIntervalInBytes > 0)
 1037         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1038 
 1039       if (direction == copy_forwards) {
 1040        // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
 1042        // offsets
 1043        //
 1044        // t0 at offset 0
 1045        // t1 at offset 8,  t2 at offset 16
 1046        // t3 at offset 24, t4 at offset 32
 1047        // t5 at offset 40, t6 at offset 48
 1048        // t7 at offset 56
 1049 
 1050         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1051         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1052         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1053         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1054         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1055         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1056         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1057         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1058         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1059       } else {
 1060        // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
 1062        // offsets
 1063        //
 1064        // t1 at offset -8
 1065        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 1067        // t7 at offset -56, t4 at offset -48
 1068        //                   t6 at offset -64
 1069        //
 1070        // note that this matches the offsets previously noted for the
 1071        // loads
 1072 
 1073         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1074         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1075         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1076         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1077         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1078         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1079         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1080         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1081         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1082       }
 1083 
 1084       __ subs(count, count, 8);
 1085       __ br(Assembler::HS, again);
 1086 
 1087       // Drain
 1088       //
 1089       // this uses the same pattern of offsets and register arguments
 1090       // as above
 1091       __ bind(drain);
 1092       if (direction == copy_forwards) {
 1093         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1094         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1095         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1096         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1097         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1098       } else {
 1099         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1100         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1101         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1102         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1103         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1104       }
 1105       // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
 1107       // bits 2 and 1 in the count are the tell-tale for whether we
 1108       // have each such subblock
 1109       {
 1110         Label L1, L2;
 1111         __ tbz(count, exact_log2(4), L1);
 1112        // this is the same as above but copying only 4 longs hence
 1113        // with only one intervening stp between the str instructions
 1114        // but note that the offsets and registers still follow the
 1115        // same pattern
 1116         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1117         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1118         if (direction == copy_forwards) {
 1119           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1120           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1121           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1122         } else {
 1123           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1124           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1125           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1126         }
 1127         __ bind(L1);
 1128 
 1129         __ tbz(count, 1, L2);
 1130        // this is the same as above but copying only 2 longs hence
 1131        // there is no intervening stp between the str instructions
 1132        // but note that the offset and register patterns are still
 1133        // the same
 1134         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1135         if (direction == copy_forwards) {
 1136           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1137           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1141         }
 1142         __ bind(L2);
 1143 
 1144        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
 1146 
 1147        if (direction == copy_forwards) {
 1148          __ add(s, s, 16);
 1149          __ add(d, d, 8);
 1150        }
 1151 
 1152       }
 1153 
 1154       __ ret(lr);
 1155       }
 1156   }
 1157 
 1158   // Small copy: less than 16 bytes.
 1159   //
 1160   // NB: Ignores all of the bits of count which represent more than 15
 1161   // bytes, so a caller doesn't have to mask them.
 1162 
 1163   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1164     bool is_backwards = step < 0;
 1165     size_t granularity = g_uabs(step);
 1166     int direction = is_backwards ? -1 : 1;
 1167 
 1168     Label Lword, Lint, Lshort, Lbyte;
 1169 
 1170     assert(granularity
 1171            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1172 
 1173     const Register t0 = r3;
 1174     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1175     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1176 
 1177     // ??? I don't know if this bit-test-and-branch is the right thing
 1178     // to do.  It does a lot of jumping, resulting in several
 1179     // mispredicted branches.  It might make more sense to do this
 1180     // with something like Duff's device with a single computed branch.
 1181 
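    // Each tbz below tests one bit of the residual count, from 8 bytes
    // down to 1 byte; e.g. a byte copy (granularity 1) with count == 13
    // (0b1101) copies 8 + 4 + 1 bytes and skips the 2-byte step.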
 1182     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1183     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1184     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1185     __ bind(Lword);
 1186 
 1187     if (granularity <= sizeof (jint)) {
 1188       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1189       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1190       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1191       __ bind(Lint);
 1192     }
 1193 
 1194     if (granularity <= sizeof (jshort)) {
 1195       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1196       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1197       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1198       __ bind(Lshort);
 1199     }
 1200 
 1201     if (granularity <= sizeof (jbyte)) {
 1202       __ tbz(count, 0, Lbyte);
 1203       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1204       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1205       __ bind(Lbyte);
 1206     }
 1207   }
 1208 
 1209   Label copy_f, copy_b;
 1210   Label copy_obj_f, copy_obj_b;
 1211   Label copy_obj_uninit_f, copy_obj_uninit_b;
 1212 
 1213   // All-singing all-dancing memory copy.
 1214   //
 1215   // Copy count units of memory from s to d.  The size of a unit is
 1216   // step, which can be positive or negative depending on the direction
 1217   // of copy.  If is_aligned is false, we align the source address.
 1218   //
 1219 
 1220   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1221                    Register s, Register d, Register count, int step) {
 1222     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1223     bool is_backwards = step < 0;
 1224     unsigned int granularity = g_uabs(step);
 1225     const Register t0 = r3, t1 = r4;
 1226 
    // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction
    // doesn't matter because we always load all the data before writing
    // anything.
 1229     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1230     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1231     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1232     const Register send = r17, dend = r16;
 1233     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1234     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1235     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1236 
 1237     if (PrefetchCopyIntervalInBytes > 0)
 1238       __ prfm(Address(s, 0), PLDL1KEEP);
 1239     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1240     __ br(Assembler::HI, copy_big);
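    // count is in elements, so the byte thresholds above are scaled by
    // granularity; e.g. for an int copy (granularity 4) anything larger
    // than 20 elements (24 with SIMD) takes the copy_big path.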
 1241 
 1242     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1243     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1244 
 1245     __ cmp(count, u1(16/granularity));
 1246     __ br(Assembler::LS, copy16);
 1247 
 1248     __ cmp(count, u1(64/granularity));
 1249     __ br(Assembler::HI, copy80);
 1250 
 1251     __ cmp(count, u1(32/granularity));
 1252     __ br(Assembler::LS, copy32);
 1253 
 1254     // 33..64 bytes
 1255     if (UseSIMDForMemoryOps) {
 1256       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1257       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1258       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1259       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1260     } else {
 1261       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1262       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1263       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1264       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1265 
 1266       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1267       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1268       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1269       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1270     }
 1271     __ b(finish);
 1272 
 1273     // 17..32 bytes
 1274     __ bind(copy32);
 1275     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1276     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1277 
 1278     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1279     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1280     __ b(finish);
 1281 
 1282     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
 1284     __ bind(copy80);
 1285     if (UseSIMDForMemoryOps) {
 1286       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1287       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1288       // Unaligned pointers can be an issue for copying.
 1289       // The issue has more chances to happen when granularity of data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
 1291       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1292       // The most performance drop has been seen for the range 65-80 bytes.
 1293       // For such cases using the pair of ldp/stp instead of the third pair of
 1294       // ldpq/stpq fixes the performance issue.
 1295       if (granularity < sizeof (jint)) {
 1296         Label copy96;
 1297         __ cmp(count, u1(80/granularity));
 1298         __ br(Assembler::HI, copy96);
 1299         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1300 
 1301         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1302         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1303 
 1304         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1305         __ b(finish);
 1306 
 1307         __ bind(copy96);
 1308       }
 1309       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1310 
 1311       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1312       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1313 
 1314       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1315     } else {
 1316       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1317       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1318       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1319       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1320       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1321 
 1322       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1323       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1324       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1325       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1326       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1327     }
 1328     __ b(finish);
 1329 
 1330     // 0..16 bytes
 1331     __ bind(copy16);
 1332     __ cmp(count, u1(8/granularity));
 1333     __ br(Assembler::LO, copy8);
 1334 
 1335     // 8..16 bytes
 1336     bs.copy_load_at_8(t0, Address(s, 0));
 1337     bs.copy_load_at_8(t1, Address(send, -8));
 1338     bs.copy_store_at_8(Address(d, 0), t0);
 1339     bs.copy_store_at_8(Address(dend, -8), t1);
 1340     __ b(finish);
 1341 
 1342     if (granularity < 8) {
 1343       // 4..7 bytes
 1344       __ bind(copy8);
 1345       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1346       __ ldrw(t0, Address(s, 0));
 1347       __ ldrw(t1, Address(send, -4));
 1348       __ strw(t0, Address(d, 0));
 1349       __ strw(t1, Address(dend, -4));
 1350       __ b(finish);
 1351       if (granularity < 4) {
 1352         // 0..3 bytes
 1353         __ bind(copy4);
 1354         __ cbz(count, finish); // get rid of 0 case
 1355         if (granularity == 2) {
 1356           __ ldrh(t0, Address(s, 0));
 1357           __ strh(t0, Address(d, 0));
 1358         } else { // granularity == 1
 1359           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1360           // the first and last byte.
 1361           // Handle the 3 byte case by loading and storing base + count/2
 1362           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
 1364           // byte 3 times.
 1365           __ lsr(count, count, 1);
 1366           __ ldrb(t0, Address(s, 0));
 1367           __ ldrb(t1, Address(send, -1));
 1368           __ ldrb(t2, Address(s, count));
 1369           __ strb(t0, Address(d, 0));
 1370           __ strb(t1, Address(dend, -1));
 1371           __ strb(t2, Address(d, count));
 1372         }
 1373         __ b(finish);
 1374       }
 1375     }
 1376 
 1377     __ bind(copy_big);
 1378     if (is_backwards) {
 1379       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1380       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1381     }
 1382 
 1383     // Now we've got the small case out of the way we can align the
 1384     // source address on a 2-word boundary.
 1385 
 1386     // Here we materialize a count in r15, which is used by copy_memory_small
 1387     // and the various generate_copy_longs stubs for the 2-word-aligned bulk copy.
 1388     // Up until here we have used t9, which aliases r15, but from here on that
 1389     // register cannot be used as a temp register, as it holds the count.
 1390 
 1391     Label aligned;
 1392 
 1393     if (is_aligned) {
 1394       // We may have to adjust by 1 word to get s 2-word-aligned.
 1395       __ tbz(s, exact_log2(wordSize), aligned);
 1396       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1397       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1398       __ sub(count, count, wordSize/granularity);
 1399     } else {
 1400       if (is_backwards) {
 1401         __ andr(r15, s, 2 * wordSize - 1);
 1402       } else {
 1403         __ neg(r15, s);
 1404         __ andr(r15, r15, 2 * wordSize - 1);
 1405       }
 1406       // r15 is the byte adjustment needed to align s.
 1407       __ cbz(r15, aligned);
 1408       int shift = exact_log2(granularity);
 1409       if (shift > 0) {
 1410         __ lsr(r15, r15, shift);
 1411       }
 1412       __ sub(count, count, r15);
 1413 
 1414 #if 0
 1415       // ?? This code is only correct for a disjoint copy.  It may or
 1416       // may not make sense to use it in that case.
 1417 
 1418       // Copy the first pair; s and d may not be aligned.
 1419       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1420       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1421 
 1422       // Align s and d, adjust count
 1423       if (is_backwards) {
 1424         __ sub(s, s, r15);
 1425         __ sub(d, d, r15);
 1426       } else {
 1427         __ add(s, s, r15);
 1428         __ add(d, d, r15);
 1429       }
 1430 #else
 1431       copy_memory_small(decorators, type, s, d, r15, step);
 1432 #endif
 1433     }
 1434 
 1435     __ bind(aligned);
 1436 
 1437     // s is now 2-word-aligned.
 1438 
 1439     // We have a count of units and some trailing bytes. Adjust the
 1440     // count and do a bulk copy of words. If the shift is zero
 1441     // perform a move instead to benefit from zero latency moves.
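          // A worked example (illustrative): for a byte copy granularity == 1,
          // so shift == 3 and r15 = count >> 3 gives the number of 8-byte words
          // handed to the bulk copy, with copy_memory_small handling whatever
          // tail remains; for a long or uncompressed-oop copy granularity == 8,
          // shift == 0 and count is already a word count, so it is just moved.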
 1442     int shift = exact_log2(wordSize/granularity);
 1443     if (shift > 0) {
 1444       __ lsr(r15, count, shift);
 1445     } else {
 1446       __ mov(r15, count);
 1447     }
 1448     if (direction == copy_forwards) {
 1449       if (type != T_OBJECT) {
 1450         __ bl(copy_f);
 1451       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1452         __ bl(copy_obj_uninit_f);
 1453       } else {
 1454         __ bl(copy_obj_f);
 1455       }
 1456     } else {
 1457       if (type != T_OBJECT) {
 1458         __ bl(copy_b);
 1459       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1460         __ bl(copy_obj_uninit_b);
 1461       } else {
 1462         __ bl(copy_obj_b);
 1463       }
 1464     }
 1465 
 1466     // And the tail.
 1467     copy_memory_small(decorators, type, s, d, count, step);
 1468 
 1469     if (granularity >= 8) __ bind(copy8);
 1470     if (granularity >= 4) __ bind(copy4);
 1471     __ bind(finish);
 1472   }
 1473 
 1474 
 1475   void clobber_registers() {
 1476 #ifdef ASSERT
 1477     RegSet clobbered
 1478       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1479     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1480     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1481     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1482       __ mov(*it, rscratch1);
 1483     }
 1484 #endif
 1485 
 1486   }
 1487 
 1488   // Scan over array at a for count oops, verifying each one.
 1489   // Preserves a and count, clobbers rscratch1 and rscratch2.
 1490   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1491     Label loop, end;
 1492     __ mov(rscratch1, a);
 1493     __ mov(rscratch2, zr);
 1494     __ bind(loop);
 1495     __ cmp(rscratch2, count);
 1496     __ br(Assembler::HS, end);
 1497     if (size == wordSize) {
 1498       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1499       __ verify_oop(temp);
 1500     } else {
 1501       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1502       __ decode_heap_oop(temp); // calls verify_oop
 1503     }
 1504     __ add(rscratch2, rscratch2, 1);
 1505     __ b(loop);
 1506     __ bind(end);
 1507   }
 1508 
 1509   // Arguments:
 1510   //   stub_id - is used to name the stub and identify all details of
 1511   //             how to perform the copy.
 1512   //
 1513   //   entry - is assigned to the stub's post push entry point unless
 1514   //           it is null
 1515   //
 1516   // Inputs:
 1517   //   c_rarg0   - source array address
 1518   //   c_rarg1   - destination array address
 1519   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1520   //
 1521   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1522   // the hardware handle it.  The two dwords within qwords that span
 1523   // cache line boundaries will still be loaded and stored atomically.
 1524   //
 1525   // Side Effects: entry is set to the (post push) entry point so it
 1526   //               can be used by the corresponding conjoint copy
 1527   //               method
 1528   //
 1529   address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
 1530     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1531     RegSet saved_reg = RegSet::of(s, d, count);
 1532     int size;
 1533     bool aligned;
 1534     bool is_oop;
 1535     bool dest_uninitialized;
 1536     switch (stub_id) {
 1537     case jbyte_disjoint_arraycopy_id:
 1538       size = sizeof(jbyte);
 1539       aligned = false;
 1540       is_oop = false;
 1541       dest_uninitialized = false;
 1542       break;
 1543     case arrayof_jbyte_disjoint_arraycopy_id:
 1544       size = sizeof(jbyte);
 1545       aligned = true;
 1546       is_oop = false;
 1547       dest_uninitialized = false;
 1548       break;
 1549     case jshort_disjoint_arraycopy_id:
 1550       size = sizeof(jshort);
 1551       aligned = false;
 1552       is_oop = false;
 1553       dest_uninitialized = false;
 1554       break;
 1555     case arrayof_jshort_disjoint_arraycopy_id:
 1556       size = sizeof(jshort);
 1557       aligned = true;
 1558       is_oop = false;
 1559       dest_uninitialized = false;
 1560       break;
 1561     case jint_disjoint_arraycopy_id:
 1562       size = sizeof(jint);
 1563       aligned = false;
 1564       is_oop = false;
 1565       dest_uninitialized = false;
 1566       break;
 1567     case arrayof_jint_disjoint_arraycopy_id:
 1568       size = sizeof(jint);
 1569       aligned = true;
 1570       is_oop = false;
 1571       dest_uninitialized = false;
 1572       break;
 1573     case jlong_disjoint_arraycopy_id:
 1574       // since this is always aligned we can (should!) use the same
 1575       // stub as for case arrayof_jlong_disjoint_arraycopy
 1576       ShouldNotReachHere();
 1577       break;
 1578     case arrayof_jlong_disjoint_arraycopy_id:
 1579       size = sizeof(jlong);
 1580       aligned = true;
 1581       is_oop = false;
 1582       dest_uninitialized = false;
 1583       break;
 1584     case oop_disjoint_arraycopy_id:
 1585       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1586       aligned = !UseCompressedOops;
 1587       is_oop = true;
 1588       dest_uninitialized = false;
 1589       break;
 1590     case arrayof_oop_disjoint_arraycopy_id:
 1591       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1592       aligned = !UseCompressedOops;
 1593       is_oop = true;
 1594       dest_uninitialized = false;
 1595       break;
 1596     case oop_disjoint_arraycopy_uninit_id:
 1597       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1598       aligned = !UseCompressedOops;
 1599       is_oop = true;
 1600       dest_uninitialized = true;
 1601       break;
 1602     case arrayof_oop_disjoint_arraycopy_uninit_id:
 1603       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1604       aligned = !UseCompressedOops;
 1605       is_oop = true;
 1606       dest_uninitialized = true;
 1607       break;
 1608     default:
 1609       ShouldNotReachHere();
 1610       break;
 1611     }
 1612 
 1613     __ align(CodeEntryAlignment);
 1614     StubCodeMark mark(this, stub_id);
 1615     address start = __ pc();
 1616     __ enter();
 1617 
 1618     if (entry != nullptr) {
 1619       *entry = __ pc();
 1620       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1621       BLOCK_COMMENT("Entry:");
 1622     }
 1623 
 1624     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1625     if (dest_uninitialized) {
 1626       decorators |= IS_DEST_UNINITIALIZED;
 1627     }
 1628     if (aligned) {
 1629       decorators |= ARRAYCOPY_ALIGNED;
 1630     }
 1631 
 1632     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1633     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1634 
 1635     if (is_oop) {
 1636       // save regs before copy_memory
 1637       __ push(RegSet::of(d, count), sp);
 1638     }
 1639     {
 1640       // UnsafeMemoryAccess page error: continue after unsafe access
 1641       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1642       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1643       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1644     }
 1645 
 1646     if (is_oop) {
 1647       __ pop(RegSet::of(d, count), sp);
 1648       if (VerifyOops)
 1649         verify_oop_array(size, d, count, r16);
 1650     }
 1651 
 1652     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1653 
 1654     __ leave();
 1655     __ mov(r0, zr); // return 0
 1656     __ ret(lr);
 1657     return start;
 1658   }
 1659 
 1660   // Arguments:
 1661   //   stub_id - is used to name the stub and identify all details of
 1662   //             how to perform the copy.
 1663   //
 1664   //   nooverlap_target - identifies the (post push) entry for the
 1665   //             corresponding disjoint copy routine which can be
 1666   //             jumped to if the ranges do not actually overlap
 1667   //
 1668   //   entry - is assigned to the stub's post push entry point unless
 1669   //           it is null
 1670   //
 1671   //
 1672   // Inputs:
 1673   //   c_rarg0   - source array address
 1674   //   c_rarg1   - destination array address
 1675   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1676   //
 1677   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1678   // the hardware handle it.  The two dwords within qwords that span
 1679   // cache line boundaries will still be loaded and stored atomically.
 1680   //
 1681   // Side Effects:
 1682   //   entry is set to the (post push) entry point so it can be used by
 1683   //   the unsafe and generic copy stubs
 1684   //
 1685   address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
 1686     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1687     RegSet saved_regs = RegSet::of(s, d, count);
 1688     int size;
 1689     bool aligned;
 1690     bool is_oop;
 1691     bool dest_uninitialized;
 1692     switch (stub_id) {
 1693     case jbyte_arraycopy_id:
 1694       size = sizeof(jbyte);
 1695       aligned = false;
 1696       is_oop = false;
 1697       dest_uninitialized = false;
 1698       break;
 1699     case arrayof_jbyte_arraycopy_id:
 1700       size = sizeof(jbyte);
 1701       aligned = true;
 1702       is_oop = false;
 1703       dest_uninitialized = false;
 1704       break;
 1705     case jshort_arraycopy_id:
 1706       size = sizeof(jshort);
 1707       aligned = false;
 1708       is_oop = false;
 1709       dest_uninitialized = false;
 1710       break;
 1711     case arrayof_jshort_arraycopy_id:
 1712       size = sizeof(jshort);
 1713       aligned = true;
 1714       is_oop = false;
 1715       dest_uninitialized = false;
 1716       break;
 1717     case jint_arraycopy_id:
 1718       size = sizeof(jint);
 1719       aligned = false;
 1720       is_oop = false;
 1721       dest_uninitialized = false;
 1722       break;
 1723     case arrayof_jint_arraycopy_id:
 1724       size = sizeof(jint);
 1725       aligned = true;
 1726       is_oop = false;
 1727       dest_uninitialized = false;
 1728       break;
 1729     case jlong_arraycopy_id:
 1730       // since this is always aligned we can (should!) use the same
 1731       // stub as for case arrayof_jlong_arraycopy
 1732       ShouldNotReachHere();
 1733       break;
 1734     case arrayof_jlong_arraycopy_id:
 1735       size = sizeof(jlong);
 1736       aligned = true;
 1737       is_oop = false;
 1738       dest_uninitialized = false;
 1739       break;
 1740     case oop_arraycopy_id:
 1741       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1742       aligned = !UseCompressedOops;
 1743       is_oop = true;
 1744       dest_uninitialized = false;
 1745       break;
 1746     case arrayof_oop_arraycopy_id:
 1747       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1748       aligned = !UseCompressedOops;
 1749       is_oop = true;
 1750       dest_uninitialized = false;
 1751       break;
 1752     case oop_arraycopy_uninit_id:
 1753       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1754       aligned = !UseCompressedOops;
 1755       is_oop = true;
 1756       dest_uninitialized = true;
 1757       break;
 1758     case arrayof_oop_arraycopy_uninit_id:
 1759       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1760       aligned = !UseCompressedOops;
 1761       is_oop = true;
 1762       dest_uninitialized = true;
 1763       break;
 1764     default:
 1765       ShouldNotReachHere();
 1766     }
 1767 
 1768     StubCodeMark mark(this, stub_id);
 1769     address start = __ pc();
 1770     __ enter();
 1771 
 1772     if (entry != nullptr) {
 1773       *entry = __ pc();
 1774       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1775       BLOCK_COMMENT("Entry:");
 1776     }
 1777 
 1778     // use fwd copy when (d-s) above_equal (count*size)
 1779     __ sub(rscratch1, d, s);
 1780     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1781     __ br(Assembler::HS, nooverlap_target);
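          // Equivalent test in C (an illustrative sketch, byte-granular pointers):
          //   if ((size_t)(d - s) >= (size_t)count << exact_log2(size)) {
          //     // ranges cannot overlap destructively; forward copy is safe
          //   }
          // The unsigned compare also covers d < s, where the subtraction wraps
          // to a large value and a forward copy is trivially safe.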
 1782 
 1783     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1784     if (dest_uninitialized) {
 1785       decorators |= IS_DEST_UNINITIALIZED;
 1786     }
 1787     if (aligned) {
 1788       decorators |= ARRAYCOPY_ALIGNED;
 1789     }
 1790 
 1791     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1792     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1793 
 1794     if (is_oop) {
 1795       // save regs before copy_memory
 1796       __ push(RegSet::of(d, count), sp);
 1797     }
 1798     {
 1799       // UnsafeMemoryAccess page error: continue after unsafe access
 1800       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1801       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1802       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1803     }
 1804     if (is_oop) {
 1805       __ pop(RegSet::of(d, count), sp);
 1806       if (VerifyOops)
 1807         verify_oop_array(size, d, count, r16);
 1808     }
 1809     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1810     __ leave();
 1811     __ mov(r0, zr); // return 0
 1812     __ ret(lr);
 1813     return start;
 1814   }
 1815 
 1816   // Helper for generating a dynamic type check.
 1817   // Smashes rscratch1, rscratch2.
 1818   void generate_type_check(Register sub_klass,
 1819                            Register super_check_offset,
 1820                            Register super_klass,
 1821                            Register temp1,
 1822                            Register temp2,
 1823                            Register result,
 1824                            Label& L_success) {
 1825     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1826 
 1827     BLOCK_COMMENT("type_check:");
 1828 
 1829     Label L_miss;
 1830 
 1831     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1832                                      super_check_offset);
 1833     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1834 
 1835     // Fall through on failure!
 1836     __ BIND(L_miss);
 1837   }
 1838 
 1839   //
 1840   //  Generate checkcasting array copy stub
 1841   //
 1842   //  Input:
 1843   //    c_rarg0   - source array address
 1844   //    c_rarg1   - destination array address
 1845   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1846   //    c_rarg3   - size_t ckoff (super_check_offset)
 1847   //    c_rarg4   - oop ckval (super_klass)
 1848   //
 1849   //  Output:
 1850   //    r0 ==  0  -  success
 1851   //    r0 == -1^K - failure, where K is partial transfer count
 1852   //
 1853   address generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
 1854     bool dest_uninitialized;
 1855     switch (stub_id) {
 1856     case checkcast_arraycopy_id:
 1857       dest_uninitialized = false;
 1858       break;
 1859     case checkcast_arraycopy_uninit_id:
 1860       dest_uninitialized = true;
 1861       break;
 1862     default:
 1863       ShouldNotReachHere();
 1864     }
 1865 
 1866     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1867 
 1868     // Input registers (after setup_arg_regs)
 1869     const Register from        = c_rarg0;   // source array address
 1870     const Register to          = c_rarg1;   // destination array address
 1871     const Register count       = c_rarg2;   // elements count
 1872     const Register ckoff       = c_rarg3;   // super_check_offset
 1873     const Register ckval       = c_rarg4;   // super_klass
 1874 
 1875     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1876     RegSet wb_post_saved_regs = RegSet::of(count);
 1877 
 1878     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1879     const Register copied_oop  = r22;       // actual oop copied
 1880     const Register count_save  = r21;       // orig elements count
 1881     const Register start_to    = r20;       // destination array start address
 1882     const Register r19_klass   = r19;       // oop._klass
 1883 
 1884     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1885     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1886 
 1887     //---------------------------------------------------------------
 1888     // Assembler stub will be used for this call to arraycopy
 1889     // if the two arrays are subtypes of Object[] but the
 1890     // destination array type is not equal to or a supertype
 1891     // of the source type.  Each element must be separately
 1892     // checked.
 1893 
 1894     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1895                                copied_oop, r19_klass, count_save);
 1896 
 1897     __ align(CodeEntryAlignment);
 1898     StubCodeMark mark(this, stub_id);
 1899     address start = __ pc();
 1900 
 1901     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1902 
 1903 #ifdef ASSERT
 1904     // caller guarantees that the arrays really are different
 1905     // otherwise, we would have to make conjoint checks
 1906     { Label L;
 1907       __ b(L);                  // conjoint check not yet implemented
 1908       __ stop("checkcast_copy within a single array");
 1909       __ bind(L);
 1910     }
 1911 #endif //ASSERT
 1912 
 1913     // Caller of this entry point must set up the argument registers.
 1914     if (entry != nullptr) {
 1915       *entry = __ pc();
 1916       BLOCK_COMMENT("Entry:");
 1917     }
 1918 
 1919     // Empty array: nothing to do.
 1920     __ cbz(count, L_done);
 1921     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1922 
 1923 #ifdef ASSERT
 1924     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1925     // The ckoff and ckval must be mutually consistent,
 1926     // even though caller generates both.
 1927     { Label L;
 1928       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1929       __ ldrw(start_to, Address(ckval, sco_offset));
 1930       __ cmpw(ckoff, start_to);
 1931       __ br(Assembler::EQ, L);
 1932       __ stop("super_check_offset inconsistent");
 1933       __ bind(L);
 1934     }
 1935 #endif //ASSERT
 1936 
 1937     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1938     bool is_oop = true;
 1939     int element_size = UseCompressedOops ? 4 : 8;
 1940     if (dest_uninitialized) {
 1941       decorators |= IS_DEST_UNINITIALIZED;
 1942     }
 1943 
 1944     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1945     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1946 
 1947     // save the original count
 1948     __ mov(count_save, count);
 1949 
 1950     // Copy from low to high addresses
 1951     __ mov(start_to, to);              // Save destination array start address
 1952     __ b(L_load_element);
 1953 
 1954     // ======== begin loop ========
 1955     // (Loop is rotated; its entry is L_load_element.)
 1956     // Loop control:
 1957     //   for (; count != 0; count--) {
 1958     //     copied_oop = load_heap_oop(from++);
 1959     //     ... generate_type_check ...;
 1960     //     store_heap_oop(to++, copied_oop);
 1961     //   }
 1962     __ align(OptoLoopAlignment);
 1963 
 1964     __ BIND(L_store_element);
 1965     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1966                       __ post(to, element_size), copied_oop, noreg,
 1967                       gct1, gct2, gct3);
 1968     __ sub(count, count, 1);
 1969     __ cbz(count, L_do_card_marks);
 1970 
 1971     // ======== loop entry is here ========
 1972     __ BIND(L_load_element);
 1973     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1974                      copied_oop, noreg, __ post(from, element_size),
 1975                      gct1);
 1976     __ cbz(copied_oop, L_store_element);
 1977 
 1978     __ load_klass(r19_klass, copied_oop);// query the object klass
 1979 
 1980     BLOCK_COMMENT("type_check:");
 1981     generate_type_check(/*sub_klass*/r19_klass,
 1982                         /*super_check_offset*/ckoff,
 1983                         /*super_klass*/ckval,
 1984                         /*r_array_base*/gct1,
 1985                         /*temp2*/gct2,
 1986                         /*result*/r10, L_store_element);
 1987 
 1988     // Fall through on failure!
 1989 
 1990     // ======== end loop ========
 1991 
 1992     // It was a real error; we must depend on the caller to finish the job.
 1993     // Register count = remaining oops, count_orig = total oops.
 1994     // Emit GC store barriers for the oops we have copied and report
 1995     // their number to the caller.
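          // (Concretely, r0 == -1 ^ K == ~K == -(K + 1), so the caller can
          //  recover the partial transfer count as K == ~r0.)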
 1996 
 1997     __ subs(count, count_save, count);     // K = partially copied oop count
 1998     __ eon(count, count, zr);              // report (-1^K) to caller
 1999     __ br(Assembler::EQ, L_done_pop);
 2000 
 2001     __ BIND(L_do_card_marks);
 2002     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2003 
 2004     __ bind(L_done_pop);
 2005     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2006     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2007 
 2008     __ bind(L_done);
 2009     __ mov(r0, count);
 2010     __ leave();
 2011     __ ret(lr);
 2012 
 2013     return start;
 2014   }
 2015 
 2016   // Perform range checks on the proposed arraycopy.
 2017   // Kills temp, but nothing else.
 2018   // Also, clean the sign bits of src_pos and dst_pos.
 2019   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2020                               Register src_pos, // source position (c_rarg1)
 2021                               Register dst,     // destination array oop (c_rarg2)
 2022                               Register dst_pos, // destination position (c_rarg3)
 2023                               Register length,
 2024                               Register temp,
 2025                               Label& L_failed) {
 2026     BLOCK_COMMENT("arraycopy_range_checks:");
 2027 
 2028     assert_different_registers(rscratch1, temp);
 2029 
 2030     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2031     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2032     __ addw(temp, length, src_pos);
 2033     __ cmpw(temp, rscratch1);
 2034     __ br(Assembler::HI, L_failed);
 2035 
 2036     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2037     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2038     __ addw(temp, length, dst_pos);
 2039     __ cmpw(temp, rscratch1);
 2040     __ br(Assembler::HI, L_failed);
 2041 
 2042     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2043     __ movw(src_pos, src_pos);
 2044     __ movw(dst_pos, dst_pos);
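          // (A 32-bit movw zero-extends into the upper half of the 64-bit
          //  register, so this clears bits 63..32.)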
 2045 
 2046     BLOCK_COMMENT("arraycopy_range_checks done");
 2047   }
 2048 
 2049   // These stubs get called from some dumb test routine.
 2050   // I'll write them properly when they're called from
 2051   // something that's actually doing something.
 2052   static void fake_arraycopy_stub(address src, address dst, int count) {
 2053     assert(count == 0, "huh?");
 2054   }
 2055 
 2056 
 2057   //
 2058   //  Generate 'unsafe' array copy stub
 2059   //  Though just as safe as the other stubs, it takes an unscaled
 2060   //  size_t argument instead of an element count.
 2061   //
 2062   //  Input:
 2063   //    c_rarg0   - source array address
 2064   //    c_rarg1   - destination array address
 2065   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2066   //
 2067   // Examines the alignment of the operands and dispatches
 2068   // to a long, int, short, or byte copy loop.
 2069   //
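        // In outline (an illustrative sketch, not the generated code itself):
        //   size_t bits = s | d | count;
        //   if      ((bits & (BytesPerLong - 1)) == 0) { count >>= 3; goto long_copy;  }
        //   else if ((bits & (BytesPerInt  - 1)) == 0) { count >>= 2; goto int_copy;   }
        //   else if ((bits & 1)                  == 0) { count >>= 1; goto short_copy; }
        //   else                                       {              goto byte_copy;  }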
 2070   address generate_unsafe_copy(address byte_copy_entry,
 2071                                address short_copy_entry,
 2072                                address int_copy_entry,
 2073                                address long_copy_entry) {
 2074     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
 2075 
 2076     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2077     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2078 
 2079     __ align(CodeEntryAlignment);
 2080     StubCodeMark mark(this, stub_id);
 2081     address start = __ pc();
 2082     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2083 
 2084     // bump this on entry, not on exit:
 2085     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2086 
 2087     __ orr(rscratch1, s, d);
 2088     __ orr(rscratch1, rscratch1, count);
 2089 
 2090     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2091     __ cbz(rscratch1, L_long_aligned);
 2092     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2093     __ cbz(rscratch1, L_int_aligned);
 2094     __ tbz(rscratch1, 0, L_short_aligned);
 2095     __ b(RuntimeAddress(byte_copy_entry));
 2096 
 2097     __ BIND(L_short_aligned);
 2098     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2099     __ b(RuntimeAddress(short_copy_entry));
 2100     __ BIND(L_int_aligned);
 2101     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2102     __ b(RuntimeAddress(int_copy_entry));
 2103     __ BIND(L_long_aligned);
 2104     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2105     __ b(RuntimeAddress(long_copy_entry));
 2106 
 2107     return start;
 2108   }
 2109 
 2110   //
 2111   //  Generate generic array copy stubs
 2112   //
 2113   //  Input:
 2114   //    c_rarg0    -  src oop
 2115   //    c_rarg1    -  src_pos (32-bits)
 2116   //    c_rarg2    -  dst oop
 2117   //    c_rarg3    -  dst_pos (32-bits)
 2118   //    c_rarg4    -  element count (32-bits)
 2119   //
 2120   //  Output:
 2121   //    r0 ==  0  -  success
 2122   //    r0 == -1^K - failure, where K is partial transfer count
 2123   //
 2124   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2125                                 address int_copy_entry, address oop_copy_entry,
 2126                                 address long_copy_entry, address checkcast_copy_entry) {
 2127     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
 2128 
 2129     Label L_failed, L_objArray;
 2130     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2131 
 2132     // Input registers
 2133     const Register src        = c_rarg0;  // source array oop
 2134     const Register src_pos    = c_rarg1;  // source position
 2135     const Register dst        = c_rarg2;  // destination array oop
 2136     const Register dst_pos    = c_rarg3;  // destination position
 2137     const Register length     = c_rarg4;
 2138 
 2139 
 2140     // Registers used as temps
 2141     const Register dst_klass  = c_rarg5;
 2142 
 2143     __ align(CodeEntryAlignment);
 2144 
 2145     StubCodeMark mark(this, stub_id);
 2146 
 2147     address start = __ pc();
 2148 
 2149     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2150 
 2151     // bump this on entry, not on exit:
 2152     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2153 
 2154     //-----------------------------------------------------------------------
 2155     // Assembler stub will be used for this call to arraycopy
 2156     // if the following conditions are met:
 2157     //
 2158     // (1) src and dst must not be null.
 2159     // (2) src_pos must not be negative.
 2160     // (3) dst_pos must not be negative.
 2161     // (4) length  must not be negative.
 2162     // (5) src klass and dst klass should be the same and not null.
 2163     // (6) src and dst should be arrays.
 2164     // (7) src_pos + length must not exceed length of src.
 2165     // (8) dst_pos + length must not exceed length of dst.
 2166     //
 2167 
 2168     //  if (src == nullptr) return -1;
 2169     __ cbz(src, L_failed);
 2170 
 2171     //  if (src_pos < 0) return -1;
 2172     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2173 
 2174     //  if (dst == nullptr) return -1;
 2175     __ cbz(dst, L_failed);
 2176 
 2177     //  if (dst_pos < 0) return -1;
 2178     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2179 
 2180     // registers used as temp
 2181     const Register scratch_length    = r16; // elements count to copy
 2182     const Register scratch_src_klass = r17; // array klass
 2183     const Register lh                = r15; // layout helper
 2184 
 2185     //  if (length < 0) return -1;
 2186     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2187     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2188 
 2189     __ load_klass(scratch_src_klass, src);
 2190 #ifdef ASSERT
 2191     //  assert(src->klass() != nullptr);
 2192     {
 2193       BLOCK_COMMENT("assert klasses not null {");
 2194       Label L1, L2;
 2195       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2196       __ bind(L1);
 2197       __ stop("broken null klass");
 2198       __ bind(L2);
 2199       __ load_klass(rscratch1, dst);
 2200       __ cbz(rscratch1, L1);     // this would be broken also
 2201       BLOCK_COMMENT("} assert klasses not null done");
 2202     }
 2203 #endif
 2204 
 2205     // Load layout helper (32-bits)
 2206     //
 2207     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2208     // 32        30    24            16              8     2                 0
 2209     //
 2210     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2211     //
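          // The fields are unpacked below roughly as follows (an illustrative
          // sketch in terms of the Klass layout-helper constants):
          //   int   log2_esize = lh & Klass::_lh_log2_element_size_mask;
          //   int   hsize      = (lh >> Klass::_lh_header_size_shift)
          //                      & Klass::_lh_header_size_mask;            // header size in bytes
          //   juint tag        = (juint)lh >> Klass::_lh_array_tag_shift;  // 0x3 typeArray, 0x2 objArray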
 2212 
 2213     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2214 
 2215     // Handle objArrays completely differently...
 2216     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2217     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2218     __ movw(rscratch1, objArray_lh);
 2219     __ eorw(rscratch2, lh, rscratch1);
 2220     __ cbzw(rscratch2, L_objArray);
 2221 
 2222     //  if (src->klass() != dst->klass()) return -1;
 2223     __ load_klass(rscratch2, dst);
 2224     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2225     __ cbnz(rscratch2, L_failed);
 2226 
 2227     //  if (!src->is_Array()) return -1;
 2228     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2229 
 2230     // At this point, it is known to be a typeArray (array_tag 0x3).
 2231 #ifdef ASSERT
 2232     {
 2233       BLOCK_COMMENT("assert primitive array {");
 2234       Label L;
 2235       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2236       __ cmpw(lh, rscratch2);
 2237       __ br(Assembler::GE, L);
 2238       __ stop("must be a primitive array");
 2239       __ bind(L);
 2240       BLOCK_COMMENT("} assert primitive array done");
 2241     }
 2242 #endif
 2243 
 2244     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2245                            rscratch2, L_failed);
 2246 
 2247     // TypeArrayKlass
 2248     //
 2249     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2250     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2251     //
 2252 
 2253     const Register rscratch1_offset = rscratch1;    // array offset
 2254     const Register r15_elsize = lh; // element size
 2255 
 2256     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2257            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2258     __ add(src, src, rscratch1_offset);           // src array offset
 2259     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2260     BLOCK_COMMENT("choose copy loop based on element size");
 2261 
 2262     // next registers should be set before the jump to corresponding stub
 2263     const Register from     = c_rarg0;  // source array address
 2264     const Register to       = c_rarg1;  // destination array address
 2265     const Register count    = c_rarg2;  // elements count
 2266 
 2267     // 'from', 'to' and 'count' must be set in this order, since they
 2268     // alias 'src', 'src_pos' and 'dst' respectively.
 2269 
 2270     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2271 
 2272     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2273     // size in bytes).  We do a simple bitwise binary search.
 2274   __ BIND(L_copy_bytes);
 2275     __ tbnz(r15_elsize, 1, L_copy_ints);
 2276     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2277     __ lea(from, Address(src, src_pos));// src_addr
 2278     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2279     __ movw(count, scratch_length); // length
 2280     __ b(RuntimeAddress(byte_copy_entry));
 2281 
 2282   __ BIND(L_copy_shorts);
 2283     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2284     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2285     __ movw(count, scratch_length); // length
 2286     __ b(RuntimeAddress(short_copy_entry));
 2287 
 2288   __ BIND(L_copy_ints);
 2289     __ tbnz(r15_elsize, 0, L_copy_longs);
 2290     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2291     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2292     __ movw(count, scratch_length); // length
 2293     __ b(RuntimeAddress(int_copy_entry));
 2294 
 2295   __ BIND(L_copy_longs);
 2296 #ifdef ASSERT
 2297     {
 2298       BLOCK_COMMENT("assert long copy {");
 2299       Label L;
 2300       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2301       __ cmpw(r15_elsize, LogBytesPerLong);
 2302       __ br(Assembler::EQ, L);
 2303       __ stop("must be long copy, but elsize is wrong");
 2304       __ bind(L);
 2305       BLOCK_COMMENT("} assert long copy done");
 2306     }
 2307 #endif
 2308     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2309     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2310     __ movw(count, scratch_length); // length
 2311     __ b(RuntimeAddress(long_copy_entry));
 2312 
 2313     // ObjArrayKlass
 2314   __ BIND(L_objArray);
 2315     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2316 
 2317     Label L_plain_copy, L_checkcast_copy;
 2318     //  test array classes for subtyping
 2319     __ load_klass(r15, dst);
 2320     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2321     __ br(Assembler::NE, L_checkcast_copy);
 2322 
 2323     // Identically typed arrays can be copied without element-wise checks.
 2324     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2325                            rscratch2, L_failed);
 2326 
 2327     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2328     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2329     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2330     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2331     __ movw(count, scratch_length); // length
 2332   __ BIND(L_plain_copy);
 2333     __ b(RuntimeAddress(oop_copy_entry));
 2334 
 2335   __ BIND(L_checkcast_copy);
 2336     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2337     {
 2338       // Before looking at dst.length, make sure dst is also an objArray.
 2339       __ ldrw(rscratch1, Address(r15, lh_offset));
 2340       __ movw(rscratch2, objArray_lh);
 2341       __ eorw(rscratch1, rscratch1, rscratch2);
 2342       __ cbnzw(rscratch1, L_failed);
 2343 
 2344       // It is safe to examine both src.length and dst.length.
 2345       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2346                              r15, L_failed);
 2347 
 2348       __ load_klass(dst_klass, dst); // reload
 2349 
 2350       // Marshal the base address arguments now, freeing registers.
 2351       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2352       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2353       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2354       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2355       __ movw(count, length);           // length (reloaded)
 2356       Register sco_temp = c_rarg3;      // this register is free now
 2357       assert_different_registers(from, to, count, sco_temp,
 2358                                  dst_klass, scratch_src_klass);
 2359       // assert_clean_int(count, sco_temp);
 2360 
 2361       // Generate the type check.
 2362       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2363       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2364 
 2365       // Smashes rscratch1, rscratch2
 2366       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2367                           L_plain_copy);
 2368 
 2369       // Fetch destination element klass from the ObjArrayKlass header.
 2370       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2371       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2372       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2373 
 2374       // the checkcast_copy loop needs two extra arguments:
 2375       assert(c_rarg3 == sco_temp, "#3 already in place");
 2376       // Set up arguments for checkcast_copy_entry.
 2377       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2378       __ b(RuntimeAddress(checkcast_copy_entry));
 2379     }
 2380 
 2381   __ BIND(L_failed);
 2382     __ mov(r0, -1);
 2383     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2384     __ ret(lr);
 2385 
 2386     return start;
 2387   }
 2388 
 2389   //
 2390   // Generate stub for array fill. If "aligned" is true, the
 2391   // "to" address is assumed to be heapword aligned.
 2392   //
 2393   // Arguments for generated stub:
 2394   //   to:    c_rarg0
 2395   //   value: c_rarg1
 2396   //   count: c_rarg2 treated as signed
 2397   //
 2398   address generate_fill(StubGenStubId stub_id) {
 2399     BasicType t;
 2400     bool aligned;
 2401 
 2402     switch (stub_id) {
 2403     case jbyte_fill_id:
 2404       t = T_BYTE;
 2405       aligned = false;
 2406       break;
 2407     case jshort_fill_id:
 2408       t = T_SHORT;
 2409       aligned = false;
 2410       break;
 2411     case jint_fill_id:
 2412       t = T_INT;
 2413       aligned = false;
 2414       break;
 2415     case arrayof_jbyte_fill_id:
 2416       t = T_BYTE;
 2417       aligned = true;
 2418       break;
 2419     case arrayof_jshort_fill_id:
 2420       t = T_SHORT;
 2421       aligned = true;
 2422       break;
 2423     case arrayof_jint_fill_id:
 2424       t = T_INT;
 2425       aligned = true;
 2426       break;
 2427     default:
 2428       ShouldNotReachHere();
 2429     };
 2430 
 2431     __ align(CodeEntryAlignment);
 2432     StubCodeMark mark(this, stub_id);
 2433     address start = __ pc();
 2434 
 2435     BLOCK_COMMENT("Entry:");
 2436 
 2437     const Register to        = c_rarg0;  // destination array address
 2438     const Register value     = c_rarg1;  // value
 2439     const Register count     = c_rarg2;  // elements count
 2440 
 2441     const Register bz_base = r10;        // base for block_zero routine
 2442     const Register cnt_words = r11;      // temp register
 2443 
 2444     __ enter();
 2445 
 2446     Label L_fill_elements, L_exit1;
 2447 
 2448     int shift = -1;
 2449     switch (t) {
 2450       case T_BYTE:
 2451         shift = 0;
 2452         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2453         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2454         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2455         __ br(Assembler::LO, L_fill_elements);
 2456         break;
 2457       case T_SHORT:
 2458         shift = 1;
 2459         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2460         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2461         __ br(Assembler::LO, L_fill_elements);
 2462         break;
 2463       case T_INT:
 2464         shift = 2;
 2465         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2466         __ br(Assembler::LO, L_fill_elements);
 2467         break;
 2468       default: ShouldNotReachHere();
 2469     }
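          // At this point 'value' holds the fill pattern replicated to 32 bits
          // (e.g. an illustrative byte value 0xAB has become 0xABABABAB); it is
          // widened to 64 bits by the bfi just before the bulk fill below.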
 2470 
 2471     // Align the destination address to an 8-byte boundary.
 2472     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2473     if (!aligned) {
 2474       switch (t) {
 2475         case T_BYTE:
 2476           // One byte misalignment happens only for byte arrays.
 2477           __ tbz(to, 0, L_skip_align1);
 2478           __ strb(value, Address(__ post(to, 1)));
 2479           __ subw(count, count, 1);
 2480           __ bind(L_skip_align1);
 2481           // Fallthrough
 2482         case T_SHORT:
 2483           // Two bytes misalignment happens only for byte and short (char) arrays.
 2484           __ tbz(to, 1, L_skip_align2);
 2485           __ strh(value, Address(__ post(to, 2)));
 2486           __ subw(count, count, 2 >> shift);
 2487           __ bind(L_skip_align2);
 2488           // Fallthrough
 2489         case T_INT:
 2490           // Align to 8 bytes; we know we are 4-byte aligned to start.
 2491           __ tbz(to, 2, L_skip_align4);
 2492           __ strw(value, Address(__ post(to, 4)));
 2493           __ subw(count, count, 4 >> shift);
 2494           __ bind(L_skip_align4);
 2495           break;
 2496         default: ShouldNotReachHere();
 2497       }
 2498     }
 2499 
 2500     //
 2501     //  Fill large chunks
 2502     //
 2503     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2504     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2505     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2506     if (UseBlockZeroing) {
 2507       Label non_block_zeroing, rest;
 2508       // If the fill value is zero we can use the fast zero_words().
 2509       __ cbnz(value, non_block_zeroing);
 2510       __ mov(bz_base, to);
 2511       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2512       address tpc = __ zero_words(bz_base, cnt_words);
 2513       if (tpc == nullptr) {
 2514         fatal("CodeCache is full at generate_fill");
 2515       }
 2516       __ b(rest);
 2517       __ bind(non_block_zeroing);
 2518       __ fill_words(to, cnt_words, value);
 2519       __ bind(rest);
 2520     } else {
 2521       __ fill_words(to, cnt_words, value);
 2522     }
 2523 
 2524     // Remaining count is less than 8 bytes. Fill it by a single store.
 2525     // Note that the total length is no less than 8 bytes.
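          // For example (illustrative, assuming an 8-byte-aligned destination):
          // a 13-byte T_BYTE fill writes one full word (bytes 0..7) above,
          // leaving count == 5; the str below then writes bytes 5..12,
          // harmlessly re-writing bytes 5..7.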
 2526     if (t == T_BYTE || t == T_SHORT) {
 2527       Label L_exit1;
 2528       __ cbzw(count, L_exit1);
 2529       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2530       __ str(value, Address(to, -8));    // overwrite some elements
 2531       __ bind(L_exit1);
 2532       __ leave();
 2533       __ ret(lr);
 2534     }
 2535 
 2536     // Handle fills of less than 8 bytes.
 2537     Label L_fill_2, L_fill_4, L_exit2;
 2538     __ bind(L_fill_elements);
 2539     switch (t) {
 2540       case T_BYTE:
 2541         __ tbz(count, 0, L_fill_2);
 2542         __ strb(value, Address(__ post(to, 1)));
 2543         __ bind(L_fill_2);
 2544         __ tbz(count, 1, L_fill_4);
 2545         __ strh(value, Address(__ post(to, 2)));
 2546         __ bind(L_fill_4);
 2547         __ tbz(count, 2, L_exit2);
 2548         __ strw(value, Address(to));
 2549         break;
 2550       case T_SHORT:
 2551         __ tbz(count, 0, L_fill_4);
 2552         __ strh(value, Address(__ post(to, 2)));
 2553         __ bind(L_fill_4);
 2554         __ tbz(count, 1, L_exit2);
 2555         __ strw(value, Address(to));
 2556         break;
 2557       case T_INT:
 2558         __ cbzw(count, L_exit2);
 2559         __ strw(value, Address(to));
 2560         break;
 2561       default: ShouldNotReachHere();
 2562     }
 2563     __ bind(L_exit2);
 2564     __ leave();
 2565     __ ret(lr);
 2566     return start;
 2567   }
 2568 
 2569   address generate_unsafecopy_common_error_exit() {
 2570     address start_pc = __ pc();
 2571     __ leave();
 2572     __ mov(r0, 0);
 2573     __ ret(lr);
 2574     return start_pc;
 2575   }
 2576 
 2577   //
 2578   //  Generate 'unsafe' set memory stub
 2579   //  Though just as safe as the other stubs, it takes an unscaled
 2580   //  size_t (# bytes) argument instead of an element count.
 2581   //
 2582   //  This fill operation is atomicity preserving: as long as the
 2583   //  address supplied is sufficiently aligned, all writes of up to 64
 2584   //  bits in size are single-copy atomic.
 2585   //
 2586   //  Input:
 2587   //    c_rarg0   - destination array address
 2588   //    c_rarg1   - byte count (size_t)
 2589   //    c_rarg2   - byte value
 2590   //
 2591   address generate_unsafe_setmemory() {
 2592     __ align(CodeEntryAlignment);
 2593     StubCodeMark mark(this, StubGenStubId::unsafe_setmemory_id);
 2594     address start = __ pc();
 2595 
 2596     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 2597     Label tail;
 2598 
 2599     UnsafeMemoryAccessMark umam(this, true, false);
 2600 
 2601     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2602 
 2603     __ dup(v0, __ T16B, value);
 2604 
 2605     if (AvoidUnalignedAccesses) {
 2606       __ cmp(count, (u1)16);
 2607       __ br(__ LO, tail);
 2608 
 2609       __ mov(rscratch1, 16);
 2610       __ andr(rscratch2, dest, 15);
 2611       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 2612       __ strq(v0, Address(dest));
 2613       __ sub(count, count, rscratch1);
 2614       __ add(dest, dest, rscratch1);
 2615     }
 2616 
 2617     __ subs(count, count, (u1)64);
 2618     __ br(__ LO, tail);
 2619     {
 2620       Label again;
 2621       __ bind(again);
 2622       __ stpq(v0, v0, Address(dest));
 2623       __ stpq(v0, v0, Address(dest, 32));
 2624 
 2625       __ subs(count, count, 64);
 2626       __ add(dest, dest, 64);
 2627       __ br(__ HS, again);
 2628     }
 2629 
 2630     __ bind(tail);
 2631     // The count of bytes is off by 64, but we don't need to correct
 2632     // it because we're only going to use the least-significant few
 2633     // count bits from here on.
 2634     // __ add(count, count, 64);
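          // For example (illustrative, with AvoidUnalignedAccesses off): a
          // 100-byte request leaves count == -28 after the loop; its low six
          // bits equal 36, so the tail stores 32 + 4 more bytes, giving
          // 64 + 32 + 4 == 100 in total.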
 2635 
 2636     {
 2637       Label dont;
 2638       __ tbz(count, exact_log2(32), dont);
 2639       __ stpq(v0, v0, __ post(dest, 32));
 2640       __ bind(dont);
 2641     }
 2642     {
 2643       Label dont;
 2644       __ tbz(count, exact_log2(16), dont);
 2645       __ strq(v0, __ post(dest, 16));
 2646       __ bind(dont);
 2647     }
 2648     {
 2649       Label dont;
 2650       __ tbz(count, exact_log2(8), dont);
 2651       __ strd(v0, __ post(dest, 8));
 2652       __ bind(dont);
 2653     }
 2654 
 2655     Label finished;
 2656     __ tst(count, 7);
 2657     __ br(__ EQ, finished);
 2658 
 2659     {
 2660       Label dont;
 2661       __ tbz(count, exact_log2(4), dont);
 2662       __ strs(v0, __ post(dest, 4));
 2663       __ bind(dont);
 2664     }
 2665     {
 2666       Label dont;
 2667       __ tbz(count, exact_log2(2), dont);
 2668       __ bfi(value, value, 8, 8);
 2669       __ strh(value, __ post(dest, 2));
 2670       __ bind(dont);
 2671     }
 2672     {
 2673       Label dont;
 2674       __ tbz(count, exact_log2(1), dont);
 2675       __ strb(value, Address(dest));
 2676       __ bind(dont);
 2677     }
 2678 
 2679     __ bind(finished);
 2680     __ leave();
 2681     __ ret(lr);
 2682 
 2683     return start;
 2684   }
 2685 
 2686   address generate_data_cache_writeback() {
 2687     const Register line        = c_rarg0;  // address of line to write back
 2688 
 2689     __ align(CodeEntryAlignment);
 2690 
 2691     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id;
 2692     StubCodeMark mark(this, stub_id);
 2693 
 2694     address start = __ pc();
 2695     __ enter();
 2696     __ cache_wb(Address(line, 0));
 2697     __ leave();
 2698     __ ret(lr);
 2699 
 2700     return start;
 2701   }
 2702 
 2703   address generate_data_cache_writeback_sync() {
 2704     const Register is_pre     = c_rarg0;  // pre or post sync
 2705 
 2706     __ align(CodeEntryAlignment);
 2707 
 2708     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id;
 2709     StubCodeMark mark(this, stub_id);
 2710 
 2711     // pre wbsync is a no-op
 2712     // post wbsync translates to a memory barrier
 2713 
 2714     Label skip;
 2715     address start = __ pc();
 2716     __ enter();
 2717     __ cbnz(is_pre, skip);
 2718     __ cache_wbsync(false);
 2719     __ bind(skip);
 2720     __ leave();
 2721     __ ret(lr);
 2722 
 2723     return start;
 2724   }
 2725 
 2726   void generate_arraycopy_stubs() {
 2727     address entry;
 2728     address entry_jbyte_arraycopy;
 2729     address entry_jshort_arraycopy;
 2730     address entry_jint_arraycopy;
 2731     address entry_oop_arraycopy;
 2732     address entry_jlong_arraycopy;
 2733     address entry_checkcast_arraycopy;
 2734 
 2735     address ucm_common_error_exit       =  generate_unsafecopy_common_error_exit();
 2736     UnsafeMemoryAccess::set_common_exit_stub_pc(ucm_common_error_exit);
 2737 
 2738     generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
 2739     generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
 2740 
 2741     generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
 2742     generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
 2743 
 2744     generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
 2745     generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
 2746 
 2747     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2748 
 2749     //*** jbyte
 2750     // Always need aligned and unaligned versions
 2751     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
 2752     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
 2753     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
 2754     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
 2755 
 2756     //*** jshort
 2757     // Always need aligned and unaligned versions
 2758     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
 2759     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
 2760     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
 2761     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
 2762 
 2763     //*** jint
 2764     // Aligned versions
 2765     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
 2766     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2767     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2768     // entry_jint_arraycopy always points to the unaligned version
 2769     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
 2770     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2771 
 2772     //*** jlong
 2773     // It is always aligned
 2774     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
 2775     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
 2776     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2777     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2778 
 2779     //*** oops
 2780     {
 2781       // With compressed oops we need unaligned versions; notice that
 2782       // we overwrite entry_oop_arraycopy.
 2783       bool aligned = !UseCompressedOops;
 2784 
 2785       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2786         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
 2787       StubRoutines::_arrayof_oop_arraycopy
 2788         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
 2789       // Aligned versions without pre-barriers
 2790       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2791         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
 2792       StubRoutines::_arrayof_oop_arraycopy_uninit
 2793         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
 2794     }
 2795 
 2796     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2797     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2798     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2799     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2800 
 2801     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 2802     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
 2803 
 2804     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 2805                                                               entry_jshort_arraycopy,
 2806                                                               entry_jint_arraycopy,
 2807                                                               entry_jlong_arraycopy);
 2808 
 2809     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 2810                                                                entry_jshort_arraycopy,
 2811                                                                entry_jint_arraycopy,
 2812                                                                entry_oop_arraycopy,
 2813                                                                entry_jlong_arraycopy,
 2814                                                                entry_checkcast_arraycopy);
 2815 
 2816     StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
 2817     StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
 2818     StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
 2819     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
 2820     StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
 2821     StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
 2822   }
 2823 
 2824   void generate_math_stubs() { Unimplemented(); }
 2825 
 2826   // Arguments:
 2827   //
 2828   // Inputs:
 2829   //   c_rarg0   - source byte array address
 2830   //   c_rarg1   - destination byte array address
 2831   //   c_rarg2   - K (key) in little endian int array
 2832   //
 2833   address generate_aescrypt_encryptBlock() {
 2834     __ align(CodeEntryAlignment);
 2835     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
 2836     StubCodeMark mark(this, stub_id);
 2837 
 2838     const Register from        = c_rarg0;  // source array address
 2839     const Register to          = c_rarg1;  // destination array address
 2840     const Register key         = c_rarg2;  // key array address
 2841     const Register keylen      = rscratch1;
 2842 
 2843     address start = __ pc();
 2844     __ enter();
 2845 
 2846     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2847 
 2848     __ aesenc_loadkeys(key, keylen);
 2849     __ aesecb_encrypt(from, to, keylen);
 2850 
 2851     __ mov(r0, 0);
 2852 
 2853     __ leave();
 2854     __ ret(lr);
 2855 
 2856     return start;
 2857   }
 2858 
 2859   // Arguments:
 2860   //
 2861   // Inputs:
 2862   //   c_rarg0   - source byte array address
 2863   //   c_rarg1   - destination byte array address
 2864   //   c_rarg2   - K (key) in little endian int array
 2865   //
 2866   address generate_aescrypt_decryptBlock() {
 2867     assert(UseAES, "need AES cryptographic extension support");
 2868     __ align(CodeEntryAlignment);
 2869     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
 2870     StubCodeMark mark(this, stub_id);
 2871     Label L_doLast;
 2872 
 2873     const Register from        = c_rarg0;  // source array address
 2874     const Register to          = c_rarg1;  // destination array address
 2875     const Register key         = c_rarg2;  // key array address
 2876     const Register keylen      = rscratch1;
 2877 
 2878     address start = __ pc();
 2879     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2880 
 2881     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2882 
 2883     __ aesecb_decrypt(from, to, key, keylen);
 2884 
 2885     __ mov(r0, 0);
 2886 
 2887     __ leave();
 2888     __ ret(lr);
 2889 
 2890     return start;
 2891   }
 2892 
 2893   // Arguments:
 2894   //
 2895   // Inputs:
 2896   //   c_rarg0   - source byte array address
 2897   //   c_rarg1   - destination byte array address
 2898   //   c_rarg2   - K (key) in little endian int array
 2899   //   c_rarg3   - r vector byte array address
 2900   //   c_rarg4   - input length
 2901   //
 2902   // Output:
 2903   //   r0        - input length
 2904   //
 2905   address generate_cipherBlockChaining_encryptAESCrypt() {
 2906     assert(UseAES, "need AES cryptographic extension support");
 2907     __ align(CodeEntryAlignment);
 2908     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
 2909     StubCodeMark mark(this, stub_id);
 2910 
 2911     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2912 
 2913     const Register from        = c_rarg0;  // source array address
 2914     const Register to          = c_rarg1;  // destination array address
 2915     const Register key         = c_rarg2;  // key array address
 2916     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector, and
 2917                                            // left holding the last ciphertext block produced
 2918     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2919     const Register keylen      = rscratch1;
 2920 
 2921     address start = __ pc();
 2922 
 2923       __ enter();
 2924 
 2925       __ movw(rscratch2, len_reg);
 2926 
 2927       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2928 
 2929       __ ld1(v0, __ T16B, rvec);
 2930 
 2931       __ cmpw(keylen, 52);
 2932       __ br(Assembler::CC, L_loadkeys_44);
 2933       __ br(Assembler::EQ, L_loadkeys_52);
 2934 
 2935       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2936       __ rev32(v17, __ T16B, v17);
 2937       __ rev32(v18, __ T16B, v18);
 2938     __ BIND(L_loadkeys_52);
 2939       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2940       __ rev32(v19, __ T16B, v19);
 2941       __ rev32(v20, __ T16B, v20);
 2942     __ BIND(L_loadkeys_44);
 2943       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2944       __ rev32(v21, __ T16B, v21);
 2945       __ rev32(v22, __ T16B, v22);
 2946       __ rev32(v23, __ T16B, v23);
 2947       __ rev32(v24, __ T16B, v24);
 2948       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2949       __ rev32(v25, __ T16B, v25);
 2950       __ rev32(v26, __ T16B, v26);
 2951       __ rev32(v27, __ T16B, v27);
 2952       __ rev32(v28, __ T16B, v28);
 2953       __ ld1(v29, v30, v31, __ T16B, key);
 2954       __ rev32(v29, __ T16B, v29);
 2955       __ rev32(v30, __ T16B, v30);
 2956       __ rev32(v31, __ T16B, v31);
 2957 
 2958     __ BIND(L_aes_loop);
 2959       __ ld1(v1, __ T16B, __ post(from, 16));
 2960       __ eor(v0, __ T16B, v0, v1);
 2961 
 2962       __ br(Assembler::CC, L_rounds_44);
 2963       __ br(Assembler::EQ, L_rounds_52);
 2964 
 2965       __ aese(v0, v17); __ aesmc(v0, v0);
 2966       __ aese(v0, v18); __ aesmc(v0, v0);
 2967     __ BIND(L_rounds_52);
 2968       __ aese(v0, v19); __ aesmc(v0, v0);
 2969       __ aese(v0, v20); __ aesmc(v0, v0);
 2970     __ BIND(L_rounds_44);
 2971       __ aese(v0, v21); __ aesmc(v0, v0);
 2972       __ aese(v0, v22); __ aesmc(v0, v0);
 2973       __ aese(v0, v23); __ aesmc(v0, v0);
 2974       __ aese(v0, v24); __ aesmc(v0, v0);
 2975       __ aese(v0, v25); __ aesmc(v0, v0);
 2976       __ aese(v0, v26); __ aesmc(v0, v0);
 2977       __ aese(v0, v27); __ aesmc(v0, v0);
 2978       __ aese(v0, v28); __ aesmc(v0, v0);
 2979       __ aese(v0, v29); __ aesmc(v0, v0);
 2980       __ aese(v0, v30);
 2981       __ eor(v0, __ T16B, v0, v31);
 2982 
 2983       __ st1(v0, __ T16B, __ post(to, 16));
 2984 
 2985       __ subw(len_reg, len_reg, 16);
 2986       __ cbnzw(len_reg, L_aes_loop);
 2987 
 2988       __ st1(v0, __ T16B, rvec);
 2989 
 2990       __ mov(r0, rscratch2);
 2991 
 2992       __ leave();
 2993       __ ret(lr);
 2994 
 2995     return start;
 2996   }
 2997 
 2998   // Arguments:
 2999   //
 3000   // Inputs:
 3001   //   c_rarg0   - source byte array address
 3002   //   c_rarg1   - destination byte array address
 3003   //   c_rarg2   - K (key) in little endian int array
 3004   //   c_rarg3   - r vector byte array address
 3005   //   c_rarg4   - input length
 3006   //
 3007   // Output:
 3008   //   r0        - input length
 3009   //
 3010   address generate_cipherBlockChaining_decryptAESCrypt() {
 3011     assert(UseAES, "need AES cryptographic extension support");
 3012     __ align(CodeEntryAlignment);
 3013     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
 3014     StubCodeMark mark(this, stub_id);
 3015 
 3016     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3017 
 3018     const Register from        = c_rarg0;  // source array address
 3019     const Register to          = c_rarg1;  // destination array address
 3020     const Register key         = c_rarg2;  // key array address
 3021     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector, and
 3022                                            // left holding the last ciphertext block processed
 3023     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3024     const Register keylen      = rscratch1;
 3025 
 3026     address start = __ pc();
 3027 
 3028       __ enter();
 3029 
 3030       __ movw(rscratch2, len_reg);
 3031 
 3032       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3033 
 3034       __ ld1(v2, __ T16B, rvec);
 3035 
 3036       __ ld1(v31, __ T16B, __ post(key, 16));
 3037       __ rev32(v31, __ T16B, v31);
 3038 
 3039       __ cmpw(keylen, 52);
 3040       __ br(Assembler::CC, L_loadkeys_44);
 3041       __ br(Assembler::EQ, L_loadkeys_52);
 3042 
 3043       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3044       __ rev32(v17, __ T16B, v17);
 3045       __ rev32(v18, __ T16B, v18);
 3046     __ BIND(L_loadkeys_52);
 3047       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3048       __ rev32(v19, __ T16B, v19);
 3049       __ rev32(v20, __ T16B, v20);
 3050     __ BIND(L_loadkeys_44);
 3051       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3052       __ rev32(v21, __ T16B, v21);
 3053       __ rev32(v22, __ T16B, v22);
 3054       __ rev32(v23, __ T16B, v23);
 3055       __ rev32(v24, __ T16B, v24);
 3056       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3057       __ rev32(v25, __ T16B, v25);
 3058       __ rev32(v26, __ T16B, v26);
 3059       __ rev32(v27, __ T16B, v27);
 3060       __ rev32(v28, __ T16B, v28);
 3061       __ ld1(v29, v30, __ T16B, key);
 3062       __ rev32(v29, __ T16B, v29);
 3063       __ rev32(v30, __ T16B, v30);
 3064 
 3065     __ BIND(L_aes_loop);
 3066       __ ld1(v0, __ T16B, __ post(from, 16));
 3067       __ orr(v1, __ T16B, v0, v0);
 3068 
 3069       __ br(Assembler::CC, L_rounds_44);
 3070       __ br(Assembler::EQ, L_rounds_52);
 3071 
 3072       __ aesd(v0, v17); __ aesimc(v0, v0);
 3073       __ aesd(v0, v18); __ aesimc(v0, v0);
 3074     __ BIND(L_rounds_52);
 3075       __ aesd(v0, v19); __ aesimc(v0, v0);
 3076       __ aesd(v0, v20); __ aesimc(v0, v0);
 3077     __ BIND(L_rounds_44);
 3078       __ aesd(v0, v21); __ aesimc(v0, v0);
 3079       __ aesd(v0, v22); __ aesimc(v0, v0);
 3080       __ aesd(v0, v23); __ aesimc(v0, v0);
 3081       __ aesd(v0, v24); __ aesimc(v0, v0);
 3082       __ aesd(v0, v25); __ aesimc(v0, v0);
 3083       __ aesd(v0, v26); __ aesimc(v0, v0);
 3084       __ aesd(v0, v27); __ aesimc(v0, v0);
 3085       __ aesd(v0, v28); __ aesimc(v0, v0);
 3086       __ aesd(v0, v29); __ aesimc(v0, v0);
 3087       __ aesd(v0, v30);
 3088       __ eor(v0, __ T16B, v0, v31);
 3089       __ eor(v0, __ T16B, v0, v2);
 3090 
 3091       __ st1(v0, __ T16B, __ post(to, 16));
 3092       __ orr(v2, __ T16B, v1, v1);
 3093 
 3094       __ subw(len_reg, len_reg, 16);
 3095       __ cbnzw(len_reg, L_aes_loop);
 3096 
 3097       __ st1(v2, __ T16B, rvec);
 3098 
 3099       __ mov(r0, rscratch2);
 3100 
 3101       __ leave();
 3102       __ ret(lr);
 3103 
 3104     return start;
 3105   }
 3106 
 3107   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3108   // Inputs: in (128 bits, preserved). The least-significant 64-bit word
 3109   // is held in the upper dword of each vector.
 3110   // inc is the 64-bit increment (preserved); its lower dword must be zero.
 3111   // Output: result
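        //
        // A rough scalar equivalent of the vector sequence below (illustrative
        // only; lo/hi stand for the two 64-bit halves of the big-endian value):
        //
        //   lo += inc;
        //   if (lo < inc) hi += 1;   // carry out of the low half
        //
        // The cm(HI)/ext/subv steps below compute that carry as an all-ones mask,
        // move it to the most-significant lane and subtract it (i.e. add 1) there.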
 3112   void be_add_128_64(FloatRegister result, FloatRegister in,
 3113                      FloatRegister inc, FloatRegister tmp) {
 3114     assert_different_registers(result, tmp, inc);
 3115 
 3116     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3117                                            // input
 3118     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
 3119     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3120                                            // MSD == 0 (must be!) to LSD
 3121     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3122   }
 3123 
 3124   // CTR AES crypt.
 3125   // Arguments:
 3126   //
 3127   // Inputs:
 3128   //   c_rarg0   - source byte array address
 3129   //   c_rarg1   - destination byte array address
 3130   //   c_rarg2   - K (key) in little endian int array
 3131   //   c_rarg3   - counter vector byte array address
 3132   //   c_rarg4   - input length
 3133   //   c_rarg5   - saved encryptedCounter start
 3134   //   c_rarg6   - saved used length
 3135   //
 3136   // Output:
 3137   //   r0       - input length
 3138   //
 3139   address generate_counterMode_AESCrypt() {
 3140     const Register in = c_rarg0;
 3141     const Register out = c_rarg1;
 3142     const Register key = c_rarg2;
 3143     const Register counter = c_rarg3;
 3144     const Register saved_len = c_rarg4, len = r10;
 3145     const Register saved_encrypted_ctr = c_rarg5;
 3146     const Register used_ptr = c_rarg6, used = r12;
 3147 
 3148     const Register offset = r7;
 3149     const Register keylen = r11;
 3150 
 3151     const unsigned char block_size = 16;
 3152     const int bulk_width = 4;
 3153     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3154     // performance with larger data sizes, but it also means that the
 3155     // fast path isn't used until you have at least 8 blocks, and up
 3156     // to 127 bytes of data will be processed on the slow path. For
 3157     // that reason, and also so as not to blow away too much icache, 4
 3158     // blocks seems like a sensible compromise.
 3159 
 3160     // Algorithm:
 3161     //
 3162     //    if (len == 0) {
 3163     //        goto DONE;
 3164     //    }
 3165     //    int result = len;
 3166     //    do {
 3167     //        if (used >= blockSize) {
 3168     //            if (len >= bulk_width * blockSize) {
 3169     //                CTR_large_block();
 3170     //                if (len == 0)
 3171     //                    goto DONE;
 3172     //            }
 3173     //            for (;;) {
 3174     //                16ByteVector v0 = counter;
 3175     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3176     //                used = 0;
 3177     //                if (len < blockSize)
 3178     //                    break;    /* goto NEXT */
 3179     //                16ByteVector v1 = load16Bytes(in, offset);
 3180     //                v1 = v1 ^ encryptedCounter;
 3181     //                store16Bytes(out, offset);
 3182     //                used = blockSize;
 3183     //                offset += blockSize;
 3184     //                len -= blockSize;
 3185     //                if (len == 0)
 3186     //                    goto DONE;
 3187     //            }
 3188     //        }
 3189     //      NEXT:
 3190     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3191     //        len--;
 3192     //    } while (len != 0);
 3193     //  DONE:
 3194     //    return result;
 3195     //
 3196     // CTR_large_block()
 3197     //    Wide bulk encryption of whole blocks.
 3198 
 3199     __ align(CodeEntryAlignment);
 3200     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
 3201     StubCodeMark mark(this, stub_id);
 3202     const address start = __ pc();
 3203     __ enter();
 3204 
 3205     Label DONE, CTR_large_block, large_block_return;
 3206     __ ldrw(used, Address(used_ptr));
 3207     __ cbzw(saved_len, DONE);
 3208 
 3209     __ mov(len, saved_len);
 3210     __ mov(offset, 0);
 3211 
 3212     // Compute #rounds for AES based on the length of the key array
 3213     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3214 
 3215     __ aesenc_loadkeys(key, keylen);
 3216 
 3217     {
 3218       Label L_CTR_loop, NEXT;
 3219 
 3220       __ bind(L_CTR_loop);
 3221 
 3222       __ cmp(used, block_size);
 3223       __ br(__ LO, NEXT);
 3224 
 3225       // Maybe we have a lot of data
 3226       __ subsw(rscratch1, len, bulk_width * block_size);
 3227       __ br(__ HS, CTR_large_block);
 3228       __ BIND(large_block_return);
 3229       __ cbzw(len, DONE);
 3230 
 3231       // Setup the counter
 3232       __ movi(v4, __ T4S, 0);
 3233       __ movi(v5, __ T4S, 1);
 3234       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3235 
 3236       // 128-bit big-endian increment
 3237       __ ld1(v0, __ T16B, counter);
 3238       __ rev64(v16, __ T16B, v0);
 3239       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3240       __ rev64(v16, __ T16B, v16);
 3241       __ st1(v16, __ T16B, counter);
 3242       // Previous counter value is in v0
 3243       // v4 contains { 0, 1 }
 3244 
 3245       {
 3246         // We have fewer than bulk_width blocks of data left. Encrypt
 3247         // them one by one until there is less than a full block
 3248         // remaining, being careful to save both the encrypted counter
 3249         // and the counter.
 3250 
 3251         Label inner_loop;
 3252         __ bind(inner_loop);
 3253         // Counter to encrypt is in v0
 3254         __ aesecb_encrypt(noreg, noreg, keylen);
 3255         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3256 
 3257         // Do we have a remaining full block?
 3258 
 3259         __ mov(used, 0);
 3260         __ cmp(len, block_size);
 3261         __ br(__ LO, NEXT);
 3262 
 3263         // Yes, we have a full block
 3264         __ ldrq(v1, Address(in, offset));
 3265         __ eor(v1, __ T16B, v1, v0);
 3266         __ strq(v1, Address(out, offset));
 3267         __ mov(used, block_size);
 3268         __ add(offset, offset, block_size);
 3269 
 3270         __ subw(len, len, block_size);
 3271         __ cbzw(len, DONE);
 3272 
 3273         // Increment the counter, store it back
 3274         __ orr(v0, __ T16B, v16, v16);
 3275         __ rev64(v16, __ T16B, v16);
 3276         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3277         __ rev64(v16, __ T16B, v16);
 3278         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3279 
 3280         __ b(inner_loop);
 3281       }
 3282 
 3283       __ BIND(NEXT);
 3284 
 3285       // Encrypt a single byte, and loop.
 3286       // We expect this to be a rare event.
 3287       __ ldrb(rscratch1, Address(in, offset));
 3288       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3289       __ eor(rscratch1, rscratch1, rscratch2);
 3290       __ strb(rscratch1, Address(out, offset));
 3291       __ add(offset, offset, 1);
 3292       __ add(used, used, 1);
 3293       __ subw(len, len, 1);
 3294       __ cbnzw(len, L_CTR_loop);
 3295     }
 3296 
 3297     __ bind(DONE);
 3298     __ strw(used, Address(used_ptr));
 3299     __ mov(r0, saved_len);
 3300 
 3301     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3302     __ ret(lr);
 3303 
 3304     // Bulk encryption
 3305 
 3306     __ BIND (CTR_large_block);
 3307     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3308 
 3309     if (bulk_width == 8) {
 3310       __ sub(sp, sp, 4 * 16);
 3311       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3312     }
 3313     __ sub(sp, sp, 4 * 16);
 3314     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3315     RegSet saved_regs = (RegSet::of(in, out, offset)
 3316                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3317     __ push(saved_regs, sp);
 3318     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3319     __ add(in, in, offset);
 3320     __ add(out, out, offset);
 3321 
 3322     // Keys should already be loaded into the correct registers
 3323 
 3324     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3325     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3326 
 3327     // AES/CTR loop
 3328     {
 3329       Label L_CTR_loop;
 3330       __ BIND(L_CTR_loop);
 3331 
 3332       // Setup the counters
 3333       __ movi(v8, __ T4S, 0);
 3334       __ movi(v9, __ T4S, 1);
 3335       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3336 
 3337       for (int i = 0; i < bulk_width; i++) {
 3338         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3339         __ rev64(v0_ofs, __ T16B, v16);
 3340         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3341       }
 3342 
 3343       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3344 
 3345       // Encrypt the counters
 3346       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3347 
 3348       if (bulk_width == 8) {
 3349         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3350       }
 3351 
 3352       // XOR the encrypted counters with the inputs
 3353       for (int i = 0; i < bulk_width; i++) {
 3354         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3355         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3356         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3357       }
 3358 
 3359       // Write the encrypted data
 3360       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3361       if (bulk_width == 8) {
 3362         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3363       }
 3364 
 3365       __ subw(len, len, 16 * bulk_width);
 3366       __ cbnzw(len, L_CTR_loop);
 3367     }
 3368 
 3369     // Save the counter back where it goes
 3370     __ rev64(v16, __ T16B, v16);
 3371     __ st1(v16, __ T16B, counter);
 3372 
 3373     __ pop(saved_regs, sp);
 3374 
 3375     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3376     if (bulk_width == 8) {
 3377       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3378     }
 3379 
 3380     __ andr(rscratch1, len, -16 * bulk_width);
 3381     __ sub(len, len, rscratch1);
 3382     __ add(offset, offset, rscratch1);
 3383     __ mov(used, 16);
 3384     __ strw(used, Address(used_ptr));
 3385     __ b(large_block_return);
 3386 
 3387     return start;
 3388   }
 3389 
 3390   // Vector AES Galois Counter Mode implementation. Parameters:
 3391   //
 3392   // in = c_rarg0
 3393   // len = c_rarg1
 3394   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3395   // out = c_rarg3
 3396   // key = c_rarg4
 3397   // state = c_rarg5 - GHASH.state
 3398   // subkeyHtbl = c_rarg6 - powers of H
 3399   // counter = c_rarg7 - 16 bytes of CTR
 3400   // return - number of processed bytes
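        // Only whole multiples of 8 blocks (128 bytes) are processed here; the
        // length is rounded down accordingly (see the andr(len, len, -16 * 8)
        // below) and any remainder is left to the caller.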
 3401   address generate_galoisCounterMode_AESCrypt() {
 3402     address ghash_polynomial = __ pc();
 3403     __ emit_int64(0x87);  // The low-order bits of the field
 3404                           // polynomial (i.e. p = z^7+z^2+z+1)
 3405                           // repeated in the low and high parts of a
 3406                           // 128-bit vector
 3407     __ emit_int64(0x87);
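          // (The full GHASH field polynomial is z^128 + z^7 + z^2 + z + 1; only
          // the low-order terms, 0x87, need to be materialized for the reduction.)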
 3408 
 3409     __ align(CodeEntryAlignment);
 3410     StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id;
 3411     StubCodeMark mark(this, stub_id);
 3412     address start = __ pc();
 3413     __ enter();
 3414 
 3415     const Register in = c_rarg0;
 3416     const Register len = c_rarg1;
 3417     const Register ct = c_rarg2;
 3418     const Register out = c_rarg3;
 3420 
 3421     const Register key = c_rarg4;
 3422     const Register state = c_rarg5;
 3423 
 3424     const Register subkeyHtbl = c_rarg6;
 3425 
 3426     const Register counter = c_rarg7;  // 16 bytes of CTR; updated with the incremented counter at the end
 3427 
 3428     const Register keylen = r10;
 3429     // Save state before entering routine
 3430     __ sub(sp, sp, 4 * 16);
 3431     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3432     __ sub(sp, sp, 4 * 16);
 3433     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3434 
 3436     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3437     __ str(len, __ pre(sp, -2 * wordSize));
 3438 
 3439     Label DONE;
 3440     __ cbz(len, DONE);
 3441 
 3442     // Compute #rounds for AES based on the length of the key array
 3443     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3444 
 3445     __ aesenc_loadkeys(key, keylen);
 3446     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3447     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3448 
 3449     // AES/CTR loop
 3450     {
 3451       Label L_CTR_loop;
 3452       __ BIND(L_CTR_loop);
 3453 
 3454       // Setup the counters
 3455       __ movi(v8, __ T4S, 0);
 3456       __ movi(v9, __ T4S, 1);
 3457       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3458 
 3459       assert(v0->encoding() < v8->encoding(), "");
 3460       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3461         FloatRegister f = as_FloatRegister(i);
 3462         __ rev32(f, __ T16B, v16);
 3463         __ addv(v16, __ T4S, v16, v8);
 3464       }
 3465 
 3466       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3467 
 3468       // Encrypt the counters
 3469       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3470 
 3471       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3472 
 3473       // XOR the encrypted counters with the inputs
 3474       for (int i = 0; i < 8; i++) {
 3475         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3476         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3477         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3478       }
 3479       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3480       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3481 
 3482       __ subw(len, len, 16 * 8);
 3483       __ cbnzw(len, L_CTR_loop);
 3484     }
 3485 
 3486     __ rev32(v16, __ T16B, v16);
 3487     __ st1(v16, __ T16B, counter);
 3488 
 3489     __ ldr(len, Address(sp));
 3490     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3491 
 3492     // GHASH/CTR loop
 3493     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3494                                 len, /*unrolls*/4);
 3495 
 3496 #ifdef ASSERT
 3497     { Label L;
 3498       __ cmp(len, (unsigned char)0);
 3499       __ br(Assembler::EQ, L);
 3500       __ stop("stubGenerator: abort");
 3501       __ bind(L);
 3502     }
 3503 #endif
 3504 
 3505     __ bind(DONE);
 3506     // Return the number of bytes processed
 3507     __ ldr(r0, __ post(sp, 2 * wordSize));
 3508 
 3509     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3510     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3511 
 3512     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3513     __ ret(lr);
 3514     return start;
 3515   }
 3516 
 3517   class Cached64Bytes {
 3518   private:
 3519     MacroAssembler *_masm;
 3520     Register _regs[8];
 3521 
 3522   public:
 3523     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 3524       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3525       auto it = rs.begin();
 3526       for (auto &r: _regs) {
 3527         r = *it;
 3528         ++it;
 3529       }
 3530     }
 3531 
 3532     void gen_loads(Register base) {
 3533       for (int i = 0; i < 8; i += 2) {
 3534         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3535       }
 3536     }
 3537 
 3538     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3539     void extract_u32(Register dest, int i) {
 3540       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3541     }
 3542   };
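        // Typical use of Cached64Bytes (a sketch; regs/block are placeholder names):
        //
        //   Cached64Bytes cache(_masm, regs);  // regs must hold exactly 8 registers
        //   cache.gen_loads(block);            // pull the 64-byte block into the registers
        //   cache.extract_u32(rscratch1, k);   // k-th 4-byte word of the block, 0 <= k < 16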
 3543 
 3544   // Utility routines for md5.
 3545   // Clobbers r10 and r11.
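        // For reference, the scalar form of the four round operations below, as
        // given in RFC 1321 (x[k] is the k-th 4-byte word of the block and rotl
        // is a left rotation):
        //
        //   FF: r1 = r2 + rotl(r1 + ((r2 & r3) | (~r2 & r4)) + x[k] + t, s)
        //   GG: r1 = r2 + rotl(r1 + ((r2 & r4) | (r3 & ~r4)) + x[k] + t, s)
        //   HH: r1 = r2 + rotl(r1 + (r2 ^ r3 ^ r4)           + x[k] + t, s)
        //   II: r1 = r2 + rotl(r1 + (r3 ^ (r2 | ~r4))        + x[k] + t, s)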
 3546   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3547               int k, int s, int t) {
 3548     Register rscratch3 = r10;
 3549     Register rscratch4 = r11;
 3550 
 3551     __ eorw(rscratch3, r3, r4);
 3552     __ movw(rscratch2, t);
 3553     __ andw(rscratch3, rscratch3, r2);
 3554     __ addw(rscratch4, r1, rscratch2);
 3555     reg_cache.extract_u32(rscratch1, k);
 3556     __ eorw(rscratch3, rscratch3, r4);
 3557     __ addw(rscratch4, rscratch4, rscratch1);
 3558     __ addw(rscratch3, rscratch3, rscratch4);
 3559     __ rorw(rscratch2, rscratch3, 32 - s);
 3560     __ addw(r1, rscratch2, r2);
 3561   }
 3562 
 3563   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3564               int k, int s, int t) {
 3565     Register rscratch3 = r10;
 3566     Register rscratch4 = r11;
 3567 
 3568     reg_cache.extract_u32(rscratch1, k);
 3569     __ movw(rscratch2, t);
 3570     __ addw(rscratch4, r1, rscratch2);
 3571     __ addw(rscratch4, rscratch4, rscratch1);
 3572     __ bicw(rscratch2, r3, r4);
 3573     __ andw(rscratch3, r2, r4);
 3574     __ addw(rscratch2, rscratch2, rscratch4);
 3575     __ addw(rscratch2, rscratch2, rscratch3);
 3576     __ rorw(rscratch2, rscratch2, 32 - s);
 3577     __ addw(r1, rscratch2, r2);
 3578   }
 3579 
 3580   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3581               int k, int s, int t) {
 3582     Register rscratch3 = r10;
 3583     Register rscratch4 = r11;
 3584 
 3585     __ eorw(rscratch3, r3, r4);
 3586     __ movw(rscratch2, t);
 3587     __ addw(rscratch4, r1, rscratch2);
 3588     reg_cache.extract_u32(rscratch1, k);
 3589     __ eorw(rscratch3, rscratch3, r2);
 3590     __ addw(rscratch4, rscratch4, rscratch1);
 3591     __ addw(rscratch3, rscratch3, rscratch4);
 3592     __ rorw(rscratch2, rscratch3, 32 - s);
 3593     __ addw(r1, rscratch2, r2);
 3594   }
 3595 
 3596   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3597               int k, int s, int t) {
 3598     Register rscratch3 = r10;
 3599     Register rscratch4 = r11;
 3600 
 3601     __ movw(rscratch3, t);
 3602     __ ornw(rscratch2, r2, r4);
 3603     __ addw(rscratch4, r1, rscratch3);
 3604     reg_cache.extract_u32(rscratch1, k);
 3605     __ eorw(rscratch3, rscratch2, r3);
 3606     __ addw(rscratch4, rscratch4, rscratch1);
 3607     __ addw(rscratch3, rscratch3, rscratch4);
 3608     __ rorw(rscratch2, rscratch3, 32 - s);
 3609     __ addw(r1, rscratch2, r2);
 3610   }
 3611 
 3612   // Arguments:
 3613   //
 3614   // Inputs:
 3615   //   c_rarg0   - byte[]  source+offset
 3616   //   c_rarg1   - int[]   SHA.state
 3617   //   c_rarg2   - int     offset
 3618   //   c_rarg3   - int     limit
 3619   //
 3620   address generate_md5_implCompress(StubGenStubId stub_id) {
 3621     bool multi_block;
 3622     switch (stub_id) {
 3623     case md5_implCompress_id:
 3624       multi_block = false;
 3625       break;
 3626     case md5_implCompressMB_id:
 3627       multi_block = true;
 3628       break;
 3629     default:
 3630       ShouldNotReachHere();
 3631     }
 3632     __ align(CodeEntryAlignment);
 3633 
 3634     StubCodeMark mark(this, stub_id);
 3635     address start = __ pc();
 3636 
 3637     Register buf       = c_rarg0;
 3638     Register state     = c_rarg1;
 3639     Register ofs       = c_rarg2;
 3640     Register limit     = c_rarg3;
 3641     Register a         = r4;
 3642     Register b         = r5;
 3643     Register c         = r6;
 3644     Register d         = r7;
 3645     Register rscratch3 = r10;
 3646     Register rscratch4 = r11;
 3647 
 3648     Register state_regs[2] = { r12, r13 };
 3649     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3650     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3651 
 3652     __ push(saved_regs, sp);
 3653 
 3654     __ ldp(state_regs[0], state_regs[1], Address(state));
 3655     __ ubfx(a, state_regs[0],  0, 32);
 3656     __ ubfx(b, state_regs[0], 32, 32);
 3657     __ ubfx(c, state_regs[1],  0, 32);
 3658     __ ubfx(d, state_regs[1], 32, 32);
 3659 
 3660     Label md5_loop;
 3661     __ BIND(md5_loop);
 3662 
 3663     reg_cache.gen_loads(buf);
 3664 
 3665     // Round 1
 3666     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3667     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3668     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3669     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3670     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3671     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3672     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3673     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3674     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3675     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3676     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3677     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3678     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3679     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3680     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3681     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3682 
 3683     // Round 2
 3684     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3685     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3686     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3687     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3688     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3689     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3690     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3691     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3692     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3693     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3694     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3695     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3696     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3697     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3698     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3699     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3700 
 3701     // Round 3
 3702     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3703     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3704     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3705     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3706     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3707     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3708     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3709     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3710     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3711     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3712     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3713     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3714     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3715     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3716     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3717     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3718 
 3719     // Round 4
 3720     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3721     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3722     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3723     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3724     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3725     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3726     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3727     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3728     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3729     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3730     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3731     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3732     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3733     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3734     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3735     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3736 
 3737     __ addw(a, state_regs[0], a);
 3738     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3739     __ addw(b, rscratch2, b);
 3740     __ addw(c, state_regs[1], c);
 3741     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3742     __ addw(d, rscratch4, d);
 3743 
 3744     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3745     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3746 
 3747     if (multi_block) {
 3748       __ add(buf, buf, 64);
 3749       __ add(ofs, ofs, 64);
 3750       __ cmp(ofs, limit);
 3751       __ br(Assembler::LE, md5_loop);
 3752       __ mov(c_rarg0, ofs); // return ofs
 3753     }
 3754 
 3755     // write hash values back in the correct order
 3756     __ stp(state_regs[0], state_regs[1], Address(state));
 3757 
 3758     __ pop(saved_regs, sp);
 3759 
 3760     __ ret(lr);
 3761 
 3762     return start;
 3763   }
 3764 
 3765   // Arguments:
 3766   //
 3767   // Inputs:
 3768   //   c_rarg0   - byte[]  source+offset
 3769   //   c_rarg1   - int[]   SHA.state
 3770   //   c_rarg2   - int     offset
 3771   //   c_rarg3   - int     limit
 3772   //
 3773   address generate_sha1_implCompress(StubGenStubId stub_id) {
 3774     bool multi_block;
 3775     switch (stub_id) {
 3776     case sha1_implCompress_id:
 3777       multi_block = false;
 3778       break;
 3779     case sha1_implCompressMB_id:
 3780       multi_block = true;
 3781       break;
 3782     default:
 3783       ShouldNotReachHere();
 3784     }
 3785 
 3786     __ align(CodeEntryAlignment);
 3787 
 3788     StubCodeMark mark(this, stub_id);
 3789     address start = __ pc();
 3790 
 3791     Register buf   = c_rarg0;
 3792     Register state = c_rarg1;
 3793     Register ofs   = c_rarg2;
 3794     Register limit = c_rarg3;
 3795 
 3796     Label keys;
 3797     Label sha1_loop;
 3798 
 3799     // load the keys into v0..v3
 3800     __ adr(rscratch1, keys);
 3801     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3802     // load 5 words state into v6, v7
 3803     __ ldrq(v6, Address(state, 0));
 3804     __ ldrs(v7, Address(state, 16));
 3805 
 3806 
 3807     __ BIND(sha1_loop);
 3808     // load 64 bytes of data into v16..v19
 3809     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3810     __ rev32(v16, __ T16B, v16);
 3811     __ rev32(v17, __ T16B, v17);
 3812     __ rev32(v18, __ T16B, v18);
 3813     __ rev32(v19, __ T16B, v19);
 3814 
 3815     // do the sha1
 3816     __ addv(v4, __ T4S, v16, v0);
 3817     __ orr(v20, __ T16B, v6, v6);
 3818 
 3819     FloatRegister d0 = v16;
 3820     FloatRegister d1 = v17;
 3821     FloatRegister d2 = v18;
 3822     FloatRegister d3 = v19;
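          // d0..d3 rotate through the four message-schedule vectors; each pass of
          // the loop below retires four of SHA-1's 80 rounds (sha1c/sha1p/sha1m
          // each cover four rounds, with sha1su0/sha1su1 updating the schedule).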
 3823 
 3824     for (int round = 0; round < 20; round++) {
 3825       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3826       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3827       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3828       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3829       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3830 
 3831       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3832       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3833       __ sha1h(tmp2, __ T4S, v20);
 3834       if (round < 5)
 3835         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3836       else if (round < 10 || round >= 15)
 3837         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3838       else
 3839         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3840       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3841 
 3842       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3843     }
 3844 
 3845     __ addv(v7, __ T2S, v7, v21);
 3846     __ addv(v6, __ T4S, v6, v20);
 3847 
 3848     if (multi_block) {
 3849       __ add(ofs, ofs, 64);
 3850       __ cmp(ofs, limit);
 3851       __ br(Assembler::LE, sha1_loop);
 3852       __ mov(c_rarg0, ofs); // return ofs
 3853     }
 3854 
 3855     __ strq(v6, Address(state, 0));
 3856     __ strs(v7, Address(state, 16));
 3857 
 3858     __ ret(lr);
 3859 
 3860     __ bind(keys);
 3861     __ emit_int32(0x5a827999);
 3862     __ emit_int32(0x6ed9eba1);
 3863     __ emit_int32(0x8f1bbcdc);
 3864     __ emit_int32(0xca62c1d6);
 3865 
 3866     return start;
 3867   }
 3868 
 3869 
 3870   // Arguments:
 3871   //
 3872   // Inputs:
 3873   //   c_rarg0   - byte[]  source+offset
 3874   //   c_rarg1   - int[]   SHA.state
 3875   //   c_rarg2   - int     offset
 3876   //   c_rarg3   - int     limit
 3877   //
 3878   address generate_sha256_implCompress(StubGenStubId stub_id) {
 3879     bool multi_block;
 3880     switch (stub_id) {
 3881     case sha256_implCompress_id:
 3882       multi_block = false;
 3883       break;
 3884     case sha256_implCompressMB_id:
 3885       multi_block = true;
 3886       break;
 3887     default:
 3888       ShouldNotReachHere();
 3889     }
 3890 
 3891     static const uint32_t round_consts[64] = {
 3892       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3893       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3894       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3895       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3896       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3897       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3898       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3899       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3900       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3901       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3902       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3903       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3904       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3905       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3906       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3907       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3908     };
 3909 
 3910     __ align(CodeEntryAlignment);
 3911 
 3912     StubCodeMark mark(this, stub_id);
 3913     address start = __ pc();
 3914 
 3915     Register buf   = c_rarg0;
 3916     Register state = c_rarg1;
 3917     Register ofs   = c_rarg2;
 3918     Register limit = c_rarg3;
 3919 
 3920     Label sha1_loop;
 3921 
 3922     __ stpd(v8, v9, __ pre(sp, -32));
 3923     __ stpd(v10, v11, Address(sp, 16));
 3924 
 3925     // dga == v0
 3926     // dgb == v1
 3927     // dg0 == v2
 3928     // dg1 == v3
 3929     // dg2 == v4
 3930     // t0 == v6
 3931     // t1 == v7
 3932 
 3933     // load 16 keys to v16..v31
 3934     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3935     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3936     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3937     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3938     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3939 
 3940     // load 8 words (256 bits) state
 3941     __ ldpq(v0, v1, state);
 3942 
 3943     __ BIND(sha1_loop);
 3944     // load 64 bytes of data into v8..v11
 3945     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3946     __ rev32(v8, __ T16B, v8);
 3947     __ rev32(v9, __ T16B, v9);
 3948     __ rev32(v10, __ T16B, v10);
 3949     __ rev32(v11, __ T16B, v11);
 3950 
 3951     __ addv(v6, __ T4S, v8, v16);
 3952     __ orr(v2, __ T16B, v0, v0);
 3953     __ orr(v3, __ T16B, v1, v1);
 3954 
 3955     FloatRegister d0 = v8;
 3956     FloatRegister d1 = v9;
 3957     FloatRegister d2 = v10;
 3958     FloatRegister d3 = v11;
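          // As in the SHA-1 stub, d0..d3 rotate through the message-schedule
          // vectors; each pass of the loop below retires four of SHA-256's 64
          // rounds (one sha256h/sha256h2 pair per 4-word key + schedule vector).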
 3959 
 3960 
 3961     for (int round = 0; round < 16; round++) {
 3962       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 3963       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 3964       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 3965       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 3966 
 3967       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 3968       __ orr(v4, __ T16B, v2, v2);
 3969       if (round < 15)
 3970         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 3971       __ sha256h(v2, __ T4S, v3, tmp2);
 3972       __ sha256h2(v3, __ T4S, v4, tmp2);
 3973       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 3974 
 3975       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3976     }
 3977 
 3978     __ addv(v0, __ T4S, v0, v2);
 3979     __ addv(v1, __ T4S, v1, v3);
 3980 
 3981     if (multi_block) {
 3982       __ add(ofs, ofs, 64);
 3983       __ cmp(ofs, limit);
 3984       __ br(Assembler::LE, sha1_loop);
 3985       __ mov(c_rarg0, ofs); // return ofs
 3986     }
 3987 
 3988     __ ldpd(v10, v11, Address(sp, 16));
 3989     __ ldpd(v8, v9, __ post(sp, 32));
 3990 
 3991     __ stpq(v0, v1, state);
 3992 
 3993     __ ret(lr);
 3994 
 3995     return start;
 3996   }
 3997 
 3998   // Double rounds for sha512.
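        // Each call performs two of SHA-512's 80 rounds (one sha512h/sha512h2
        // pair), so the compress loop below issues 40 of these per 128-byte
        // block; the first 32 calls also extend the message schedule via
        // sha512su0/sha512su1.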
 3999   void sha512_dround(int dr,
 4000                      FloatRegister vi0, FloatRegister vi1,
 4001                      FloatRegister vi2, FloatRegister vi3,
 4002                      FloatRegister vi4, FloatRegister vrc0,
 4003                      FloatRegister vrc1, FloatRegister vin0,
 4004                      FloatRegister vin1, FloatRegister vin2,
 4005                      FloatRegister vin3, FloatRegister vin4) {
 4006       if (dr < 36) {
 4007         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4008       }
 4009       __ addv(v5, __ T2D, vrc0, vin0);
 4010       __ ext(v6, __ T16B, vi2, vi3, 8);
 4011       __ ext(v5, __ T16B, v5, v5, 8);
 4012       __ ext(v7, __ T16B, vi1, vi2, 8);
 4013       __ addv(vi3, __ T2D, vi3, v5);
 4014       if (dr < 32) {
 4015         __ ext(v5, __ T16B, vin3, vin4, 8);
 4016         __ sha512su0(vin0, __ T2D, vin1);
 4017       }
 4018       __ sha512h(vi3, __ T2D, v6, v7);
 4019       if (dr < 32) {
 4020         __ sha512su1(vin0, __ T2D, vin2, v5);
 4021       }
 4022       __ addv(vi4, __ T2D, vi1, vi3);
 4023       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4024   }
 4025 
 4026   // Arguments:
 4027   //
 4028   // Inputs:
 4029   //   c_rarg0   - byte[]  source+offset
 4030   //   c_rarg1   - int[]   SHA.state
 4031   //   c_rarg2   - int     offset
 4032   //   c_rarg3   - int     limit
 4033   //
 4034   address generate_sha512_implCompress(StubGenStubId stub_id) {
 4035     bool multi_block;
 4036     switch (stub_id) {
 4037     case sha512_implCompress_id:
 4038       multi_block = false;
 4039       break;
 4040     case sha512_implCompressMB_id:
 4041       multi_block = true;
 4042       break;
 4043     default:
 4044       ShouldNotReachHere();
 4045     }
 4046 
 4047     static const uint64_t round_consts[80] = {
 4048       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 4049       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 4050       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 4051       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 4052       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 4053       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 4054       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 4055       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 4056       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 4057       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 4058       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 4059       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 4060       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 4061       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 4062       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 4063       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 4064       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 4065       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 4066       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 4067       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 4068       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 4069       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 4070       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 4071       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 4072       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 4073       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 4074       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 4075     };
 4076 
 4077     __ align(CodeEntryAlignment);
 4078 
 4079     StubCodeMark mark(this, stub_id);
 4080     address start = __ pc();
 4081 
 4082     Register buf   = c_rarg0;
 4083     Register state = c_rarg1;
 4084     Register ofs   = c_rarg2;
 4085     Register limit = c_rarg3;
 4086 
 4087     __ stpd(v8, v9, __ pre(sp, -64));
 4088     __ stpd(v10, v11, Address(sp, 16));
 4089     __ stpd(v12, v13, Address(sp, 32));
 4090     __ stpd(v14, v15, Address(sp, 48));
 4091 
 4092     Label sha512_loop;
 4093 
 4094     // load state
 4095     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4096 
 4097     // load first 4 round constants
 4098     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4099     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4100 
 4101     __ BIND(sha512_loop);
 4102     // load 128B of data into v12..v19
 4103     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4104     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4105     __ rev64(v12, __ T16B, v12);
 4106     __ rev64(v13, __ T16B, v13);
 4107     __ rev64(v14, __ T16B, v14);
 4108     __ rev64(v15, __ T16B, v15);
 4109     __ rev64(v16, __ T16B, v16);
 4110     __ rev64(v17, __ T16B, v17);
 4111     __ rev64(v18, __ T16B, v18);
 4112     __ rev64(v19, __ T16B, v19);
 4113 
 4114     __ mov(rscratch2, rscratch1);
 4115 
 4116     __ mov(v0, __ T16B, v8);
 4117     __ mov(v1, __ T16B, v9);
 4118     __ mov(v2, __ T16B, v10);
 4119     __ mov(v3, __ T16B, v11);
 4120 
 4121     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4122     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4123     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4124     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4125     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4126     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4127     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4128     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4129     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4130     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4131     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4132     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4133     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4134     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4135     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4136     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4137     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4138     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4139     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4140     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4141     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4142     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4143     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4144     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4145     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4146     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4147     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4148     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4149     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4150     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4151     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4152     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4153     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4154     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4155     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4156     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4157     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4158     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4159     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4160     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4161 
 4162     __ addv(v8, __ T2D, v8, v0);
 4163     __ addv(v9, __ T2D, v9, v1);
 4164     __ addv(v10, __ T2D, v10, v2);
 4165     __ addv(v11, __ T2D, v11, v3);
 4166 
 4167     if (multi_block) {
 4168       __ add(ofs, ofs, 128);
 4169       __ cmp(ofs, limit);
 4170       __ br(Assembler::LE, sha512_loop);
 4171       __ mov(c_rarg0, ofs); // return ofs
 4172     }
 4173 
 4174     __ st1(v8, v9, v10, v11, __ T2D, state);
 4175 
 4176     __ ldpd(v14, v15, Address(sp, 48));
 4177     __ ldpd(v12, v13, Address(sp, 32));
 4178     __ ldpd(v10, v11, Address(sp, 16));
 4179     __ ldpd(v8, v9, __ post(sp, 64));
 4180 
 4181     __ ret(lr);
 4182 
 4183     return start;
 4184   }
 4185 
  // Execute one keccak round on two computations in parallel.
  // One of the states should be loaded into the lower halves of
  // the vector registers v0-v24, the other should be loaded into
  // the upper halves of those registers. The ld1r instruction loads
  // the round constant into both halves of register v31.
  // Intermediate results c0...c4 and d0...d4 are computed
  // in registers v25...v30.
  // All vector instructions that are used operate on both register
  // halves in parallel.
  // If only a single computation is needed, one can load only the lower halves.
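  //
  // For reference, one scalar Keccak-f[1600] round over the 5x5 lane
  // state a[0..24] has the following shape (a sketch only, not code
  // that is assembled here; rol64 denotes a 64-bit left rotation):
  //
  //   uint64_t c[5], d[5];
  //   for (int x = 0; x < 5; x++)                          // theta
  //     c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
  //   for (int x = 0; x < 5; x++)
  //     d[x] = c[(x + 4) % 5] ^ rol64(c[(x + 1) % 5], 1);
  //   // rho/pi: xor each lane with the d value of its column and
  //   // rotate it by its rho offset into its pi target lane (the
  //   // xar steps below)
  //   // chi: a[5*y + x] ^= ~a[5*y + (x+1)%5] & a[5*y + (x+2)%5]
  //   // (the bcax steps below)
  //   a[0] ^= round_constants[i];                          // iota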
 4196   void keccak_round(Register rscratch1) {
 4197   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4200   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4201   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4202   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4203   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4204   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4205   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4206   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4207 
 4208   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4209   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4210   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4211   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4212   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4213 
 4214   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4215   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4217   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4218   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4219   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4220   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4221   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4222   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4223   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4224   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4225   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4226   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4227   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4228   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4229   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4230   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4231   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4232   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4233   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4234   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4235   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4236   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4237   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4238   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4239 
 4240   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4241   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4242   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4243   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4244   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4245 
 4246   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4247 
 4248   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4249   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4250   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4251   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4252   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4253 
 4254   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4255   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4256   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4257   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4258   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4259 
 4260   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4261   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4262   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4263   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4264   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4265 
 4266   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4267   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4268   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4269   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4270   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4271 
 4272   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4273   }
 4274 
 4275   // Arguments:
 4276   //
 4277   // Inputs:
 4278   //   c_rarg0   - byte[]  source+offset
 4279   //   c_rarg1   - byte[]  SHA.state
 4280   //   c_rarg2   - int     block_size
 4281   //   c_rarg3   - int     offset
 4282   //   c_rarg4   - int     limit
 4283   //
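  // The block_size argument is the sponge rate in bytes and identifies
  // the variant being computed (as the branches below check):
  //
  //   SHA3-224 -> 144   SHA3-256 -> 136   SHA3-384 -> 104
  //   SHA3-512 ->  72   SHAKE128 -> 168   SHAKE256 -> 136
  //
  // (for the SHA3-* digests, rate = 200 - 2 * digest length in bytes).
  //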
 4284   address generate_sha3_implCompress(StubGenStubId stub_id) {
 4285     bool multi_block;
 4286     switch (stub_id) {
 4287     case sha3_implCompress_id:
 4288       multi_block = false;
 4289       break;
 4290     case sha3_implCompressMB_id:
 4291       multi_block = true;
 4292       break;
 4293     default:
 4294       ShouldNotReachHere();
 4295     }
 4296 
 4297     static const uint64_t round_consts[24] = {
 4298       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4299       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4300       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4301       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4302       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4303       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4304       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4305       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4306     };
 4307 
 4308     __ align(CodeEntryAlignment);
 4309 
 4310     StubCodeMark mark(this, stub_id);
 4311     address start = __ pc();
 4312 
 4313     Register buf           = c_rarg0;
 4314     Register state         = c_rarg1;
 4315     Register block_size    = c_rarg2;
 4316     Register ofs           = c_rarg3;
 4317     Register limit         = c_rarg4;
 4318 
 4319     Label sha3_loop, rounds24_loop;
 4320     Label sha3_512_or_sha3_384, shake128;
 4321 
 4322     __ stpd(v8, v9, __ pre(sp, -64));
 4323     __ stpd(v10, v11, Address(sp, 16));
 4324     __ stpd(v12, v13, Address(sp, 32));
 4325     __ stpd(v14, v15, Address(sp, 48));
 4326 
 4327     // load state
 4328     __ add(rscratch1, state, 32);
 4329     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4330     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4331     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4332     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4333     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4334     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4335     __ ld1(v24, __ T1D, rscratch1);
 4336 
 4337     __ BIND(sha3_loop);
 4338 
 4339     // 24 keccak rounds
 4340     __ movw(rscratch2, 24);
 4341 
 4342     // load round_constants base
 4343     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4344 
 4345     // load input
 4346     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4347     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4348     __ eor(v0, __ T8B, v0, v25);
 4349     __ eor(v1, __ T8B, v1, v26);
 4350     __ eor(v2, __ T8B, v2, v27);
 4351     __ eor(v3, __ T8B, v3, v28);
 4352     __ eor(v4, __ T8B, v4, v29);
 4353     __ eor(v5, __ T8B, v5, v30);
 4354     __ eor(v6, __ T8B, v6, v31);
 4355 
 4356     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4357     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4358 
 4359     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4360     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4361     __ eor(v7, __ T8B, v7, v25);
 4362     __ eor(v8, __ T8B, v8, v26);
 4363     __ eor(v9, __ T8B, v9, v27);
 4364     __ eor(v10, __ T8B, v10, v28);
 4365     __ eor(v11, __ T8B, v11, v29);
 4366     __ eor(v12, __ T8B, v12, v30);
 4367     __ eor(v13, __ T8B, v13, v31);
 4368 
 4369     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4370     __ eor(v14, __ T8B, v14, v25);
 4371     __ eor(v15, __ T8B, v15, v26);
 4372     __ eor(v16, __ T8B, v16, v27);
 4373 
 4374     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4375     __ andw(c_rarg5, block_size, 48);
 4376     __ cbzw(c_rarg5, rounds24_loop);
 4377 
 4378     __ tbnz(block_size, 5, shake128);
 4379     // block_size == 144, bit5 == 0, SHA3-224
 4380     __ ldrd(v28, __ post(buf, 8));
 4381     __ eor(v17, __ T8B, v17, v28);
 4382     __ b(rounds24_loop);
 4383 
 4384     __ BIND(shake128);
 4385     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4386     __ eor(v17, __ T8B, v17, v28);
 4387     __ eor(v18, __ T8B, v18, v29);
 4388     __ eor(v19, __ T8B, v19, v30);
 4389     __ eor(v20, __ T8B, v20, v31);
 4390     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4391 
 4392     __ BIND(sha3_512_or_sha3_384);
 4393     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4394     __ eor(v7, __ T8B, v7, v25);
 4395     __ eor(v8, __ T8B, v8, v26);
 4396     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4397 
 4398     // SHA3-384
 4399     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4400     __ eor(v9,  __ T8B, v9,  v27);
 4401     __ eor(v10, __ T8B, v10, v28);
 4402     __ eor(v11, __ T8B, v11, v29);
 4403     __ eor(v12, __ T8B, v12, v30);
 4404 
 4405     __ BIND(rounds24_loop);
 4406     __ subw(rscratch2, rscratch2, 1);
 4407 
 4408     keccak_round(rscratch1);
 4409 
 4410     __ cbnzw(rscratch2, rounds24_loop);
 4411 
 4412     if (multi_block) {
 4413       __ add(ofs, ofs, block_size);
 4414       __ cmp(ofs, limit);
 4415       __ br(Assembler::LE, sha3_loop);
 4416       __ mov(c_rarg0, ofs); // return ofs
 4417     }
 4418 
 4419     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4420     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4421     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4422     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4423     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4424     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4425     __ st1(v24, __ T1D, state);
 4426 
 4427     // restore callee-saved registers
 4428     __ ldpd(v14, v15, Address(sp, 48));
 4429     __ ldpd(v12, v13, Address(sp, 32));
 4430     __ ldpd(v10, v11, Address(sp, 16));
 4431     __ ldpd(v8, v9, __ post(sp, 64));
 4432 
 4433     __ ret(lr);
 4434 
 4435     return start;
 4436   }
 4437 
 4438   // Inputs:
 4439   //   c_rarg0   - long[]  state0
 4440   //   c_rarg1   - long[]  state1
 4441   address generate_double_keccak() {
 4442     static const uint64_t round_consts[24] = {
 4443       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4444       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4445       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4446       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4447       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4448       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4449       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4450       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4451     };
 4452 
 4453     // Implements the double_keccak() method of the
    // sun.security.provider.SHA3Parallel class
 4455     __ align(CodeEntryAlignment);
 4456     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4457     address start = __ pc();
 4458     __ enter();
 4459 
 4460     Register state0        = c_rarg0;
 4461     Register state1        = c_rarg1;
 4462 
 4463     Label rounds24_loop;
 4464 
 4465     // save callee-saved registers
 4466     __ stpd(v8, v9, __ pre(sp, -64));
 4467     __ stpd(v10, v11, Address(sp, 16));
 4468     __ stpd(v12, v13, Address(sp, 32));
 4469     __ stpd(v14, v15, Address(sp, 48));
 4470 
 4471     // load states
 4472     __ add(rscratch1, state0, 32);
 4473     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4474     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4475     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4476     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4477     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4478     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4479     __ ld1(v24, __ D, 0, rscratch1);
 4480     __ add(rscratch1, state1, 32);
 4481     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4482     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4483     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4484     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4485     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4486     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4487     __ ld1(v24, __ D, 1, rscratch1);
 4488 
 4489     // 24 keccak rounds
 4490     __ movw(rscratch2, 24);
 4491 
 4492     // load round_constants base
 4493     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4494 
 4495     __ BIND(rounds24_loop);
 4496     __ subw(rscratch2, rscratch2, 1);
 4497     keccak_round(rscratch1);
 4498     __ cbnzw(rscratch2, rounds24_loop);
 4499 
 4500     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4501     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4502     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4503     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4504     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4505     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4506     __ st1(v24, __ D, 0, state0);
 4507     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4508     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4509     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4510     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4511     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4512     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4513     __ st1(v24, __ D, 1, state1);
 4514 
 4515     // restore callee-saved vector registers
 4516     __ ldpd(v14, v15, Address(sp, 48));
 4517     __ ldpd(v12, v13, Address(sp, 32));
 4518     __ ldpd(v10, v11, Address(sp, 16));
 4519     __ ldpd(v8, v9, __ post(sp, 64));
 4520 
 4521     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4522     __ mov(r0, zr); // return 0
 4523     __ ret(lr);
 4524 
 4525     return start;
 4526   }
 4527 
 4528   // ChaCha20 block function.  This version parallelizes the 32-bit
 4529   // state elements on each of 16 vectors, producing 4 blocks of
 4530   // keystream at a time.
 4531   //
 4532   // state (int[16]) = c_rarg0
 4533   // keystream (byte[256]) = c_rarg1
 4534   // return - number of bytes of produced keystream (always 256)
 4535   //
 4536   // This implementation takes each 32-bit integer from the state
 4537   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4538   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4539   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4540   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4541   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4542   // operations represented by one QUARTERROUND function, we instead stack all
 4543   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4544   // and then do the same for the second set of 4 quarter rounds.  This removes
 4545   // some latency that would otherwise be incurred by waiting for an add to
 4546   // complete before performing an xor (which depends on the result of the
 4547   // add), etc. An adjustment happens between the first and second groups of 4
 4548   // quarter rounds, but this is done only in the inputs to the macro functions
 4549   // that generate the assembly instructions - these adjustments themselves are
 4550   // not part of the resulting assembly.
 4551   // The 4 registers v0-v3 are used during the quarter round operations as
 4552   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4553   // registers become the vectors involved in adding the start state back onto
  // the post-QR working state.  After the adds are complete, each of the 16
  // vectors writes its first lane back to the keystream buffer, followed
  // by the second lane from all vectors, and so on.
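  //
  // For reference, the scalar quarter round from RFC 7539 that each
  // group of cc20_qr_* calls below applies to four (a, b, c, d)
  // column or diagonal tuples at once (a sketch only, not code that
  // is assembled here):
  //
  //   #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
  //   void quarter_round(uint32_t st[16], int a, int b, int c, int d) {
  //     st[a] += st[b]; st[d] ^= st[a]; st[d] = ROTL32(st[d], 16);
  //     st[c] += st[d]; st[b] ^= st[c]; st[b] = ROTL32(st[b], 12);
  //     st[a] += st[b]; st[d] ^= st[a]; st[d] = ROTL32(st[d],  8);
  //     st[c] += st[d]; st[b] ^= st[c]; st[b] = ROTL32(st[b],  7);
  //   }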
 4557   address generate_chacha20Block_blockpar() {
 4558     Label L_twoRounds, L_cc20_const;
 4559     // The constant data is broken into two 128-bit segments to be loaded
 4560     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4561     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
    // The second 128 bits are a table constant used for 8-bit left rotations.
 4563     __ BIND(L_cc20_const);
 4564     __ emit_int64(0x0000000100000000UL);
 4565     __ emit_int64(0x0000000300000002UL);
 4566     __ emit_int64(0x0605040702010003UL);
 4567     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4568 
 4569     __ align(CodeEntryAlignment);
 4570     StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
 4571     StubCodeMark mark(this, stub_id);
 4572     address start = __ pc();
 4573     __ enter();
 4574 
 4575     int i, j;
 4576     const Register state = c_rarg0;
 4577     const Register keystream = c_rarg1;
 4578     const Register loopCtr = r10;
 4579     const Register tmpAddr = r11;
 4580     const FloatRegister ctrAddOverlay = v28;
 4581     const FloatRegister lrot8Tbl = v29;
 4582 
 4583     // Organize SIMD registers in an array that facilitates
 4584     // putting repetitive opcodes into loop structures.  It is
 4585     // important that each grouping of 4 registers is monotonically
 4586     // increasing to support the requirements of multi-register
 4587     // instructions (e.g. ld4r, st4, etc.)
 4588     const FloatRegister workSt[16] = {
 4589          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4590         v20, v21, v22, v23, v24, v25, v26, v27
 4591     };
 4592 
 4593     // Pull in constant data.  The first 16 bytes are the add overlay
 4594     // which is applied to the vector holding the counter (state[12]).
 4595     // The second 16 bytes is the index register for the 8-bit left
 4596     // rotation tbl instruction.
 4597     __ adr(tmpAddr, L_cc20_const);
 4598     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4599 
 4600     // Load from memory and interlace across 16 SIMD registers,
 4601     // With each word from memory being broadcast to all lanes of
    // with each word from memory being broadcast to all lanes of
 4603     //      Addr(0) -> All lanes in workSt[i]
 4604     //      Addr(4) -> All lanes workSt[i + 1], etc.
 4605     __ mov(tmpAddr, state);
 4606     for (i = 0; i < 16; i += 4) {
 4607       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4608           __ post(tmpAddr, 16));
 4609     }
 4610     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4611 
 4612     // Before entering the loop, create 5 4-register arrays.  These
 4613     // will hold the 4 registers that represent the a/b/c/d fields
 4614     // in the quarter round operation.  For instance the "b" field
 4615     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4616     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4617     // since it is part of a diagonal organization.  The aSet and scratch
 4618     // register sets are defined at declaration time because they do not change
 4619     // organization at any point during the 20-round processing.
 4620     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4621     FloatRegister bSet[4];
 4622     FloatRegister cSet[4];
 4623     FloatRegister dSet[4];
 4624     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4625 
 4626     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4627     __ mov(loopCtr, 10);
 4628     __ BIND(L_twoRounds);
 4629 
 4630     // Set to columnar organization and do the following 4 quarter-rounds:
 4631     // QUARTERROUND(0, 4, 8, 12)
 4632     // QUARTERROUND(1, 5, 9, 13)
 4633     // QUARTERROUND(2, 6, 10, 14)
 4634     // QUARTERROUND(3, 7, 11, 15)
 4635     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4636     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4637     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4638 
 4639     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4640     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4641     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4642 
 4643     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4644     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4645     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4646 
 4647     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4648     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4649     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4650 
 4651     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4652     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4654 
 4655     // Set to diagonal organization and do the next 4 quarter-rounds:
 4656     // QUARTERROUND(0, 5, 10, 15)
 4657     // QUARTERROUND(1, 6, 11, 12)
 4658     // QUARTERROUND(2, 7, 8, 13)
 4659     // QUARTERROUND(3, 4, 9, 14)
 4660     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4661     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4662     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4663 
 4664     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4665     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4666     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4667 
 4668     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4669     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4670     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4671 
 4672     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4673     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4674     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4675 
 4676     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4677     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4679 
 4680     // Decrement and iterate
 4681     __ sub(loopCtr, loopCtr, 1);
 4682     __ cbnz(loopCtr, L_twoRounds);
 4683 
 4684     __ mov(tmpAddr, state);
 4685 
 4686     // Add the starting state back to the post-loop keystream
 4687     // state.  We read/interlace the state array from memory into
 4688     // 4 registers similar to what we did in the beginning.  Then
 4689     // add the counter overlay onto workSt[12] at the end.
 4690     for (i = 0; i < 16; i += 4) {
 4691       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4692       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4693       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4694       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4695       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4696     }
 4697     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4698 
 4699     // Write working state into the keystream buffer.  This is accomplished
 4700     // by taking the lane "i" from each of the four vectors and writing
 4701     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4702     // repeating with the next 4 vectors until all 16 vectors have been used.
 4703     // Then move to the next lane and repeat the process until all lanes have
 4704     // been written.
 4705     for (i = 0; i < 4; i++) {
 4706       for (j = 0; j < 16; j += 4) {
 4707         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4708             __ post(keystream, 16));
 4709       }
 4710     }
 4711 
 4712     __ mov(r0, 256);             // Return length of output keystream
 4713     __ leave();
 4714     __ ret(lr);
 4715 
 4716     return start;
 4717   }
 4718 
 4719   // Helpers to schedule parallel operation bundles across vector
 4720   // register sequences of size 2, 4 or 8.
 4721 
 4722   // Implement various primitive computations across vector sequences
 4723 
 4724   template<int N>
 4725   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4726                const VSeq<N>& v1, const VSeq<N>& v2) {
 4727     // output must not be constant
 4728     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4729     // output cannot overwrite pending inputs
 4730     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4731     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4732     for (int i = 0; i < N; i++) {
 4733       __ addv(v[i], T, v1[i], v2[i]);
 4734     }
 4735   }
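  // For example (a usage sketch, assuming the VSeq<N>(n) constructor
  // names the N consecutive vector registers v<n>..v<n+N-1>, as in the
  // Kyber stubs below):
  //
  //   VSeq<4> va(0), vb(4), vc(8);
  //   vs_addv(va, __ T8H, vb, vc);   // addv v0,v4,v8 ... addv v3,v7,v11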
 4736 
 4737   template<int N>
 4738   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4739                const VSeq<N>& v1, const VSeq<N>& v2) {
 4740     // output must not be constant
 4741     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4742     // output cannot overwrite pending inputs
 4743     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4744     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4745     for (int i = 0; i < N; i++) {
 4746       __ subv(v[i], T, v1[i], v2[i]);
 4747     }
 4748   }
 4749 
 4750   template<int N>
 4751   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4752                const VSeq<N>& v1, const VSeq<N>& v2) {
 4753     // output must not be constant
 4754     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4755     // output cannot overwrite pending inputs
 4756     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4757     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4758     for (int i = 0; i < N; i++) {
 4759       __ mulv(v[i], T, v1[i], v2[i]);
 4760     }
 4761   }
 4762 
 4763   template<int N>
 4764   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4765     // output must not be constant
 4766     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4767     // output cannot overwrite pending inputs
 4768     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4769     for (int i = 0; i < N; i++) {
 4770       __ negr(v[i], T, v1[i]);
 4771     }
 4772   }
 4773 
 4774   template<int N>
 4775   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4776                const VSeq<N>& v1, int shift) {
 4777     // output must not be constant
 4778     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4779     // output cannot overwrite pending inputs
 4780     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4781     for (int i = 0; i < N; i++) {
 4782       __ sshr(v[i], T, v1[i], shift);
 4783     }
 4784   }
 4785 
 4786   template<int N>
 4787   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4788     // output must not be constant
 4789     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4790     // output cannot overwrite pending inputs
 4791     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4792     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4793     for (int i = 0; i < N; i++) {
 4794       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4795     }
 4796   }
 4797 
 4798   template<int N>
 4799   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4800     // output must not be constant
 4801     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4802     // output cannot overwrite pending inputs
 4803     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4804     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4805     for (int i = 0; i < N; i++) {
 4806       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4807     }
 4808   }
 4809 
 4810   template<int N>
 4811   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4812     // output must not be constant
 4813     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4814     // output cannot overwrite pending inputs
 4815     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4816     for (int i = 0; i < N; i++) {
 4817       __ notr(v[i], __ T16B, v1[i]);
 4818     }
 4819   }
 4820 
 4821   template<int N>
 4822   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4823     // output must not be constant
 4824     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4825     // output cannot overwrite pending inputs
 4826     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4827     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4828     for (int i = 0; i < N; i++) {
 4829       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4830     }
 4831   }
 4832 
 4833   template<int N>
 4834   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4835     // output must not be constant
 4836     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4837     // output cannot overwrite pending inputs
 4838     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4839     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4840     for (int i = 0; i < N; i++) {
 4841       __ mlsv(v[i], T, v1[i], v2[i]);
 4842     }
 4843   }
 4844 
 4845   // load N/2 successive pairs of quadword values from memory in order
 4846   // into N successive vector registers of the sequence via the
 4847   // address supplied in base.
 4848   template<int N>
 4849   void vs_ldpq(const VSeq<N>& v, Register base) {
 4850     for (int i = 0; i < N; i += 2) {
      __ ldpq(v[i], v[i+1], Address(base, 16 * i));
 4852     }
 4853   }
 4854 
 4855   // load N/2 successive pairs of quadword values from memory in order
 4856   // into N vector registers of the sequence via the address supplied
 4857   // in base using post-increment addressing
 4858   template<int N>
 4859   void vs_ldpq_post(const VSeq<N>& v, Register base) {
    static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
 4861     for (int i = 0; i < N; i += 2) {
 4862       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4863     }
 4864   }
 4865 
 4866   // store N successive vector registers of the sequence into N/2
 4867   // successive pairs of quadword memory locations via the address
 4868   // supplied in base using post-increment addressing
 4869   template<int N>
 4870   void vs_stpq_post(const VSeq<N>& v, Register base) {
    static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
 4872     for (int i = 0; i < N; i += 2) {
 4873       __ stpq(v[i], v[i+1], __ post(base, 32));
 4874     }
 4875   }
 4876 
 4877   // load N/2 pairs of quadword values from memory de-interleaved into
 4878   // N vector registers 2 at a time via the address supplied in base
 4879   // using post-increment addressing.
 4880   template<int N>
 4881   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
 4883     for (int i = 0; i < N; i += 2) {
 4884       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4885     }
 4886   }
 4887 
 4888   // store N vector registers interleaved into N/2 pairs of quadword
 4889   // memory locations via the address supplied in base using
 4890   // post-increment addressing.
 4891   template<int N>
 4892   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
 4894     for (int i = 0; i < N; i += 2) {
 4895       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4896     }
 4897   }
 4898 
 4899   // load N quadword values from memory de-interleaved into N vector
 4900   // registers 3 elements at a time via the address supplied in base.
 4901   template<int N>
 4902   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4903     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4904     for (int i = 0; i < N; i += 3) {
 4905       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4906     }
 4907   }
 4908 
 4909   // load N quadword values from memory de-interleaved into N vector
 4910   // registers 3 elements at a time via the address supplied in base
 4911   // using post-increment addressing.
 4912   template<int N>
 4913   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4914     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4915     for (int i = 0; i < N; i += 3) {
 4916       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4917     }
 4918   }
 4919 
 4920   // load N/2 pairs of quadword values from memory into N vector
 4921   // registers via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
 4923   // offsets array
 4924   template<int N>
 4925   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4926     for (int i = 0; i < N/2; i++) {
 4927       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4928     }
 4929   }
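  // For example, the Kyber NTT level 2 code below issues
  //
  //   int offsets1[4] = { 0, 32, 128, 160 };
  //   vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
  //
  // which loads the four register pairs of vs1 from the quadword pairs
  // at coeffs+64, coeffs+96, coeffs+192 and coeffs+224.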
 4930 
 4931   // store N vector registers into N/2 pairs of quadword memory
 4932   // locations via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
 4934   // offsets array
 4935   template<int N>
  void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4937     for (int i = 0; i < N/2; i++) {
 4938       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4939     }
 4940   }
 4941 
 4942   // load N single quadword values from memory into N vector registers
 4943   // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
 4945   // array
 4946   template<int N>
 4947   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4948                       int start, int (&offsets)[N]) {
 4949     for (int i = 0; i < N; i++) {
 4950       __ ldr(v[i], T, Address(base, start + offsets[i]));
 4951     }
 4952   }
 4953 
 4954   // store N vector registers into N single quadword memory locations
 4955   // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
 4957   // array
 4958   template<int N>
 4959   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4960                       int start, int (&offsets)[N]) {
 4961     for (int i = 0; i < N; i++) {
 4962       __ str(v[i], T, Address(base, start + offsets[i]));
 4963     }
 4964   }
 4965 
 4966   // load N/2 pairs of quadword values from memory de-interleaved into
 4967   // N vector registers 2 at a time via the address supplied in base
  // with each pair indexed using the start offset plus the
 4969   // corresponding entry in the offsets array
 4970   template<int N>
 4971   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4972                       Register tmp, int start, int (&offsets)[N/2]) {
 4973     for (int i = 0; i < N/2; i++) {
 4974       __ add(tmp, base, start + offsets[i]);
 4975       __ ld2(v[2*i], v[2*i+1], T, tmp);
 4976     }
 4977   }
 4978 
 4979   // store N vector registers 2 at a time interleaved into N/2 pairs
 4980   // of quadword memory locations via the address supplied in base
  // with each pair indexed using the start offset plus the
 4982   // corresponding entry in the offsets array
 4983   template<int N>
 4984   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4985                       Register tmp, int start, int (&offsets)[N/2]) {
 4986     for (int i = 0; i < N/2; i++) {
 4987       __ add(tmp, base, start + offsets[i]);
 4988       __ st2(v[2*i], v[2*i+1], T, tmp);
 4989     }
 4990   }
 4991 
 4992   // Helper routines for various flavours of Montgomery multiply
 4993 
  // Perform 16 32-bit (4x4S) or 32 16-bit (4x8H) Montgomery
  // multiplications in parallel.
 4997 
 4998   // See the montMul() method of the sun.security.provider.ML_DSA
 4999   // class.
 5000   //
  // Computes 4x4S results or 4x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
 5003   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5004   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5005   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5006   // Outputs: va - 4x4S or 4x8H vector register sequences
 5007   // vb, vc, vtmp and vq must all be disjoint
 5008   // va must be disjoint from all other inputs/temps or must equal vc
 5009   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5010   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5011   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5012                    Assembler::SIMD_Arrangement T,
 5013                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5014     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5015     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5016     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5017     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5018 
 5019     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5020     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5021 
 5022     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5023 
 5024     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5025     assert(vs_disjoint(va, vb), "va and vb overlap");
 5026     assert(vs_disjoint(va, vq), "va and vq overlap");
 5027     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5028     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5029 
 5030     // schedule 4 streams of instructions across the vector sequences
 5031     for (int i = 0; i < 4; i++) {
 5032       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5033       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5034     }
 5035 
 5036     for (int i = 0; i < 4; i++) {
 5037       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5038     }
 5039 
 5040     for (int i = 0; i < 4; i++) {
 5041       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5042     }
 5043 
 5044     for (int i = 0; i < 4; i++) {
 5045       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5046     }
 5047   }
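  // For reference, a scalar sketch of the 16-bit (8H) variant of the
  // reduction scheduled above (illustration only, not code that is
  // assembled here; it assumes MONT_Q_INV_MOD_R == MONT_Q^-1 mod 2^16,
  // as its name suggests):
  //
  //   int16_t montmul16(int16_t b, int16_t c) {
  //     int32_t bc = (int32_t)b * c;
  //     int16_t m  = (int16_t)((uint32_t)bc * MONT_Q_INV_MOD_R); // low 16 bits
  //     int32_t t  = bc - (int32_t)m * MONT_Q;                   // divisible by 2^16
  //     return (int16_t)(t >> 16);  // congruent to b * c * 2^-16 mod MONT_Q
  //   }
  //
  // The vector schedule computes the same value as
  // (hi16(2*b*c) - hi16(2*m*q)) / 2 using sqdmulh and shsubv.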
 5048 
  // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
  // multiplications in parallel.
 5052 
 5053   // See the montMul() method of the sun.security.provider.ML_DSA
 5054   // class.
 5055   //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
 5062   // vb, vc, vtmp and vq must all be disjoint
 5063   // va must be disjoint from all other inputs/temps or must equal vc
 5064   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5065   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5066   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5067                    Assembler::SIMD_Arrangement T,
 5068                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5069     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5070     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5071     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5072     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5073 
 5074     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5075     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5076 
 5077     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5078 
 5079     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5080     assert(vs_disjoint(va, vb), "va and vb overlap");
 5081     assert(vs_disjoint(va, vq), "va and vq overlap");
 5082     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5083     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5084 
 5085     // schedule 2 streams of instructions across the vector sequences
 5086     for (int i = 0; i < 2; i++) {
 5087       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5088       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5089     }
 5090 
 5091     for (int i = 0; i < 2; i++) {
 5092       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5093     }
 5094 
 5095     for (int i = 0; i < 2; i++) {
 5096       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5097     }
 5098 
 5099     for (int i = 0; i < 2; i++) {
 5100       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5101     }
 5102   }
 5103 
 5104   // Perform 16 16-bit Montgomery multiplications in parallel.
 5105   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5106                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5107     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5108     // It will assert that the register use is valid
 5109     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5110   }
 5111 
 5112   // Perform 32 16-bit Montgomery multiplications in parallel.
 5113   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5114                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5115     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5116     // It will assert that the register use is valid
 5117     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5118   }
 5119 
 5120   // Perform 64 16-bit Montgomery multiplications in parallel.
 5121   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5122                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5123     // Schedule two successive 4x8H multiplies via the montmul helper
 5124     // on the front and back halves of va, vb and vc. The helper will
 5125     // assert that the register use has no overlap conflicts on each
 5126     // individual call but we also need to ensure that the necessary
 5127     // disjoint/equality constraints are met across both calls.
 5128 
 5129     // vb, vc, vtmp and vq must be disjoint. va must either be
 5130     // disjoint from all other registers or equal vc
 5131 
 5132     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5133     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5134     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5135 
 5136     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5137     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5138 
 5139     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5140 
 5141     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5142     assert(vs_disjoint(va, vb), "va and vb overlap");
 5143     assert(vs_disjoint(va, vq), "va and vq overlap");
 5144     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5145 
 5146     // we multiply the front and back halves of each sequence 4 at a
 5147     // time because
 5148     //
 5149     // 1) we are currently only able to get 4-way instruction
 5150     // parallelism at best
 5151     //
 5152     // 2) we need registers for the constants in vq and temporary
 5153     // scratch registers to hold intermediate results so vtmp can only
 5154     // be a VSeq<4> which means we only have 4 scratch slots
 5155 
 5156     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5157     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5158   }
 5159 
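  // Compute a Cooley-Tukey style forward-NTT butterfly on 32
  // coefficient pairs: (a0, a1) -> (a0 + zeta*a1, a0 - zeta*a1),
  // where the product is formed by kyber_montmul32 (and so carries
  // the Montgomery 2^-16 factor) and vc holds the zetas on entry.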
 5160   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5161                                const VSeq<4>& vc,
 5162                                const VSeq<4>& vtmp,
 5163                                const VSeq<2>& vq) {
 5164     // compute a = montmul(a1, c)
 5165     kyber_montmul32(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
 5167     vs_subv(va1, __ T8H, va0, vc);
 5168     //    and a0 = a0 + a
 5169     vs_addv(va0, __ T8H, va0, vc);
 5170   }
 5171 
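  // Compute a Gentleman-Sande style inverse-NTT butterfly on 32
  // coefficient pairs: (a0, a1) -> (a0 + a1, zeta*(a0 - a1)), where
  // the product is again a Montgomery multiply and vb holds the
  // multipliers (typically the zetas).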
 5172   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5173                                const VSeq<4>& vb,
 5174                                const VSeq<4>& vtmp1,
 5175                                const VSeq<4>& vtmp2,
 5176                                const VSeq<2>& vq) {
 5177     // compute c = a0 - a1
 5178     vs_subv(vtmp1, __ T8H, va0, va1);
 5179     // output a0 = a0 + a1
 5180     vs_addv(va0, __ T8H, va0, va1);
 5181     // output a1 = b montmul c
 5182     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5183   }
 5184 
 5185   void load64shorts(const VSeq<8>& v, Register shorts) {
 5186     vs_ldpq_post(v, shorts);
 5187   }
 5188 
 5189   void load32shorts(const VSeq<4>& v, Register shorts) {
 5190     vs_ldpq_post(v, shorts);
 5191   }
 5192 
 5193   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5194     vs_stpq_post(v, tmpAddr);
 5195   }
 5196 
 5197   // Kyber NTT function.
 5198   // Implements
 5199   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5200   //
 5201   // coeffs (short[256]) = c_rarg0
 5202   // ntt_zetas (short[256]) = c_rarg1
 5203   address generate_kyberNtt() {
 5204 
 5205     __ align(CodeEntryAlignment);
 5206     StubGenStubId stub_id = StubGenStubId::kyberNtt_id;
 5207     StubCodeMark mark(this, stub_id);
 5208     address start = __ pc();
 5209     __ enter();
 5210 
 5211     const Register coeffs = c_rarg0;
 5212     const Register zetas = c_rarg1;
 5213 
 5214     const Register kyberConsts = r10;
 5215     const Register tmpAddr = r11;
 5216 
 5217     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5218     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5219     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5220 
 5221     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5222     // load the montmul constants
 5223     vs_ldpq(vq, kyberConsts);
 5224 
 5225     // Each level corresponds to an iteration of the outermost loop of the
 5226     // Java method seilerNTT(int[] coeffs). There are some differences
 5227     // from what is done in the seilerNTT() method, though:
    // 1. The computation uses 16-bit signed values; we do not convert them
    // to ints here.
    // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
    // this array for each level, which makes it easier to fill up the vector
    // registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    // multiplications (that way there should not be any overflow during the
    // inverse NTT computation); here we use R = 2^16 so that we can use
    // 16-bit arithmetic in the vector unit.
 5237     //
 5238     // On each level, we fill up the vector registers in such a way that the
 5239     // array elements that need to be multiplied by the zetas go into one
 5240     // set of vector registers while the corresponding ones that don't need to
 5241     // be multiplied, go into another set.
 5242     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5243     // registers interleaving the steps of 4 identical computations,
 5244     // each done on 8 16-bit values per register.
 5245 
 5246     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5247     // to the zetas occur in discrete blocks whose size is some multiple
 5248     // of 32.
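    //
    // In scalar form, each butterfly within a block updates one pair of
    // coefficients at distance len roughly as follows (a sketch of the
    // corresponding Java loop body with illustrative names, not code
    // that is assembled here):
    //
    //   short t = montmul(zeta, coeffs[j + len]);
    //   coeffs[j + len] = (short)(coeffs[j] - t);
    //   coeffs[j]       = (short)(coeffs[j] + t);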
 5249 
 5250     // level 0
 5251     __ add(tmpAddr, coeffs, 256);
 5252     load64shorts(vs1, tmpAddr);
 5253     load64shorts(vs2, zetas);
 5254     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5255     __ add(tmpAddr, coeffs, 0);
 5256     load64shorts(vs1, tmpAddr);
 5257     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5258     vs_addv(vs1, __ T8H, vs1, vs2);
 5259     __ add(tmpAddr, coeffs, 0);
 5260     vs_stpq_post(vs1, tmpAddr);
 5261     __ add(tmpAddr, coeffs, 256);
 5262     vs_stpq_post(vs3, tmpAddr);
 5263     // restore montmul constants
 5264     vs_ldpq(vq, kyberConsts);
 5265     load64shorts(vs1, tmpAddr);
 5266     load64shorts(vs2, zetas);
 5267     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5268     __ add(tmpAddr, coeffs, 128);
 5269     load64shorts(vs1, tmpAddr);
 5270     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5271     vs_addv(vs1, __ T8H, vs1, vs2);
 5272     __ add(tmpAddr, coeffs, 128);
 5273     store64shorts(vs1, tmpAddr);
 5274     __ add(tmpAddr, coeffs, 384);
 5275     store64shorts(vs3, tmpAddr);
 5276 
 5277     // level 1
 5278     // restore montmul constants
 5279     vs_ldpq(vq, kyberConsts);
 5280     __ add(tmpAddr, coeffs, 128);
 5281     load64shorts(vs1, tmpAddr);
 5282     load64shorts(vs2, zetas);
 5283     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5284     __ add(tmpAddr, coeffs, 0);
 5285     load64shorts(vs1, tmpAddr);
 5286     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5287     vs_addv(vs1, __ T8H, vs1, vs2);
 5288     __ add(tmpAddr, coeffs, 0);
 5289     store64shorts(vs1, tmpAddr);
 5290     store64shorts(vs3, tmpAddr);
 5291     vs_ldpq(vq, kyberConsts);
 5292     __ add(tmpAddr, coeffs, 384);
 5293     load64shorts(vs1, tmpAddr);
 5294     load64shorts(vs2, zetas);
 5295     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5296     __ add(tmpAddr, coeffs, 256);
 5297     load64shorts(vs1, tmpAddr);
 5298     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5299     vs_addv(vs1, __ T8H, vs1, vs2);
 5300     __ add(tmpAddr, coeffs, 256);
 5301     store64shorts(vs1, tmpAddr);
 5302     store64shorts(vs3, tmpAddr);
 5303 
 5304     // level 2
 5305     vs_ldpq(vq, kyberConsts);
 5306     int offsets1[4] = { 0, 32, 128, 160 };
 5307     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5308     load64shorts(vs2, zetas);
 5309     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5310     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5312     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5313     vs_addv(vs1, __ T8H, vs1, vs2);
 5314     __ add(tmpAddr, coeffs, 0);
 5315     vs_stpq_post(vs_front(vs1), tmpAddr);
 5316     vs_stpq_post(vs_front(vs3), tmpAddr);
 5317     vs_stpq_post(vs_back(vs1), tmpAddr);
 5318     vs_stpq_post(vs_back(vs3), tmpAddr);
 5319     vs_ldpq(vq, kyberConsts);
 5320     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5321     load64shorts(vs2, zetas);
 5322     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5323     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5325     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5326     vs_addv(vs1, __ T8H, vs1, vs2);
 5327     __ add(tmpAddr, coeffs, 256);
 5328     vs_stpq_post(vs_front(vs1), tmpAddr);
 5329     vs_stpq_post(vs_front(vs3), tmpAddr);
 5330     vs_stpq_post(vs_back(vs1), tmpAddr);
 5331     vs_stpq_post(vs_back(vs3), tmpAddr);
 5332 
 5333     // level 3
 5334     vs_ldpq(vq, kyberConsts);
 5335     int offsets2[4] = { 0, 64, 128, 192 };
 5336     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5337     load64shorts(vs2, zetas);
 5338     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5339     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5340     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5341     vs_addv(vs1, __ T8H, vs1, vs2);
 5342     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5343     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5344 
 5345     vs_ldpq(vq, kyberConsts);
 5346     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5347     load64shorts(vs2, zetas);
 5348     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5349     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5350     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5351     vs_addv(vs1, __ T8H, vs1, vs2);
 5352     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5353     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5354 
 5355     // level 4
 5356     // At level 4 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
 5358 
 5359     vs_ldpq(vq, kyberConsts);
 5360     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5361     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5362     load64shorts(vs2, zetas);
 5363     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5364     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5365     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5366     vs_addv(vs1, __ T8H, vs1, vs2);
 5367     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5368     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5369 
 5370     vs_ldpq(vq, kyberConsts);
 5371     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5372     load64shorts(vs2, zetas);
 5373     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5374     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5375     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5376     vs_addv(vs1, __ T8H, vs1, vs2);
 5377     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5378     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5379 
 5380     // level 5
    // At level 5 related coefficients occur in discrete blocks of size 8,
    // so they need to be loaded interleaved using an ld2 operation with
    // arrangement 2D.
 5383 
 5384     vs_ldpq(vq, kyberConsts);
 5385     int offsets4[4] = { 0, 32, 64, 96 };
 5386     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5387     load32shorts(vs_front(vs2), zetas);
 5388     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5389     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5390     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5391     load32shorts(vs_front(vs2), zetas);
 5392     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5393     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5394     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5395     load32shorts(vs_front(vs2), zetas);
 5396     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5397     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5398 
 5399     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5400     load32shorts(vs_front(vs2), zetas);
 5401     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5402     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5403 
 5404     // level 6
 5405     // At level 6 related coefficients occur in discrete blocks of size 4 so
 5406     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5407 
 5408     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5409     load32shorts(vs_front(vs2), zetas);
 5410     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5411     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5412     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5414     load32shorts(vs_front(vs2), zetas);
 5415     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5416     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5417 
 5418     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5419     load32shorts(vs_front(vs2), zetas);
 5420     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5421     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5422 
 5423     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5424     load32shorts(vs_front(vs2), zetas);
 5425     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5426     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5427 
 5428     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5429     __ mov(r0, zr); // return 0
 5430     __ ret(lr);
 5431 
 5432     return start;
 5433   }
 5434 
 5435   // Kyber Inverse NTT function
 5436   // Implements
 5437   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5438   //
 5439   // coeffs (short[256]) = c_rarg0
 5440   // ntt_zetas (short[256]) = c_rarg1
 5441   address generate_kyberInverseNtt() {
 5442 
 5443     __ align(CodeEntryAlignment);
 5444     StubGenStubId stub_id = StubGenStubId::kyberInverseNtt_id;
 5445     StubCodeMark mark(this, stub_id);
 5446     address start = __ pc();
 5447     __ enter();
 5448 
 5449     const Register coeffs = c_rarg0;
 5450     const Register zetas = c_rarg1;
 5451 
 5452     const Register kyberConsts = r10;
 5453     const Register tmpAddr = r11;
 5454     const Register tmpAddr2 = c_rarg2;
 5455 
 5456     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5457     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5458     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5459 
 5460     __ lea(kyberConsts,
 5461              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5462 
 5463     // level 0
 5464     // At level 0 related coefficients occur in discrete blocks of size 4 so
 5465     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5466 
 5467     vs_ldpq(vq, kyberConsts);
 5468     int offsets4[4] = { 0, 32, 64, 96 };
 5469     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5470     load32shorts(vs_front(vs2), zetas);
 5471     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5472                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5473     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5474     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5475     load32shorts(vs_front(vs2), zetas);
 5476     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5477                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5478     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5479     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5480     load32shorts(vs_front(vs2), zetas);
 5481     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5482                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5483     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5484     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5485     load32shorts(vs_front(vs2), zetas);
 5486     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5487                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5488     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5489 
 5490     // level 1
 5491     // At level 1 related coefficients occur in discrete blocks of size 8 so
 5492     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5493 
 5494     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5495     load32shorts(vs_front(vs2), zetas);
 5496     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5497                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5498     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5499     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5500     load32shorts(vs_front(vs2), zetas);
 5501     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5502                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5503     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5504 
 5505     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5506     load32shorts(vs_front(vs2), zetas);
 5507     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5508                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5509     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5510     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5511     load32shorts(vs_front(vs2), zetas);
 5512     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5513                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5514     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5515 
 5516     // level 2
 5517     // At level 2 coefficients occur in 8 discrete blocks of size 16
 5518     // so they are loaded using an ldr at 8 distinct offsets.
 5519 
 5520     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5521     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5522     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5523     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5524     vs_subv(vs1, __ T8H, vs1, vs2);
 5525     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5526     load64shorts(vs2, zetas);
 5527     vs_ldpq(vq, kyberConsts);
 5528     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5529     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5530 
 5531     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5532     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5533     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5534     vs_subv(vs1, __ T8H, vs1, vs2);
 5535     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5536     load64shorts(vs2, zetas);
 5537     vs_ldpq(vq, kyberConsts);
 5538     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5539     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5540 
 5541     // Barrett reduction at indexes where overflow may happen
 5542 
 5543     // load q and the multiplier for the Barrett reduction
 5544     __ add(tmpAddr, kyberConsts, 16);
 5545     vs_ldpq(vq, tmpAddr);
 5546 
 5547     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5548     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5549     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
 5550     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5551     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5552     vs_sshr(vs2, __ T8H, vs2, 11);
 5553     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5554     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5555     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5556     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5557     vs_sshr(vs2, __ T8H, vs2, 11);
 5558     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5559     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5560 
 5561     // level 3
 5562     // From level 3 upwards coefficients occur in discrete blocks whose size is
 5563     // some multiple of 32 so they can be loaded using ldpq and suitable indexes.
 5564 
 5565     int offsets2[4] = { 0, 64, 128, 192 };
 5566     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5567     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5568     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5569     vs_subv(vs1, __ T8H, vs1, vs2);
 5570     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5571     load64shorts(vs2, zetas);
 5572     vs_ldpq(vq, kyberConsts);
 5573     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5574     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5575 
 5576     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5577     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5578     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5579     vs_subv(vs1, __ T8H, vs1, vs2);
 5580     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5581     load64shorts(vs2, zetas);
 5582     vs_ldpq(vq, kyberConsts);
 5583     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5584     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5585 
 5586     // level 4
 5587 
 5588     int offsets1[4] = { 0, 32, 128, 160 };
 5589     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5590     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5591     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5592     vs_subv(vs1, __ T8H, vs1, vs2);
 5593     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5594     load64shorts(vs2, zetas);
 5595     vs_ldpq(vq, kyberConsts);
 5596     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5597     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5598 
 5599     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5600     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5601     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5602     vs_subv(vs1, __ T8H, vs1, vs2);
 5603     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5604     load64shorts(vs2, zetas);
 5605     vs_ldpq(vq, kyberConsts);
 5606     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5607     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5608 
 5609     // level 5
 5610 
 5611     __ add(tmpAddr, coeffs, 0);
 5612     load64shorts(vs1, tmpAddr);
 5613     __ add(tmpAddr, coeffs, 128);
 5614     load64shorts(vs2, tmpAddr);
 5615     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5616     vs_subv(vs1, __ T8H, vs1, vs2);
 5617     __ add(tmpAddr, coeffs, 0);
 5618     store64shorts(vs3, tmpAddr);
 5619     load64shorts(vs2, zetas);
 5620     vs_ldpq(vq, kyberConsts);
 5621     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5622     __ add(tmpAddr, coeffs, 128);
 5623     store64shorts(vs2, tmpAddr);
 5624 
 5625     load64shorts(vs1, tmpAddr);
 5626     __ add(tmpAddr, coeffs, 384);
 5627     load64shorts(vs2, tmpAddr);
 5628     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5629     vs_subv(vs1, __ T8H, vs1, vs2);
 5630     __ add(tmpAddr, coeffs, 256);
 5631     store64shorts(vs3, tmpAddr);
 5632     load64shorts(vs2, zetas);
 5633     vs_ldpq(vq, kyberConsts);
 5634     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5635     __ add(tmpAddr, coeffs, 384);
 5636     store64shorts(vs2, tmpAddr);
 5637 
 5638     // Barrett reduction at indexes where overflow may happen
 5639 
 5640     // load q and the multiplier for the Barrett reduction
 5641     __ add(tmpAddr, kyberConsts, 16);
 5642     vs_ldpq(vq, tmpAddr);
 5643 
 5644     int offsets0[2] = { 0, 256 };
 5645     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5646     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5647     vs_sshr(vs2, __ T8H, vs2, 11);
 5648     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5649     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5650 
 5651     // level 6
 5652 
 5653     __ add(tmpAddr, coeffs, 0);
 5654     load64shorts(vs1, tmpAddr);
 5655     __ add(tmpAddr, coeffs, 256);
 5656     load64shorts(vs2, tmpAddr);
 5657     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5658     vs_subv(vs1, __ T8H, vs1, vs2);
 5659     __ add(tmpAddr, coeffs, 0);
 5660     store64shorts(vs3, tmpAddr);
 5661     load64shorts(vs2, zetas);
 5662     vs_ldpq(vq, kyberConsts);
 5663     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5664     __ add(tmpAddr, coeffs, 256);
 5665     store64shorts(vs2, tmpAddr);
 5666 
 5667     __ add(tmpAddr, coeffs, 128);
 5668     load64shorts(vs1, tmpAddr);
 5669     __ add(tmpAddr, coeffs, 384);
 5670     load64shorts(vs2, tmpAddr);
 5671     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5672     vs_subv(vs1, __ T8H, vs1, vs2);
 5673     __ add(tmpAddr, coeffs, 128);
 5674     store64shorts(vs3, tmpAddr);
 5675     load64shorts(vs2, zetas);
 5676     vs_ldpq(vq, kyberConsts);
 5677     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5678     __ add(tmpAddr, coeffs, 384);
 5679     store64shorts(vs2, tmpAddr);
 5680 
 5681     // multiply by 2^-n
 5682 
 5683     // load toMont(2^-n mod q)
 5684     __ add(tmpAddr, kyberConsts, 48);
 5685     __ ldr(v29, __ Q, tmpAddr);
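          // n.b. by the Montgomery identity, for any coefficient c
          //   montmul(c, toMont(x)) = c * x * R * R^-1 = c * x (mod q)
          // so with x = 2^-n mod q the four montmul64 calls below simply scale
          // every coefficient by 2^-n mod q, the normalization referred to above.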
 5686 
 5687     vs_ldpq(vq, kyberConsts);
 5688     __ add(tmpAddr, coeffs, 0);
 5689     load64shorts(vs1, tmpAddr);
 5690     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5691     __ add(tmpAddr, coeffs, 0);
 5692     store64shorts(vs2, tmpAddr);
 5693 
 5694     // now tmpAddr contains coeffs + 128 because store64shorts advanced it
 5695     load64shorts(vs1, tmpAddr);
 5696     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5697     __ add(tmpAddr, coeffs, 128);
 5698     store64shorts(vs2, tmpAddr);
 5699 
 5700     // now tmpAddr contains coeffs + 256
 5701     load64shorts(vs1, tmpAddr);
 5702     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5703     __ add(tmpAddr, coeffs, 256);
 5704     store64shorts(vs2, tmpAddr);
 5705 
 5706     // now tmpAddr contains coeffs + 384
 5707     load64shorts(vs1, tmpAddr);
 5708     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5709     __ add(tmpAddr, coeffs, 384);
 5710     store64shorts(vs2, tmpAddr);
 5711 
 5712     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5713     __ mov(r0, zr); // return 0
 5714     __ ret(lr);
 5715 
 5716     return start;
 5717   }
 5718 
 5719   // Kyber multiply polynomials in the NTT domain.
 5720   // Implements
 5721   // static int implKyberNttMult(
 5722   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5723   //
 5724   // result (short[256]) = c_rarg0
 5725   // ntta (short[256]) = c_rarg1
 5726   // nttb (short[256]) = c_rarg2
 5727   // zetas (short[128]) = c_rarg3
 5728   address generate_kyberNttMult() {
 5729 
 5730     __ align(CodeEntryAlignment);
 5731     StubGenStubId stub_id = StubGenStubId::kyberNttMult_id;
 5732     StubCodeMark mark(this, stub_id);
 5733     address start = __ pc();
 5734     __ enter();
 5735 
 5736     const Register result = c_rarg0;
 5737     const Register ntta = c_rarg1;
 5738     const Register nttb = c_rarg2;
 5739     const Register zetas = c_rarg3;
 5740 
 5741     const Register kyberConsts = r10;
 5742     const Register limit = r11;
 5743 
 5744     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 5745     VSeq<4> vs3(16), vs4(20);
 5746     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5747     VSeq<2> vz(28);          // pair of zetas
 5748     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5749 
 5750     __ lea(kyberConsts,
 5751              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5752 
 5753     Label kyberNttMult_loop;
 5754 
 5755     __ add(limit, result, 512);
 5756 
 5757     // load q and qinv
 5758     vs_ldpq(vq, kyberConsts);
 5759 
 5760     // load R^2 mod q (to convert back from Montgomery representation)
 5761     __ add(kyberConsts, kyberConsts, 64);
 5762     __ ldr(v27, __ Q, kyberConsts);
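
          // n.b. each iteration of the loop below multiplies pairs of NTT-domain
          // coefficients modulo (X^2 - zeta), i.e. for a pair (a0, a1) from ntta
          // and (b0, b1) from nttb it computes
          //   (a0 + a1*X) * (b0 + b1*X) = (a0*b0 + a1*b1*zeta) + (a0*b1 + a1*b0)*X
          // with every product realized as a montmul; the final montmul by
          // montRSquareModQ (v27) restores the expected scaling of the result.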
 5763 
 5764     __ BIND(kyberNttMult_loop);
 5765 
 5766     // load 16 zetas
 5767     vs_ldpq_post(vz, zetas);
 5768 
 5769     // load 2 sets of 32 coefficients from the two input arrays
 5770     // interleaved as shorts. i.e. pairs of shorts adjacent in memory
 5771     // are striped across pairs of vector registers
 5772     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5773     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5774     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5775     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 5776 
 5777     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5778     // i.e. montmul the first and second halves of vs1 in order and
 5779     // then with one sequence reversed storing the two results in vs3
 5780     //
 5781     // vs3[0] <- montmul(a0, b0)
 5782     // vs3[1] <- montmul(a1, b1)
 5783     // vs3[2] <- montmul(a0, b1)
 5784     // vs3[3] <- montmul(a1, b0)
 5785     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5786     kyber_montmul16(vs_back(vs3),
 5787                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5788 
 5789     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5790     // i.e. montmul the first and second halves of vs4 in order and
 5791     // then with one sequence reversed storing the two results in vs1
 5792     //
 5793     // vs1[0] <- montmul(a2, b2)
 5794     // vs1[1] <- montmul(a3, b3)
 5795     // vs1[2] <- montmul(a2, b3)
 5796     // vs1[3] <- montmul(a3, b2)
 5797     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5798     kyber_montmul16(vs_back(vs1),
 5799                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5800 
 5801     // montmul the second result of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 5802     // We can schedule two montmuls at a time if we use a suitable vector
 5803     // sequence <vs3[1], vs1[1]>.
 5804     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5805     VSeq<2> vs5(vs3[1], delta);
 5806 
 5807     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5808     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5809     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5810 
 5811     // add results in pairs storing in vs3
 5812     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5813     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5814     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5815 
 5816     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5817     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5818     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 5819 
 5820     // vs1 <- montmul(vs3, montRSquareModQ)
 5821     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5822 
 5823     // store back the two pairs of result vectors de-interleaved as 8H elements
 5824     // i.e. storing each pair of shorts striped across a register pair adjacent
 5825     // in memory
 5826     vs_st2_post(vs1, __ T8H, result);
 5827 
 5828     __ cmp(result, limit);
 5829     __ br(Assembler::NE, kyberNttMult_loop);
 5830 
 5831     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5832     __ mov(r0, zr); // return 0
 5833     __ ret(lr);
 5834 
 5835     return start;
 5836   }
 5837 
 5838   // Kyber add 2 polynomials.
 5839   // Implements
 5840   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5841   //
 5842   // result (short[256]) = c_rarg0
 5843   // a (short[256]) = c_rarg1
 5844   // b (short[256]) = c_rarg2
 5845   address generate_kyberAddPoly_2() {
 5846 
 5847     __ align(CodeEntryAlignment);
 5848     StubGenStubId stub_id = StubGenStubId::kyberAddPoly_2_id;
 5849     StubCodeMark mark(this, stub_id);
 5850     address start = __ pc();
 5851     __ enter();
 5852 
 5853     const Register result = c_rarg0;
 5854     const Register a = c_rarg1;
 5855     const Register b = c_rarg2;
 5856 
 5857     const Register kyberConsts = r11;
 5858 
 5859     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5860     // So, we can load, add and store the data in 3 groups of 11,
 5861     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5862     // registers. A further constraint is that the mapping needs
 5863     // to skip callee saves. So, we allocate the register
 5864     // sequences using two 8 sequences, two 2 sequences and two
 5865     // single registers.
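          //
          // As an illustrative scalar equivalent of the vector loop below
          // (the loop variable and array names here are ad hoc, not part of
          // the stub):
          //
          //   for (int i = 0; i < 256; i++) {
          //     result[i] = (short)(a[i] + b[i] + q); // q loaded from kyberConsts + 16
          //   }
          //
          // i.e. the constant q is folded into every sum; presumably the Java
          // caller allows for this extra q in a subsequent reduction.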
 5866     VSeq<8> vs1_1(0);
 5867     VSeq<2> vs1_2(16);
 5868     FloatRegister vs1_3 = v28;
 5869     VSeq<8> vs2_1(18);
 5870     VSeq<2> vs2_2(26);
 5871     FloatRegister vs2_3 = v29;
 5872 
 5873     // two constant vector sequences
 5874     VSeq<8> vc_1(31, 0);
 5875     VSeq<2> vc_2(31, 0);
 5876 
 5877     FloatRegister vc_3 = v31;
 5878     __ lea(kyberConsts,
 5879              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5880 
 5881     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5882     for (int i = 0; i < 3; i++) {
 5883       // load 80 or 88 values from a into vs1_1/2/3
 5884       vs_ldpq_post(vs1_1, a);
 5885       vs_ldpq_post(vs1_2, a);
 5886       if (i < 2) {
 5887         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5888       }
 5889       // load 80 or 88 values from b into vs2_1/2/3
 5890       vs_ldpq_post(vs2_1, b);
 5891       vs_ldpq_post(vs2_2, b);
 5892       if (i < 2) {
 5893         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5894       }
 5895       // sum 80 or 88 values across vs1 and vs2 into vs1
 5896       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5897       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5898       if (i < 2) {
 5899         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5900       }
 5901       // add constant to all 80 or 88 results
 5902       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5903       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5904       if (i < 2) {
 5905         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5906       }
 5907       // store 80 or 88 values
 5908       vs_stpq_post(vs1_1, result);
 5909       vs_stpq_post(vs1_2, result);
 5910       if (i < 2) {
 5911         __ str(vs1_3, __ Q, __ post(result, 16));
 5912       }
 5913     }
 5914 
 5915     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5916     __ mov(r0, zr); // return 0
 5917     __ ret(lr);
 5918 
 5919     return start;
 5920   }
 5921 
 5922   // Kyber add 3 polynomials.
 5923   // Implements
 5924   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 5925   //
 5926   // result (short[256]) = c_rarg0
 5927   // a (short[256]) = c_rarg1
 5928   // b (short[256]) = c_rarg2
 5929   // c (short[256]) = c_rarg3
 5930   address generate_kyberAddPoly_3() {
 5931 
 5932     __ align(CodeEntryAlignment);
 5933     StubGenStubId stub_id = StubGenStubId::kyberAddPoly_3_id;
 5934     StubCodeMark mark(this, stub_id);
 5935     address start = __ pc();
 5936     __ enter();
 5937 
 5938     const Register result = c_rarg0;
 5939     const Register a = c_rarg1;
 5940     const Register b = c_rarg2;
 5941     const Register c = c_rarg3;
 5942 
 5943     const Register kyberConsts = r11;
 5944 
 5945     // As above we sum 256 sets of values in total i.e. 32 x 8H
 5946     // quadwords.  So, we can load, add and store the data in 3
 5947     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 5948     // of 10 or 11 registers. A further constraint is that the
 5949     // mapping needs to skip callee saves. So, we allocate the
 5950     // register sequences using two 8 sequences, two 2 sequences
 5951     // and two single registers.
 5952     VSeq<8> vs1_1(0);
 5953     VSeq<2> vs1_2(16);
 5954     FloatRegister vs1_3 = v28;
 5955     VSeq<8> vs2_1(18);
 5956     VSeq<2> vs2_2(26);
 5957     FloatRegister vs2_3 = v29;
 5958 
 5959     // two constant vector sequences
 5960     VSeq<8> vc_1(31, 0);
 5961     VSeq<2> vc_2(31, 0);
 5962 
 5963     FloatRegister vc_3 = v31;
 5964 
 5965     __ lea(kyberConsts,
 5966              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5967 
 5968     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5969     for (int i = 0; i < 3; i++) {
 5970       // load 80 or 88 values from a into vs1_1/2/3
 5971       vs_ldpq_post(vs1_1, a);
 5972       vs_ldpq_post(vs1_2, a);
 5973       if (i < 2) {
 5974         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5975       }
 5976       // load 80 or 88 values from b into vs2_1/2/3
 5977       vs_ldpq_post(vs2_1, b);
 5978       vs_ldpq_post(vs2_2, b);
 5979       if (i < 2) {
 5980         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5981       }
 5982       // sum 80 or 88 values across vs1 and vs2 into vs1
 5983       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5984       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5985       if (i < 2) {
 5986         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5987       }
 5988       // load 80 or 88 values from c into vs2_1/2/3
 5989       vs_ldpq_post(vs2_1, c);
 5990       vs_ldpq_post(vs2_2, c);
 5991       if (i < 2) {
 5992         __ ldr(vs2_3, __ Q, __ post(c, 16));
 5993       }
 5994       // sum 80 or 88 values across vs1 and vs2 into vs1
 5995       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5996       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5997       if (i < 2) {
 5998         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5999       }
 6000       // add constant to all 80 or 88 results
 6001       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6002       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6003       if (i < 2) {
 6004         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6005       }
 6006       // store 80 or 88 values
 6007       vs_stpq_post(vs1_1, result);
 6008       vs_stpq_post(vs1_2, result);
 6009       if (i < 2) {
 6010         __ str(vs1_3, __ Q, __ post(result, 16));
 6011       }
 6012     }
 6013 
 6014     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6015     __ mov(r0, zr); // return 0
 6016     __ ret(lr);
 6017 
 6018     return start;
 6019   }
 6020 
 6021   // Kyber parse XOF output to polynomial coefficient candidates
 6022   // or decodePoly(12, ...).
 6023   // Implements
 6024   // static int implKyber12To16(
 6025   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6026   //
 6027   // (parsedLength or (parsedLength - 48) must be divisible by 64.)
 6028   //
 6029   // condensed (byte[]) = c_rarg0
 6030   // condensedIndex = c_rarg1
 6031   // parsed (short[112 or 256]) = c_rarg2
 6032   // parsedLength (112 or 256) = c_rarg3
 6033   address generate_kyber12To16() {
 6034     Label L_F00, L_loop, L_end;
 6035 
 6036     __ BIND(L_F00);
 6037     __ emit_int64(0x0f000f000f000f00);
 6038     __ emit_int64(0x0f000f000f000f00);
 6039 
 6040     __ align(CodeEntryAlignment);
 6041     StubGenStubId stub_id = StubGenStubId::kyber12To16_id;
 6042     StubCodeMark mark(this, stub_id);
 6043     address start = __ pc();
 6044     __ enter();
 6045 
 6046     const Register condensed = c_rarg0;
 6047     const Register condensedOffs = c_rarg1;
 6048     const Register parsed = c_rarg2;
 6049     const Register parsedLength = c_rarg3;
 6050 
 6051     const Register tmpAddr = r11;
 6052 
 6053     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6054     // quadwords so we need a 6 vector sequence for the inputs.
 6055     // Parsing produces 64 shorts, employing two 8 vector
 6056     // sequences to store and combine the intermediate data.
 6057     VSeq<6> vin(24);
 6058     VSeq<8> va(0), vb(16);
 6059 
 6060     __ adr(tmpAddr, L_F00);
 6061     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6062     __ add(condensed, condensed, condensedOffs);
 6063 
 6064     __ BIND(L_loop);
 6065     // load 96 (6 x 16B) byte values
 6066     vs_ld3_post(vin, __ T16B, condensed);
 6067 
 6068     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6069     // holds 48 (16x3) contiguous bytes from memory striped
 6070     // horizontally across each of the 16 byte lanes. Equivalently,
 6071     // that is 16 pairs of 12-bit integers. Likewise the back half
 6072     // holds the next 48 bytes in the same arrangement.
 6073 
 6074     // Each vector in the front half can also be viewed as a vertical
 6075     // strip across the 16 pairs of 12 bit integers. Each byte in
 6076     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6077     // byte in vin[1] stores the high 4 bits of the first int and the
 6078     // low 4 bits of the second int. Each byte in vin[2] stores the
 6079     // high 8 bits of the second int. Likewise the vectors in second
 6080     // half.
 6081 
 6082     // Converting the data to 16-bit shorts requires first of all
 6083     // expanding each of the 6 x 16B vectors into 6 corresponding
 6084     // pairs of 8H vectors. Mask, shift and add operations on the
 6085     // resulting vector pairs can be used to combine 4 and 8 bit
 6086     // parts of related 8H vector elements.
 6087     //
 6088     // The middle vectors (vin[2] and vin[5]) are actually expanded
 6089     // twice, one copy manipulated to provide the lower 4 bits
 6090     // belonging to the first short in a pair and another copy
 6091     // manipulated to provide the higher 4 bits belonging to the
 6092     // second short in a pair. This is why the vector sequences va
 6093     // and vb used to hold the expanded 8H elements are of length 8.
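          //
          // Put differently, as an illustrative scalar model (the names s0, s1
          // and b0..b2 are ad hoc, not part of the stub): each group of three
          // input bytes (b0, b1, b2) decodes to two 12-bit values
          //   s0 = (short)( b0       | ((b1 & 0x0f) << 8));
          //   s1 = (short)((b1 >> 4) | ( b2         << 4));
          // The 0x0f00 mask in v31 and the shifts below implement this split
          // across 16 such byte triples at a time.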
 6094 
 6095     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6096     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6097     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6098     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6099     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6100     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6101     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6102     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6103 
 6104     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6105     // and vb[4:5]
 6106     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6107     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6108     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6109     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6110     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6111     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6112 
 6113     // shift lo byte of copy 1 of the middle stripe into the high byte
 6114     __ shl(va[2], __ T8H, va[2], 8);
 6115     __ shl(va[3], __ T8H, va[3], 8);
 6116     __ shl(vb[2], __ T8H, vb[2], 8);
 6117     __ shl(vb[3], __ T8H, vb[3], 8);
 6118 
 6119     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6120     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6121     // are in bit positions [4..11].
 6122     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6123     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6124     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6125     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6126 
 6127     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6128     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6129     // copy2
 6130     __ andr(va[2], __ T16B, va[2], v31);
 6131     __ andr(va[3], __ T16B, va[3], v31);
 6132     __ ushr(va[4], __ T8H, va[4], 4);
 6133     __ ushr(va[5], __ T8H, va[5], 4);
 6134     __ andr(vb[2], __ T16B, vb[2], v31);
 6135     __ andr(vb[3], __ T16B, vb[3], v31);
 6136     __ ushr(vb[4], __ T8H, vb[4], 4);
 6137     __ ushr(vb[5], __ T8H, vb[5], 4);
 6138 
 6139     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6140     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6141     // n.b. the ordering ensures: i) inputs are consumed before they
 6142     // are overwritten ii) the order of 16-bit results across successive
 6143     // pairs of vectors in va and then vb reflects the order of the
 6144     // corresponding 12-bit inputs
 6145     __ addv(va[0], __ T8H, va[0], va[2]);
 6146     __ addv(va[2], __ T8H, va[1], va[3]);
 6147     __ addv(va[1], __ T8H, va[4], va[6]);
 6148     __ addv(va[3], __ T8H, va[5], va[7]);
 6149     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6150     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6151     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6152     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6153 
 6154     // store 64 results interleaved as shorts
 6155     vs_st2_post(vs_front(va), __ T8H, parsed);
 6156     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6157 
 6158     __ sub(parsedLength, parsedLength, 64);
 6159     __ cmp(parsedLength, (u1)64);
 6160     __ br(Assembler::GE, L_loop);
 6161     __ cbz(parsedLength, L_end);
 6162 
 6163     // If anything is left it should be a final 72 bytes of input,
 6164     // i.e. a final 48 12-bit values. We handle this by loading 48
 6165     // bytes into all 16B lanes of front(vin) and only 24 bytes into
 6166     // the lower 8B lanes of back(vin).
 6167     vs_ld3_post(vs_front(vin), __ T16B, condensed);
 6168     vs_ld3(vs_back(vin), __ T8B, condensed);
 6169 
 6170     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6171     // n.b. target elements 2 and 3 of va duplicate elements 4 and
 6172     // 5 and target element 2 of vb duplicates element 4.
 6173     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6174     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6175     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6176     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6177     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6178     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6179 
 6180     // This time expand just the lower 8 lanes
 6181     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6182     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6183     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6184 
 6185     // shift lo byte of copy 1 of the middle stripe into the high byte
 6186     __ shl(va[2], __ T8H, va[2], 8);
 6187     __ shl(va[3], __ T8H, va[3], 8);
 6188     __ shl(vb[2], __ T8H, vb[2], 8);
 6189 
 6190     // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
 6191     // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
 6192     // int are in bit positions [4..11].
 6193     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6194     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6195     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6196 
 6197     // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
 6198     // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
 6199     // copy2
 6200     __ andr(va[2], __ T16B, va[2], v31);
 6201     __ andr(va[3], __ T16B, va[3], v31);
 6202     __ ushr(va[4], __ T8H, va[4], 4);
 6203     __ ushr(va[5], __ T8H, va[5], 4);
 6204     __ andr(vb[2], __ T16B, vb[2], v31);
 6205     __ ushr(vb[4], __ T8H, vb[4], 4);
 6206 
 6209     // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
 6210     // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
 6212     // n.b. ordering ensures: i) inputs are consumed before they are
 6213     // overwritten ii) order of 16-bit results across successive
 6214     // pairs of vectors in va and then lower half of vb reflects order
 6215     // of corresponding 12-bit inputs
 6216     __ addv(va[0], __ T8H, va[0], va[2]);
 6217     __ addv(va[2], __ T8H, va[1], va[3]);
 6218     __ addv(va[1], __ T8H, va[4], va[6]);
 6219     __ addv(va[3], __ T8H, va[5], va[7]);
 6220     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6221     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6222 
 6223     // store 48 results interleaved as shorts
 6224     vs_st2_post(vs_front(va), __ T8H, parsed);
 6225     vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
 6226 
 6227     __ BIND(L_end);
 6228 
 6229     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6230     __ mov(r0, zr); // return 0
 6231     __ ret(lr);
 6232 
 6233     return start;
 6234   }
 6235 
 6236   // Kyber Barrett reduce function.
 6237   // Implements
 6238   // static int implKyberBarrettReduce(short[] coeffs) {}
 6239   //
 6240   // coeffs (short[256]) = c_rarg0
 6241   address generate_kyberBarrettReduce() {
 6242 
 6243     __ align(CodeEntryAlignment);
 6244     StubGenStubId stub_id = StubGenStubId::kyberBarrettReduce_id;
 6245     StubCodeMark mark(this, stub_id);
 6246     address start = __ pc();
 6247     __ enter();
 6248 
 6249     const Register coeffs = c_rarg0;
 6250 
 6251     const Register kyberConsts = r10;
 6252     const Register result = r11;
 6253 
 6254     // As above we process 256 sets of values in total i.e. 32 x
 6255     // 8H quadwords. So, we can load, add and store the data in 3
 6256     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6257     // of 10 or 11 registers. A further constraint is that the
 6258     // mapping needs to skip callee saves. So, we allocate the
 6259     // register sequences using two 8 sequences, two 2 sequences
 6260     // and two single registers.
 6261     VSeq<8> vs1_1(0);
 6262     VSeq<2> vs1_2(16);
 6263     FloatRegister vs1_3 = v28;
 6264     VSeq<8> vs2_1(18);
 6265     VSeq<2> vs2_2(26);
 6266     FloatRegister vs2_3 = v29;
 6267 
 6268     // we also need a pair of corresponding constant sequences
 6269 
 6270     VSeq<8> vc1_1(30, 0);
 6271     VSeq<2> vc1_2(30, 0);
 6272     FloatRegister vc1_3 = v30; // for kyber_q
 6273 
 6274     VSeq<8> vc2_1(31, 0);
 6275     VSeq<2> vc2_2(31, 0);
 6276     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6277 
 6278     __ add(result, coeffs, 0);
 6279     __ lea(kyberConsts,
 6280              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6281 
 6282     // load q and the multiplier for the Barrett reduction
 6283     __ add(kyberConsts, kyberConsts, 16);
 6284     __ ldpq(vc1_3, vc2_3, kyberConsts);
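
          // Per 16-bit lane the loop below computes, as an illustrative scalar
          // sketch (c and t are ad hoc names):
          //   int t = ((int)c * barrettMultiplier) >> 26; // multiplier ~ 2^26 / q
          //   c = (short)(c - t * q);                     // q = 3329
          // realized with sqdmulh (doubling high half), sshr #11 and mls.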
 6285 
 6286     for (int i = 0; i < 3; i++) {
 6287       // load 80 or 88 coefficients
 6288       vs_ldpq_post(vs1_1, coeffs);
 6289       vs_ldpq_post(vs1_2, coeffs);
 6290       if (i < 2) {
 6291         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6292       }
 6293 
 6294       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6295       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6296       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6297       if (i < 2) {
 6298         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6299       }
 6300 
 6301       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6302       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6303       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6304       if (i < 2) {
 6305         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6306       }
 6307 
 6308       // vs1 <- vs1 - vs2 * kyber_q
 6309       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6310       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6311       if (i < 2) {
 6312         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6313       }
 6314 
 6315       vs_stpq_post(vs1_1, result);
 6316       vs_stpq_post(vs1_2, result);
 6317       if (i < 2) {
 6318         __ str(vs1_3, __ Q, __ post(result, 16));
 6319       }
 6320     }
 6321 
 6322     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6323     __ mov(r0, zr); // return 0
 6324     __ ret(lr);
 6325 
 6326     return start;
 6327   }
 6328 
 6329 
 6330   // Dilithium-specific montmul helper routines that generate parallel
 6331   // code for, respectively, a single 4x4s vector sequence montmul or
 6332   // two such multiplies in a row.
 6333 
 6334   // Perform 16 32-bit Montgomery multiplications in parallel
 6335   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6336                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6337     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6338     // It will assert that the register use is valid
 6339     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6340   }
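
        // For reference, the per-lane scalar operation behind these montmul
        // helpers is, in one common signed Montgomery formulation (assuming
        // qinv = q^-1 mod 2^32 as suggested by the "qInv, q" comments; the
        // vector code may differ in sign and rounding details):
        //
        //   int64_t t = (int64_t)a * b;
        //   int32_t m = (int32_t)t * qinv;                      // m = t * q^-1 mod 2^32
        //   int32_t r = (int32_t)((t - (int64_t)m * q) >> 32);  // r = a * b * 2^-32 mod q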
 6341 
 6342   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6343   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6344                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6345     // Schedule two successive 4x4S multiplies via the montmul helper
 6346     // on the front and back halves of va, vb and vc. The helper will
 6347     // assert that the register use has no overlap conflicts on each
 6348     // individual call but we also need to ensure that the necessary
 6349     // disjoint/equality constraints are met across both calls.
 6350 
 6351     // vb, vc, vtmp and vq must be disjoint. va must either be
 6352     // disjoint from all other registers or equal vc
 6353 
 6354     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6355     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6356     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6357 
 6358     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6359     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6360 
 6361     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6362 
 6363     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6364     assert(vs_disjoint(va, vb), "va and vb overlap");
 6365     assert(vs_disjoint(va, vq), "va and vq overlap");
 6366     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6367 
 6368     // We multiply the front and back halves of each sequence 4 at a
 6369     // time because
 6370     //
 6371     // 1) we are currently only able to get 4-way instruction
 6372     // parallelism at best
 6373     //
 6374     // 2) we need registers for the constants in vq and temporary
 6375     // scratch registers to hold intermediate results so vtmp can only
 6376     // be a VSeq<4> which means we only have 4 scratch slots.
 6377 
 6378     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6379     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6380   }
 6381 
 6382   // Perform combined montmul then add/sub on 4x4S vectors.
 6383   void dilithium_montmul16_sub_add(
 6384           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6385           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6386     // compute a = montmul(a1, c)
 6387     dilithium_montmul16(vc, va1, vc, vtmp, vq);
 6388     // output a1 = a0 - a
 6389     vs_subv(va1, __ T4S, va0, vc);
 6390     //    and a0 = a0 + a
 6391     vs_addv(va0, __ T4S, va0, vc);
 6392   }
 6393 
 6394   // Perform combined add/sub then montmul on 4x4S vectors.
 6395   void dilithium_sub_add_montmul16(
 6396           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6397           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6398     // compute c = a0 - a1
 6399     vs_subv(vtmp1, __ T4S, va0, va1);
 6400     // output a0 = a0 + a1
 6401     vs_addv(va0, __ T4S, va0, va1);
 6402     // output a1 = b montmul c
 6403     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6404   }
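
        // Taken together these two helpers are, in effect, the forward
        // (Cooley-Tukey) and inverse (Gentleman-Sande) NTT butterfly steps:
        //   montmul16_sub_add:  (a0, a1) -> (a0 + a1*c, a0 - a1*c)
        //   sub_add_montmul16:  (a0, a1) -> (a0 + a1, (a0 - a1)*b)
        // where the products are Montgomery multiplies.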
 6405 
 6406   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6407   // in the Java implementation come in sequences of at least 8, so we
 6408   // can use ldpq to collect the corresponding data into pairs of vector
 6409   // registers.
 6410   // We collect the coefficients corresponding to the 'j+l' indexes into
 6411   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6412   // then we do the (Montgomery) multiplications by the zetas in parallel
 6413   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6414   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6415   // v0-v7 and finally save the results back to the coeffs array.
 6416   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6417     const Register coeffs, const Register zetas) {
 6418     int c1 = 0;
 6419     int c2 = 512;
 6420     int startIncr;
 6421     // don't use callee save registers v8 - v15
 6422     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6423     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6424     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6425     int offsets[4] = { 0, 32, 64, 96 };
 6426 
 6427     for (int level = 0; level < 5; level++) {
 6428       int c1Start = c1;
 6429       int c2Start = c2;
 6430       if (level == 3) {
 6431         offsets[1] = 32;
 6432         offsets[2] = 128;
 6433         offsets[3] = 160;
 6434       } else if (level == 4) {
 6435         offsets[1] = 64;
 6436         offsets[2] = 128;
 6437         offsets[3] = 192;
 6438       }
 6439 
 6440       // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
 6441       // time at 4 different offsets and multiply them in order by the
 6442       // next set of input values. So we employ indexed load and store
 6443       // pair instructions with arrangement 4S.
 6444       for (int i = 0; i < 4; i++) {
 6445         // reload q and qinv
 6446         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6447         // load 8x4S coefficients via second start pos == c2
 6448         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6449         // load next 8x4S inputs == b
 6450         vs_ldpq_post(vs2, zetas);
 6451         // compute a == c2 * b mod MONT_Q
 6452         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6453         // load 8x4s coefficients via first start pos == c1
 6454         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6455         // compute a1 =  c1 + a
 6456         vs_addv(vs3, __ T4S, vs1, vs2);
 6457         // compute a2 =  c1 - a
 6458         vs_subv(vs1, __ T4S, vs1, vs2);
 6459         // output a1 and a2
 6460         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6461         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6462 
 6463         int k = 4 * level + i;
 6464 
 6465         if (k > 7) {
 6466           startIncr = 256;
 6467         } else if (k == 5) {
 6468           startIncr = 384;
 6469         } else {
 6470           startIncr = 128;
 6471         }
 6472 
 6473         c1Start += startIncr;
 6474         c2Start += startIncr;
 6475       }
 6476 
 6477       c2 /= 2;
 6478     }
 6479   }
 6480 
 6481   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6482   // Implements the method
 6483   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
 6484   // of the Java class sun.security.provider.ML_DSA
 6485   //
 6486   // coeffs (int[256]) = c_rarg0
 6487   // zetas (int[256]) = c_rarg1
 6488   address generate_dilithiumAlmostNtt() {
 6489 
 6490     __ align(CodeEntryAlignment);
 6491     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
 6492     StubCodeMark mark(this, stub_id);
 6493     address start = __ pc();
 6494     __ enter();
 6495 
 6496     const Register coeffs = c_rarg0;
 6497     const Register zetas = c_rarg1;
 6498 
 6499     const Register tmpAddr = r9;
 6500     const Register dilithiumConsts = r10;
 6501     const Register result = r11;
 6502     // don't use callee save registers v8 - v15
 6503     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6504     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6505     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6506     int offsets[4] = { 0, 32, 64, 96};
 6507     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6508     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6509     __ add(result, coeffs, 0);
 6510     __ lea(dilithiumConsts,
 6511              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6512 
 6513     // Each level represents one iteration of the outer for loop of the Java version.
 6514 
 6515     // level 0-4
 6516     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6517 
 6518     // level 5
 6519 
 6520     // At level 5 the coefficients we need to combine with the zetas
 6521     // are grouped in memory in blocks of size 4. So, for both sets of
 6522     // coefficients we load 4 adjacent values at 8 different offsets
 6523     // using an indexed ldr with register variant Q and multiply them
 6524     // in sequence order by the next set of inputs. Likewise we store
 6525     // the results using an indexed str with register variant Q.
 6526     for (int i = 0; i < 1024; i += 256) {
 6527       // reload constants q, qinv each iteration as they get clobbered later
 6528       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6529       // load 32 (8x4S) coefficients via first offsets = c1
 6530       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6531       // load next 32 (8x4S) inputs = b
 6532       vs_ldpq_post(vs2, zetas);
 6533       // a = b montmul c1
 6534       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6535       // load 32 (8x4S) coefficients via second offsets = c2
 6536       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6537       // add/sub with result of multiply
 6538       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
 6539       vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 6540       // write back new coefficients using same offsets
 6541       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6542       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6543     }
 6544 
 6545     // level 6
 6546     // At level 6 the coefficients we need to combine with the zetas
 6547     // are grouped in memory in pairs, the first two being montmul
 6548     // inputs and the second add/sub inputs. We can still implement
 6549     // the montmul+sub+add using 4-way parallelism but only if we
 6550     // combine the coefficients with the zetas 16 at a time. We load 8
 6551     // adjacent values at 4 different offsets using an ld2 load with
 6552     // arrangement 2D. That interleaves the lower and upper halves of
 6553     // each pair of quadwords into successive vector registers. We
 6554     // then need to montmul the 4 even elements of the coefficients
 6555     // register sequence by the zetas in order and then add/sub the 4
 6556     // odd elements of the coefficients register sequence. We use an
 6557     // equivalent st2 operation to store the results back into memory
 6558     // de-interleaved.
 6559     for (int i = 0; i < 1024; i += 128) {
 6560       // reload constants q, qinv each iteration as they get clobbered later
 6561       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6562       // load interleaved 32 (8x2D) coefficients via offsets
 6563       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6564       // load next 16 (4x4S) inputs
 6565       vs_ldpq_post(vs_front(vs2), zetas);
 6566       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6567       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6568                                   vs_front(vs2), vtmp, vq);
 6569       // store interleaved 32 (8x2D) coefficients via offsets
 6570       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6571     }
 6572 
 6573     // level 7
 6574     // At level 7 the coefficients we need to combine with the zetas
 6575     // occur singly with montmul inputs alternating with add/sub
 6576     // inputs. Once again we can use 4-way parallelism to combine 16
 6577     // zetas at a time. However, we have to load 8 adjacent values at
 6578     // 4 different offsets using an ld2 load with arrangement 4S. That
 6579     // interleaves the odd words of each pair into one
 6580     // coefficients vector register and the even words of the pair
 6581     // into the next register. We then need to montmul the 4 even
 6582     // elements of the coefficients register sequence by the zetas in
 6583     // order and then add/sub the 4 odd elements of the coefficients
 6584     // register sequence. We use an equivalent st2 operation to store
 6585     // the results back into memory de-interleaved.
 6586 
 6587     for (int i = 0; i < 1024; i += 128) {
 6588       // reload constants q, qinv each iteration as they get clobbered later
 6589       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6590       // load interleaved 32 (8x4S) coefficients via offsets
 6591       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6592       // load next 16 (4x4S) inputs
 6593       vs_ldpq_post(vs_front(vs2), zetas);
 6594       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6595       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6596                                   vs_front(vs2), vtmp, vq);
 6597       // store interleaved 32 (8x4S) coefficients via offsets
 6598       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6599     }
 6600     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6601     __ mov(r0, zr); // return 0
 6602     __ ret(lr);
 6603 
 6604     return start;
 6605   }
 6606 
 6607   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6608   // in the Java implementation come in sequences of at least 8, so we
 6609   // can use ldpq to collect the corresponding data into pairs of vector
 6610   // registers
 6611   // We collect the coefficients that correspond to the 'j's into vs1 and
 6612   // the coefficients that correspond to the 'j+l's into vs2, then we
 6613   // do the additions into vs3 and the subtractions into vs1, then we
 6614   // save the result of the additions, load the zetas into vs2,
 6615   // do the (Montgomery) multiplications by zeta in parallel into vs2 and
 6616   // finally save the results back to the coeffs array.
 6617   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6618     const Register coeffs, const Register zetas) {
 6619     int c1 = 0;
 6620     int c2 = 32;
 6621     int startIncr;
 6622     int offsets[4];
 6623     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6624     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6625     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6626 
 6627     offsets[0] = 0;
 6628 
 6629     for (int level = 3; level < 8; level++) {
 6630       int c1Start = c1;
 6631       int c2Start = c2;
 6632       if (level == 3) {
 6633         offsets[1] = 64;
 6634         offsets[2] = 128;
 6635         offsets[3] = 192;
 6636       } else if (level == 4) {
 6637         offsets[1] = 32;
 6638         offsets[2] = 128;
 6639         offsets[3] = 160;
 6640       } else {
 6641         offsets[1] = 32;
 6642         offsets[2] = 64;
 6643         offsets[3] = 96;
 6644       }
 6645 
 6646       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6647       // time at 4 different offsets and multiply them in order by the
 6648       // next set of input values. So we employ indexed load and store
 6649       // pair instructions with arrangement 4S.
 6650       for (int i = 0; i < 4; i++) {
 6651         // load v1 32 (8x4S) coefficients relative to first start index
 6652         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6653         // load v2 32 (8x4S) coefficients relative to second start index
 6654         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 6655         // a0 = v1 + v2 -- n.b. clobbers vq
 6656         vs_addv(vs3, __ T4S, vs1, vs2);
 6657         // a1 = v1 - v2
 6658         vs_subv(vs1, __ T4S, vs1, vs2);
 6659         // save a1 relative to first start index
 6660         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6661         // load constants q, qinv each iteration as they get clobbered above
 6662         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6663         // load b next 32 (8x4S) inputs
 6664         vs_ldpq_post(vs2, zetas);
 6665         // a = a1 montmul b
 6666         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6667         // save a relative to second start index
 6668         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6669 
 6670         int k = 4 * level + i;
 6671 
 6672         if (k < 24) {
 6673           startIncr = 256;
 6674         } else if (k == 25) {
 6675           startIncr = 384;
 6676         } else {
 6677           startIncr = 128;
 6678         }
 6679 
 6680         c1Start += startIncr;
 6681         c2Start += startIncr;
 6682       }
 6683 
 6684       c2 *= 2;
 6685     }
 6686   }
 6687 
 6688   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6689   // Implements the method
 6690   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6691   // the sun.security.provider.ML_DSA class.
 6692   //
 6693   // coeffs (int[256]) = c_rarg0
 6694   // zetas (int[256]) = c_rarg1
 6695   address generate_dilithiumAlmostInverseNtt() {
 6696 
 6697     __ align(CodeEntryAlignment);
 6698     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id;
 6699     StubCodeMark mark(this, stub_id);
 6700     address start = __ pc();
 6701     __ enter();
 6702 
 6703     const Register coeffs = c_rarg0;
 6704     const Register zetas = c_rarg1;
 6705 
 6706     const Register tmpAddr = r9;
 6707     const Register dilithiumConsts = r10;
 6708     const Register result = r11;
 6709     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6711     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6712     int offsets[4] = { 0, 32, 64, 96 };
 6713     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6714     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6715 
 6716     __ add(result, coeffs, 0);
 6717     __ lea(dilithiumConsts,
 6718              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6719 
 6720     // Each level represents one iteration of the outer for loop of the Java version
 6721 
 6722     // level 0
 6723     // At level 0 we need to interleave adjacent quartets of
 6724     // coefficients before we multiply and add/sub by the next 16
 6725     // zetas just as we did for level 7 in the multiply code. So we
 6726     // load and store the values using an ld2/st2 with arrangement 4S.
 6727     for (int i = 0; i < 1024; i += 128) {
 6728       // load constants q, qinv
 6729       // n.b. this can be moved out of the loop as they do not get
      // clobbered by the first two loops
 6731       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6732       // a0/a1 load interleaved 32 (8x4S) coefficients
 6733       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6734       // b load next 32 (8x4S) inputs
 6735       vs_ldpq_post(vs_front(vs2), zetas);
 6736       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6737       // n.b. second half of vs2 provides temporary register storage
 6738       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6739                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6740       // a0/a1 store interleaved 32 (8x4S) coefficients
 6741       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6742     }
 6743 
 6744     // level 1
 6745     // At level 1 we need to interleave pairs of adjacent pairs of
 6746     // coefficients before we multiply by the next 16 zetas just as we
 6747     // did for level 6 in the multiply code. So we load and store the
    // values using an ld2/st2 with arrangement 2D.
 6749     for (int i = 0; i < 1024; i += 128) {
 6750       // a0/a1 load interleaved 32 (8x2D) coefficients
 6751       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6752       // b load next 16 (4x4S) inputs
 6753       vs_ldpq_post(vs_front(vs2), zetas);
 6754       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6755       // n.b. second half of vs2 provides temporary register storage
 6756       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6757                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6758       // a0/a1 store interleaved 32 (8x2D) coefficients
 6759       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6760     }
 6761 
 6762     // level 2
 6763     // At level 2 coefficients come in blocks of 4. So, we load 4
 6764     // adjacent coefficients at 8 distinct offsets for both the first
 6765     // and second coefficient sequences, using an ldr with register
 6766     // variant Q then combine them with next set of 32 zetas. Likewise
 6767     // we store the results using an str with register variant Q.
 6768     for (int i = 0; i < 1024; i += 256) {
 6769       // c0 load 32 (8x4S) coefficients via first offsets
 6770       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6771       // c1 load 32 (8x4S) coefficients via second offsets
      vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6773       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6774       vs_addv(vs3, __ T4S, vs1, vs2);
 6775       // c = c0 - c1
 6776       vs_subv(vs1, __ T4S, vs1, vs2);
 6777       // store a0 32 (8x4S) coefficients via first offsets
 6778       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6779       // b load 32 (8x4S) next inputs
 6780       vs_ldpq_post(vs2, zetas);
 6781       // reload constants q, qinv -- they were clobbered earlier
 6782       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6783       // compute a1 = b montmul c
 6784       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6785       // store a1 32 (8x4S) coefficients via second offsets
 6786       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6787     }
 6788 
 6789     // level 3-7
 6790     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6791 
 6792     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6793     __ mov(r0, zr); // return 0
 6794     __ ret(lr);
 6795 
 6796     return start;
 6797   }
 6798 
 6799   // Dilithium multiply polynomials in the NTT domain.
 6800   // Straightforward implementation of the method
 6801   // static int implDilithiumNttMult(
  //              int[] result, int[] ntta, int[] nttb) {} of
 6803   // the sun.security.provider.ML_DSA class.
 6804   //
 6805   // result (int[256]) = c_rarg0
 6806   // poly1 (int[256]) = c_rarg1
 6807   // poly2 (int[256]) = c_rarg2
 6808   address generate_dilithiumNttMult() {
 6809 
    __ align(CodeEntryAlignment);
 6811     StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id;
 6812     StubCodeMark mark(this, stub_id);
 6813     address start = __ pc();
 6814     __ enter();
 6815 
 6816     Label L_loop;
 6817 
 6818     const Register result = c_rarg0;
 6819     const Register poly1 = c_rarg1;
 6820     const Register poly2 = c_rarg2;
 6821 
 6822     const Register dilithiumConsts = r10;
 6823     const Register len = r11;
 6824 
 6825     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6827     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6828     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6829 
 6830     __ lea(dilithiumConsts,
 6831              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6832 
 6833     // load constants q, qinv
 6834     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6835     // load constant rSquare into v29
 6836     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
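    // Note on the arithmetic (a sketch, assuming montmul(a, b) computes
    // a * b * R^-1 mod q with R = 2^32 and rSquare == R^2 mod q): the two
    // montmuls in the loop below compose as
    //   montmul(rSquare, montmul(b, c)) = R^2 * (b * c * R^-1) * R^-1
    //                                   = b * c  (mod q)
    // i.e. the multiplication by rSquare cancels the extra R^-1 factor
    // introduced by the first Montgomery multiplication.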
 6837 
 6838     __ mov(len, zr);
 6839     __ add(len, len, 1024);
 6840 
 6841     __ BIND(L_loop);
 6842 
 6843     // b load 32 (8x4S) next inputs from poly1
 6844     vs_ldpq_post(vs1, poly1);
 6845     // c load 32 (8x4S) next inputs from poly2
 6846     vs_ldpq_post(vs2, poly2);
 6847     // compute a = b montmul c
 6848     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6849     // compute a = rsquare montmul a
 6850     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6851     // save a 32 (8x4S) results
 6852     vs_stpq_post(vs2, result);
 6853 
 6854     __ sub(len, len, 128);
 6855     __ cmp(len, (u1)128);
 6856     __ br(Assembler::GE, L_loop);
 6857 
 6858     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6859     __ mov(r0, zr); // return 0
 6860     __ ret(lr);
 6861 
 6862     return start;
 6863   }
 6864 
  // Dilithium Montgomery multiply an array by a constant.
 6866   // A straightforward implementation of the method
 6867   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  // of the sun.security.provider.ML_DSA class
 6869   //
 6870   // coeffs (int[256]) = c_rarg0
 6871   // constant (int) = c_rarg1
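  //
  // A sketch of the effect (assuming montMul(a, b) = a * b * R^-1 mod q
  // with R = 2^32): the stub computes, in place (result aliases coeffs),
  //   coeffs[i] = montMul(coeffs[i], constant)   for i = 0 .. 255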
 6872   address generate_dilithiumMontMulByConstant() {
 6873 
 6874     __ align(CodeEntryAlignment);
 6875     StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id;
 6876     StubCodeMark mark(this, stub_id);
 6877     address start = __ pc();
 6878     __ enter();
 6879 
 6880     Label L_loop;
 6881 
 6882     const Register coeffs = c_rarg0;
 6883     const Register constant = c_rarg1;
 6884 
 6885     const Register dilithiumConsts = r10;
 6886     const Register result = r11;
 6887     const Register len = r12;
 6888 
 6889     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6890     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6891     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6892     VSeq<8> vconst(29, 0);             // for montmul by constant
 6893 
 6894     // results track inputs
 6895     __ add(result, coeffs, 0);
 6896     __ lea(dilithiumConsts,
 6897              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6898 
    // load constants q, qinv -- they are not clobbered in the loop below
 6900     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6901     // copy caller supplied constant across vconst
 6902     __ dup(vconst[0], __ T4S, constant);
 6903     __ mov(len, zr);
 6904     __ add(len, len, 1024);
 6905 
 6906     __ BIND(L_loop);
 6907 
 6908     // load next 32 inputs
 6909     vs_ldpq_post(vs2, coeffs);
 6910     // mont mul by constant
 6911     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6912     // write next 32 results
 6913     vs_stpq_post(vs2, result);
 6914 
 6915     __ sub(len, len, 128);
 6916     __ cmp(len, (u1)128);
 6917     __ br(Assembler::GE, L_loop);
 6918 
 6919     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6920     __ mov(r0, zr); // return 0
 6921     __ ret(lr);
 6922 
 6923     return start;
 6924   }
 6925 
 6926   // Dilithium decompose poly.
 6927   // Implements the method
  // static int implDilithiumDecomposePoly(
  //              int[] input, int[] lowPart, int[] highPart,
  //              int twoGamma2, int multiplier) {}
 6929   // of the sun.security.provider.ML_DSA class
 6930   //
 6931   // input (int[256]) = c_rarg0
 6932   // lowPart (int[256]) = c_rarg1
 6933   // highPart (int[256]) = c_rarg2
 6934   // twoGamma2  (int) = c_rarg3
 6935   // multiplier (int) = c_rarg4
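  //
  // A sketch of the intent (following the FIPS 204 Decompose routine, not
  // a quote of the Java source): each reduced input rplus is split so that,
  // roughly,
  //   rplus == highPart[m] * twoGamma2 + lowPart[m]
  // with lowPart[m] in (-gamma2, gamma2], plus the special case
  // highPart[m] = 0, lowPart[m] -= 1 when rplus - lowPart[m] == q - 1.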
 6936   address generate_dilithiumDecomposePoly() {
 6937 
 6938     __ align(CodeEntryAlignment);
 6939     StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id;
 6940     StubCodeMark mark(this, stub_id);
 6941     address start = __ pc();
 6942     Label L_loop;
 6943 
 6944     const Register input = c_rarg0;
 6945     const Register lowPart = c_rarg1;
 6946     const Register highPart = c_rarg2;
 6947     const Register twoGamma2 = c_rarg3;
 6948     const Register multiplier = c_rarg4;
 6949 
 6950     const Register len = r9;
 6951     const Register dilithiumConsts = r10;
 6952     const Register tmp = r11;
 6953 
 6954     // 6 independent sets of 4x4s values
 6955     VSeq<4> vs1(0), vs2(4), vs3(8);
 6956     VSeq<4> vs4(12), vs5(16), vtmp(20);
 6957 
 6958     // 7 constants for cross-multiplying
 6959     VSeq<4> one(25, 0);
 6960     VSeq<4> qminus1(26, 0);
 6961     VSeq<4> g2(27, 0);
 6962     VSeq<4> twog2(28, 0);
 6963     VSeq<4> mult(29, 0);
 6964     VSeq<4> q(30, 0);
 6965     VSeq<4> qadd(31, 0);
 6966 
 6967     __ enter();
 6968 
 6969     __ lea(dilithiumConsts,
 6970              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6971 
 6972     // save callee-saved registers
 6973     __ stpd(v8, v9, __ pre(sp, -64));
 6974     __ stpd(v10, v11, Address(sp, 16));
 6975     __ stpd(v12, v13, Address(sp, 32));
 6976     __ stpd(v14, v15, Address(sp, 48));
 6977 
 6978     // populate constant registers
 6979     __ mov(tmp, zr);
 6980     __ add(tmp, tmp, 1);
 6981     __ dup(one[0], __ T4S, tmp); // 1
 6982     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 6983     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 6984     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 6985     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 6986     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 6987     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 6988 
 6989     __ mov(len, zr);
 6990     __ add(len, len, 1024);
 6991 
 6992     __ BIND(L_loop);
 6993 
 6994     // load next 4x4S inputs interleaved: rplus --> vs1
 6995     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 6996 
 6997     //  rplus = rplus - ((rplus + qadd) >> 23) * q
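    // (Presumably a Barrett-style reduction: q = 8380417 = 2^23 - 2^13 + 1,
    // so with a suitable rounding addend qadd the shift by 23 approximates
    // division by q, leaving a value congruent to the input mod q; the next
    // step below folds a negative result back into [0, q).)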
 6998     vs_addv(vtmp, __ T4S, vs1, qadd);
 6999     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7000     vs_mulv(vtmp, __ T4S, vtmp, q);
 7001     vs_subv(vs1, __ T4S, vs1, vtmp);
 7002 
 7003     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7004     vs_sshr(vtmp, __ T4S, vs1, 31);
 7005     vs_andr(vtmp, vtmp, q);
 7006     vs_addv(vs1, __ T4S, vs1, vtmp);
 7007 
 7008     // quotient --> vs2
 7009     // int quotient = (rplus * multiplier) >> 22;
 7010     vs_mulv(vtmp, __ T4S, vs1, mult);
 7011     vs_sshr(vs2, __ T4S, vtmp, 22);
 7012 
 7013     // r0 --> vs3
 7014     // int r0 = rplus - quotient * twoGamma2;
 7015     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7016     vs_subv(vs3, __ T4S, vs1, vtmp);
 7017 
 7018     // mask --> vs4
 7019     // int mask = (twoGamma2 - r0) >> 22;
 7020     vs_subv(vtmp, __ T4S, twog2, vs3);
 7021     vs_sshr(vs4, __ T4S, vtmp, 22);
 7022 
 7023     // r0 -= (mask & twoGamma2);
 7024     vs_andr(vtmp, vs4, twog2);
 7025     vs_subv(vs3, __ T4S, vs3, vtmp);
 7026 
 7027     //  quotient += (mask & 1);
 7028     vs_andr(vtmp, vs4, one);
 7029     vs_addv(vs2, __ T4S, vs2, vtmp);
 7030 
 7031     // mask = (twoGamma2 / 2 - r0) >> 31;
 7032     vs_subv(vtmp, __ T4S, g2, vs3);
 7033     vs_sshr(vs4, __ T4S, vtmp, 31);
 7034 
 7035     // r0 -= (mask & twoGamma2);
 7036     vs_andr(vtmp, vs4, twog2);
 7037     vs_subv(vs3, __ T4S, vs3, vtmp);
 7038 
 7039     // quotient += (mask & 1);
 7040     vs_andr(vtmp, vs4, one);
 7041     vs_addv(vs2, __ T4S, vs2, vtmp);
 7042 
 7043     // r1 --> vs5
 7044     // int r1 = rplus - r0 - (dilithium_q - 1);
 7045     vs_subv(vtmp, __ T4S, vs1, vs3);
 7046     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7047 
 7048     // r1 --> vs1 (overwriting rplus)
 7049     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7050     vs_negr(vtmp, __ T4S, vs5);
 7051     vs_orr(vtmp, vs5, vtmp);
 7052     vs_sshr(vs1, __ T4S, vtmp, 31);
 7053 
 7054     // r0 += ~r1;
 7055     vs_notr(vtmp, vs1);
 7056     vs_addv(vs3, __ T4S, vs3, vtmp);
 7057 
 7058     // r1 = r1 & quotient;
 7059     vs_andr(vs1, vs2, vs1);
 7060 
    // store results interleaved
 7062     // lowPart[m] = r0;
 7063     // highPart[m] = r1;
 7064     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7065     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7066 
 7067     __ sub(len, len, 64);
 7068     __ cmp(len, (u1)64);
 7069     __ br(Assembler::GE, L_loop);
 7070 
 7071     // restore callee-saved vector registers
 7072     __ ldpd(v14, v15, Address(sp, 48));
 7073     __ ldpd(v12, v13, Address(sp, 32));
 7074     __ ldpd(v10, v11, Address(sp, 16));
 7075     __ ldpd(v8, v9, __ post(sp, 64));
 7076 
 7077     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7078     __ mov(r0, zr); // return 0
 7079     __ ret(lr);
 7080 
 7081     return start;
 7082   }
 7083 
 7084   /**
 7085    *  Arguments:
 7086    *
 7087    * Inputs:
 7088    *   c_rarg0   - int crc
 7089    *   c_rarg1   - byte* buf
 7090    *   c_rarg2   - int length
 7091    *
 7092    * Output:
   *       r0   - int crc result
 7094    */
 7095   address generate_updateBytesCRC32() {
 7096     assert(UseCRC32Intrinsics, "what are we doing here?");
 7097 
 7098     __ align(CodeEntryAlignment);
 7099     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
 7100     StubCodeMark mark(this, stub_id);
 7101 
 7102     address start = __ pc();
 7103 
 7104     const Register crc   = c_rarg0;  // crc
 7105     const Register buf   = c_rarg1;  // source java byte array address
 7106     const Register len   = c_rarg2;  // length
 7107     const Register table0 = c_rarg3; // crc_table address
 7108     const Register table1 = c_rarg4;
 7109     const Register table2 = c_rarg5;
 7110     const Register table3 = c_rarg6;
 7111     const Register tmp3 = c_rarg7;
 7112 
 7113     BLOCK_COMMENT("Entry:");
 7114     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7115 
 7116     __ kernel_crc32(crc, buf, len,
 7117               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7118 
 7119     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7120     __ ret(lr);
 7121 
 7122     return start;
 7123   }
 7124 
 7125   /**
 7126    *  Arguments:
 7127    *
 7128    * Inputs:
 7129    *   c_rarg0   - int crc
 7130    *   c_rarg1   - byte* buf
 7131    *   c_rarg2   - int length
 7132    *   c_rarg3   - int* table
 7133    *
 7134    * Output:
 7135    *       r0   - int crc result
 7136    */
 7137   address generate_updateBytesCRC32C() {
 7138     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7139 
 7140     __ align(CodeEntryAlignment);
 7141     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id;
 7142     StubCodeMark mark(this, stub_id);
 7143 
 7144     address start = __ pc();
 7145 
 7146     const Register crc   = c_rarg0;  // crc
 7147     const Register buf   = c_rarg1;  // source java byte array address
 7148     const Register len   = c_rarg2;  // length
 7149     const Register table0 = c_rarg3; // crc_table address
 7150     const Register table1 = c_rarg4;
 7151     const Register table2 = c_rarg5;
 7152     const Register table3 = c_rarg6;
 7153     const Register tmp3 = c_rarg7;
 7154 
 7155     BLOCK_COMMENT("Entry:");
 7156     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7157 
 7158     __ kernel_crc32c(crc, buf, len,
 7159               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7160 
 7161     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7162     __ ret(lr);
 7163 
 7164     return start;
 7165   }
 7166 
 7167   /***
 7168    *  Arguments:
 7169    *
 7170    *  Inputs:
 7171    *   c_rarg0   - int   adler
 7172    *   c_rarg1   - byte* buff
 7173    *   c_rarg2   - int   len
 7174    *
 7175    * Output:
 7176    *   c_rarg0   - int adler result
 7177    */
 7178   address generate_updateBytesAdler32() {
 7179     __ align(CodeEntryAlignment);
 7180     StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
 7181     StubCodeMark mark(this, stub_id);
 7182     address start = __ pc();
 7183 
 7184     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7185 
 7186     // Aliases
 7187     Register adler  = c_rarg0;
 7188     Register s1     = c_rarg0;
 7189     Register s2     = c_rarg3;
 7190     Register buff   = c_rarg1;
 7191     Register len    = c_rarg2;
 7192     Register nmax  = r4;
 7193     Register base  = r5;
 7194     Register count = r6;
 7195     Register temp0 = rscratch1;
 7196     Register temp1 = rscratch2;
 7197     FloatRegister vbytes = v0;
 7198     FloatRegister vs1acc = v1;
 7199     FloatRegister vs2acc = v2;
 7200     FloatRegister vtable = v3;
 7201 
 7202     // Max number of bytes we can process before having to take the mod
 7203     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7204     uint64_t BASE = 0xfff1;
 7205     uint64_t NMAX = 0x15B0;
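    // Sanity check of the bound: for n = 5552,
    //   255 * 5552 * 5553 / 2 + 5553 * (BASE - 1) = 4,294,690,200 <= 2^32 - 1,
    // while n = 5553 already gives 4,296,171,735, which would overflow.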
 7206 
 7207     __ mov(base, BASE);
 7208     __ mov(nmax, NMAX);
 7209 
 7210     // Load accumulation coefficients for the upper 16 bits
 7211     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7212     __ ld1(vtable, __ T16B, Address(temp0));
 7213 
 7214     // s1 is initialized to the lower 16 bits of adler
 7215     // s2 is initialized to the upper 16 bits of adler
 7216     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7217     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7218 
 7219     // The pipelined loop needs at least 16 elements for 1 iteration
 7220     // It does check this, but it is more effective to skip to the cleanup loop
 7221     __ cmp(len, (u1)16);
 7222     __ br(Assembler::HS, L_nmax);
 7223     __ cbz(len, L_combine);
 7224 
 7225     __ bind(L_simple_by1_loop);
 7226     __ ldrb(temp0, Address(__ post(buff, 1)));
 7227     __ add(s1, s1, temp0);
 7228     __ add(s2, s2, s1);
 7229     __ subs(len, len, 1);
 7230     __ br(Assembler::HI, L_simple_by1_loop);
 7231 
 7232     // s1 = s1 % BASE
 7233     __ subs(temp0, s1, base);
 7234     __ csel(s1, temp0, s1, Assembler::HS);
 7235 
 7236     // s2 = s2 % BASE
 7237     __ lsr(temp0, s2, 16);
 7238     __ lsl(temp1, temp0, 4);
 7239     __ sub(temp1, temp1, temp0);
 7240     __ add(s2, temp1, s2, ext::uxth);
 7241 
 7242     __ subs(temp0, s2, base);
 7243     __ csel(s2, temp0, s2, Assembler::HS);
 7244 
 7245     __ b(L_combine);
 7246 
 7247     __ bind(L_nmax);
 7248     __ subs(len, len, nmax);
 7249     __ sub(count, nmax, 16);
 7250     __ br(Assembler::LO, L_by16);
 7251 
 7252     __ bind(L_nmax_loop);
 7253 
 7254     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7255                                       vbytes, vs1acc, vs2acc, vtable);
 7256 
 7257     __ subs(count, count, 16);
 7258     __ br(Assembler::HS, L_nmax_loop);
 7259 
 7260     // s1 = s1 % BASE
 7261     __ lsr(temp0, s1, 16);
 7262     __ lsl(temp1, temp0, 4);
 7263     __ sub(temp1, temp1, temp0);
 7264     __ add(temp1, temp1, s1, ext::uxth);
 7265 
 7266     __ lsr(temp0, temp1, 16);
 7267     __ lsl(s1, temp0, 4);
 7268     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 7270 
 7271     __ subs(temp0, s1, base);
 7272     __ csel(s1, temp0, s1, Assembler::HS);
 7273 
 7274     // s2 = s2 % BASE
 7275     __ lsr(temp0, s2, 16);
 7276     __ lsl(temp1, temp0, 4);
 7277     __ sub(temp1, temp1, temp0);
 7278     __ add(temp1, temp1, s2, ext::uxth);
 7279 
 7280     __ lsr(temp0, temp1, 16);
 7281     __ lsl(s2, temp0, 4);
 7282     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 7284 
 7285     __ subs(temp0, s2, base);
 7286     __ csel(s2, temp0, s2, Assembler::HS);
 7287 
 7288     __ subs(len, len, nmax);
 7289     __ sub(count, nmax, 16);
 7290     __ br(Assembler::HS, L_nmax_loop);
 7291 
 7292     __ bind(L_by16);
 7293     __ adds(len, len, count);
 7294     __ br(Assembler::LO, L_by1);
 7295 
 7296     __ bind(L_by16_loop);
 7297 
 7298     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7299                                       vbytes, vs1acc, vs2acc, vtable);
 7300 
 7301     __ subs(len, len, 16);
 7302     __ br(Assembler::HS, L_by16_loop);
 7303 
 7304     __ bind(L_by1);
 7305     __ adds(len, len, 15);
 7306     __ br(Assembler::LO, L_do_mod);
 7307 
 7308     __ bind(L_by1_loop);
 7309     __ ldrb(temp0, Address(__ post(buff, 1)));
 7310     __ add(s1, temp0, s1);
 7311     __ add(s2, s2, s1);
 7312     __ subs(len, len, 1);
 7313     __ br(Assembler::HS, L_by1_loop);
 7314 
 7315     __ bind(L_do_mod);
 7316     // s1 = s1 % BASE
 7317     __ lsr(temp0, s1, 16);
 7318     __ lsl(temp1, temp0, 4);
 7319     __ sub(temp1, temp1, temp0);
 7320     __ add(temp1, temp1, s1, ext::uxth);
 7321 
 7322     __ lsr(temp0, temp1, 16);
 7323     __ lsl(s1, temp0, 4);
 7324     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 7326 
 7327     __ subs(temp0, s1, base);
 7328     __ csel(s1, temp0, s1, Assembler::HS);
 7329 
 7330     // s2 = s2 % BASE
 7331     __ lsr(temp0, s2, 16);
 7332     __ lsl(temp1, temp0, 4);
 7333     __ sub(temp1, temp1, temp0);
 7334     __ add(temp1, temp1, s2, ext::uxth);
 7335 
 7336     __ lsr(temp0, temp1, 16);
 7337     __ lsl(s2, temp0, 4);
 7338     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 7340 
 7341     __ subs(temp0, s2, base);
 7342     __ csel(s2, temp0, s2, Assembler::HS);
 7343 
 7344     // Combine lower bits and higher bits
 7345     __ bind(L_combine);
 7346     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7347 
 7348     __ ret(lr);
 7349 
 7350     return start;
 7351   }
 7352 
 7353   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7354           Register temp0, Register temp1, FloatRegister vbytes,
 7355           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7356     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7357     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7358     // In non-vectorized code, we update s1 and s2 as:
 7359     //   s1 <- s1 + b1
 7360     //   s2 <- s2 + s1
 7361     //   s1 <- s1 + b2
    //   s2 <- s2 + s1
 7363     //   ...
 7364     //   s1 <- s1 + b16
 7365     //   s2 <- s2 + s1
 7366     // Putting above assignments together, we have:
 7367     //   s1_new = s1 + b1 + b2 + ... + b16
 7368     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7369     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7370     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
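    // For example, with just two bytes b1, b2 this gives
    //   s1_new = s1 + b1 + b2
    //   s2_new = s2 + 2 * s1 + (2 * b1 + 1 * b2)
    // i.e. the dot product of (b1, b2) with (2, 1).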
 7371     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7372 
 7373     // s2 = s2 + s1 * 16
 7374     __ add(s2, s2, s1, Assembler::LSL, 4);
 7375 
 7376     // vs1acc = b1 + b2 + b3 + ... + b16
 7377     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7378     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7379     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7380     __ uaddlv(vs1acc, __ T16B, vbytes);
 7381     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7382 
 7383     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7384     __ fmovd(temp0, vs1acc);
 7385     __ fmovd(temp1, vs2acc);
 7386     __ add(s1, s1, temp0);
 7387     __ add(s2, s2, temp1);
 7388   }
 7389 
 7390   /**
 7391    *  Arguments:
 7392    *
 7393    *  Input:
 7394    *    c_rarg0   - x address
 7395    *    c_rarg1   - x length
 7396    *    c_rarg2   - y address
 7397    *    c_rarg3   - y length
 7398    *    c_rarg4   - z address
 7399    */
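  //
  // A sketch of what this computes (this stub presumably backs the
  // BigInteger.multiplyToLen intrinsic): the schoolbook product of two
  // big-endian int-limb magnitudes, z = x * y, where z has xlen + ylen
  // ints and each partial product
  //   (x[i] & 0xffffffffL) * (y[j] & 0xffffffffL)
  // is accumulated into z[i + j + 1] with carries propagated towards z[0].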
 7400   address generate_multiplyToLen() {
 7401     __ align(CodeEntryAlignment);
 7402     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
 7403     StubCodeMark mark(this, stub_id);
 7404 
 7405     address start = __ pc();
 7406     const Register x     = r0;
 7407     const Register xlen  = r1;
 7408     const Register y     = r2;
 7409     const Register ylen  = r3;
 7410     const Register z     = r4;
 7411 
 7412     const Register tmp0  = r5;
 7413     const Register tmp1  = r10;
 7414     const Register tmp2  = r11;
 7415     const Register tmp3  = r12;
 7416     const Register tmp4  = r13;
 7417     const Register tmp5  = r14;
 7418     const Register tmp6  = r15;
 7419     const Register tmp7  = r16;
 7420 
 7421     BLOCK_COMMENT("Entry:");
 7422     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7423     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7424     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7425     __ ret(lr);
 7426 
 7427     return start;
 7428   }
 7429 
 7430   address generate_squareToLen() {
 7431     // squareToLen algorithm for sizes 1..127 described in java code works
 7432     // faster than multiply_to_len on some CPUs and slower on others, but
 7433     // multiply_to_len shows a bit better overall results
 7434     __ align(CodeEntryAlignment);
 7435     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
 7436     StubCodeMark mark(this, stub_id);
 7437     address start = __ pc();
 7438 
 7439     const Register x     = r0;
 7440     const Register xlen  = r1;
 7441     const Register z     = r2;
 7442     const Register y     = r4; // == x
 7443     const Register ylen  = r5; // == xlen
 7444 
 7445     const Register tmp0  = r3;
 7446     const Register tmp1  = r10;
 7447     const Register tmp2  = r11;
 7448     const Register tmp3  = r12;
 7449     const Register tmp4  = r13;
 7450     const Register tmp5  = r14;
 7451     const Register tmp6  = r15;
 7452     const Register tmp7  = r16;
 7453 
 7454     RegSet spilled_regs = RegSet::of(y, ylen);
 7455     BLOCK_COMMENT("Entry:");
 7456     __ enter();
 7457     __ push(spilled_regs, sp);
 7458     __ mov(y, x);
 7459     __ mov(ylen, xlen);
 7460     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7461     __ pop(spilled_regs, sp);
 7462     __ leave();
 7463     __ ret(lr);
 7464     return start;
 7465   }
 7466 
 7467   address generate_mulAdd() {
 7468     __ align(CodeEntryAlignment);
 7469     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
 7470     StubCodeMark mark(this, stub_id);
 7471 
 7472     address start = __ pc();
 7473 
 7474     const Register out     = r0;
 7475     const Register in      = r1;
 7476     const Register offset  = r2;
 7477     const Register len     = r3;
 7478     const Register k       = r4;
 7479 
 7480     BLOCK_COMMENT("Entry:");
 7481     __ enter();
 7482     __ mul_add(out, in, offset, len, k);
 7483     __ leave();
 7484     __ ret(lr);
 7485 
 7486     return start;
 7487   }
 7488 
 7489   // Arguments:
 7490   //
 7491   // Input:
 7492   //   c_rarg0   - newArr address
 7493   //   c_rarg1   - oldArr address
 7494   //   c_rarg2   - newIdx
 7495   //   c_rarg3   - shiftCount
 7496   //   c_rarg4   - numIter
 7497   //
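  // A sketch of the per-word operation implemented below (by both the SIMD
  // body and the scalar tails), ignoring edge cases such as shiftCount == 0:
  //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
  //                      | (oldArr[i] << (32 - shiftCount))
  // for i = 0 .. numIter - 1.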
 7498   address generate_bigIntegerRightShift() {
 7499     __ align(CodeEntryAlignment);
 7500     StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
 7501     StubCodeMark mark(this, stub_id);
 7502     address start = __ pc();
 7503 
 7504     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7505 
 7506     Register newArr        = c_rarg0;
 7507     Register oldArr        = c_rarg1;
 7508     Register newIdx        = c_rarg2;
 7509     Register shiftCount    = c_rarg3;
 7510     Register numIter       = c_rarg4;
 7511     Register idx           = numIter;
 7512 
 7513     Register newArrCur     = rscratch1;
 7514     Register shiftRevCount = rscratch2;
 7515     Register oldArrCur     = r13;
 7516     Register oldArrNext    = r14;
 7517 
 7518     FloatRegister oldElem0        = v0;
 7519     FloatRegister oldElem1        = v1;
 7520     FloatRegister newElem         = v2;
 7521     FloatRegister shiftVCount     = v3;
 7522     FloatRegister shiftVRevCount  = v4;
 7523 
 7524     __ cbz(idx, Exit);
 7525 
 7526     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7527 
 7528     // left shift count
 7529     __ movw(shiftRevCount, 32);
 7530     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7531 
    // numIter too small to allow a 4-word SIMD loop; fall back to the scalar path
 7533     __ cmp(numIter, (u1)4);
 7534     __ br(Assembler::LT, ShiftThree);
 7535 
 7536     __ dup(shiftVCount,    __ T4S, shiftCount);
 7537     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7538     __ negr(shiftVCount,   __ T4S, shiftVCount);
 7539 
 7540     __ BIND(ShiftSIMDLoop);
 7541 
 7542     // Calculate the load addresses
 7543     __ sub(idx, idx, 4);
 7544     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7545     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7546     __ add(oldArrCur,  oldArrNext, 4);
 7547 
 7548     // Load 4 words and process
 7549     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7550     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7551     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7552     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7553     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7554     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7555 
 7556     __ cmp(idx, (u1)4);
 7557     __ br(Assembler::LT, ShiftTwoLoop);
 7558     __ b(ShiftSIMDLoop);
 7559 
 7560     __ BIND(ShiftTwoLoop);
 7561     __ cbz(idx, Exit);
 7562     __ cmp(idx, (u1)1);
 7563     __ br(Assembler::EQ, ShiftOne);
 7564 
 7565     // Calculate the load addresses
 7566     __ sub(idx, idx, 2);
 7567     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7568     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7569     __ add(oldArrCur,  oldArrNext, 4);
 7570 
 7571     // Load 2 words and process
 7572     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 7573     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 7574     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 7575     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 7576     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 7577     __ st1(newElem,   __ T2S, Address(newArrCur));
 7578     __ b(ShiftTwoLoop);
 7579 
 7580     __ BIND(ShiftThree);
 7581     __ tbz(idx, 1, ShiftOne);
 7582     __ tbz(idx, 0, ShiftTwo);
 7583     __ ldrw(r10,  Address(oldArr, 12));
 7584     __ ldrw(r11,  Address(oldArr, 8));
 7585     __ lsrvw(r10, r10, shiftCount);
 7586     __ lslvw(r11, r11, shiftRevCount);
 7587     __ orrw(r12,  r10, r11);
 7588     __ strw(r12,  Address(newArr, 8));
 7589 
 7590     __ BIND(ShiftTwo);
 7591     __ ldrw(r10,  Address(oldArr, 8));
 7592     __ ldrw(r11,  Address(oldArr, 4));
 7593     __ lsrvw(r10, r10, shiftCount);
 7594     __ lslvw(r11, r11, shiftRevCount);
 7595     __ orrw(r12,  r10, r11);
 7596     __ strw(r12,  Address(newArr, 4));
 7597 
 7598     __ BIND(ShiftOne);
 7599     __ ldrw(r10,  Address(oldArr, 4));
 7600     __ ldrw(r11,  Address(oldArr));
 7601     __ lsrvw(r10, r10, shiftCount);
 7602     __ lslvw(r11, r11, shiftRevCount);
 7603     __ orrw(r12,  r10, r11);
 7604     __ strw(r12,  Address(newArr));
 7605 
 7606     __ BIND(Exit);
 7607     __ ret(lr);
 7608 
 7609     return start;
 7610   }
 7611 
 7612   // Arguments:
 7613   //
 7614   // Input:
 7615   //   c_rarg0   - newArr address
 7616   //   c_rarg1   - oldArr address
 7617   //   c_rarg2   - newIdx
 7618   //   c_rarg3   - shiftCount
 7619   //   c_rarg4   - numIter
 7620   //
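  // A sketch of the per-word operation implemented below, mirroring the
  // right-shift stub above (again ignoring the shiftCount == 0 edge case):
  //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
  //                      | (oldArr[i + 1] >>> (32 - shiftCount))
  // for i = 0 .. numIter - 1.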
 7621   address generate_bigIntegerLeftShift() {
 7622     __ align(CodeEntryAlignment);
 7623     StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
 7624     StubCodeMark mark(this, stub_id);
 7625     address start = __ pc();
 7626 
 7627     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7628 
 7629     Register newArr        = c_rarg0;
 7630     Register oldArr        = c_rarg1;
 7631     Register newIdx        = c_rarg2;
 7632     Register shiftCount    = c_rarg3;
 7633     Register numIter       = c_rarg4;
 7634 
 7635     Register shiftRevCount = rscratch1;
 7636     Register oldArrNext    = rscratch2;
 7637 
 7638     FloatRegister oldElem0        = v0;
 7639     FloatRegister oldElem1        = v1;
 7640     FloatRegister newElem         = v2;
 7641     FloatRegister shiftVCount     = v3;
 7642     FloatRegister shiftVRevCount  = v4;
 7643 
 7644     __ cbz(numIter, Exit);
 7645 
 7646     __ add(oldArrNext, oldArr, 4);
 7647     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7648 
 7649     // right shift count
 7650     __ movw(shiftRevCount, 32);
 7651     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7652 
    // numIter too small to allow a 4-word SIMD loop; fall back to the scalar path
 7654     __ cmp(numIter, (u1)4);
 7655     __ br(Assembler::LT, ShiftThree);
 7656 
 7657     __ dup(shiftVCount,     __ T4S, shiftCount);
 7658     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 7659     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 7660 
 7661     __ BIND(ShiftSIMDLoop);
 7662 
 7663     // load 4 words and process
 7664     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 7665     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 7666     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7667     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7668     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7669     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 7670     __ sub(numIter,   numIter, 4);
 7671 
 7672     __ cmp(numIter, (u1)4);
 7673     __ br(Assembler::LT, ShiftTwoLoop);
 7674     __ b(ShiftSIMDLoop);
 7675 
 7676     __ BIND(ShiftTwoLoop);
 7677     __ cbz(numIter, Exit);
 7678     __ cmp(numIter, (u1)1);
 7679     __ br(Assembler::EQ, ShiftOne);
 7680 
 7681     // load 2 words and process
 7682     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 7683     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 7684     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 7685     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 7686     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 7687     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 7688     __ sub(numIter,   numIter, 2);
 7689     __ b(ShiftTwoLoop);
 7690 
 7691     __ BIND(ShiftThree);
 7692     __ ldrw(r10,  __ post(oldArr, 4));
 7693     __ ldrw(r11,  __ post(oldArrNext, 4));
 7694     __ lslvw(r10, r10, shiftCount);
 7695     __ lsrvw(r11, r11, shiftRevCount);
 7696     __ orrw(r12,  r10, r11);
 7697     __ strw(r12,  __ post(newArr, 4));
 7698     __ tbz(numIter, 1, Exit);
 7699     __ tbz(numIter, 0, ShiftOne);
 7700 
 7701     __ BIND(ShiftTwo);
 7702     __ ldrw(r10,  __ post(oldArr, 4));
 7703     __ ldrw(r11,  __ post(oldArrNext, 4));
 7704     __ lslvw(r10, r10, shiftCount);
 7705     __ lsrvw(r11, r11, shiftRevCount);
 7706     __ orrw(r12,  r10, r11);
 7707     __ strw(r12,  __ post(newArr, 4));
 7708 
 7709     __ BIND(ShiftOne);
 7710     __ ldrw(r10,  Address(oldArr));
 7711     __ ldrw(r11,  Address(oldArrNext));
 7712     __ lslvw(r10, r10, shiftCount);
 7713     __ lsrvw(r11, r11, shiftRevCount);
 7714     __ orrw(r12,  r10, r11);
 7715     __ strw(r12,  Address(newArr));
 7716 
 7717     __ BIND(Exit);
 7718     __ ret(lr);
 7719 
 7720     return start;
 7721   }
 7722 
 7723   address generate_count_positives(address &count_positives_long) {
 7724     const u1 large_loop_size = 64;
    const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
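    // Each 0x80 byte in UPPER_BIT_MASK selects the sign bit of one of the
    // 8 bytes packed into a 64-bit word, so tst(x, UPPER_BIT_MASK) sets NE
    // iff at least one of those 8 bytes is negative.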
 7726     int dcache_line = VM_Version::dcache_line_size();
 7727 
 7728     Register ary1 = r1, len = r2, result = r0;
 7729 
 7730     __ align(CodeEntryAlignment);
 7731 
 7732     StubGenStubId stub_id = StubGenStubId::count_positives_id;
 7733     StubCodeMark mark(this, stub_id);
 7734 
 7735     address entry = __ pc();
 7736 
 7737     __ enter();
 7738     // precondition: a copy of len is already in result
 7739     // __ mov(result, len);
 7740 
 7741   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 7742         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 7743 
 7744   __ cmp(len, (u1)15);
 7745   __ br(Assembler::GT, LEN_OVER_15);
  // The only case when execution falls into this code is when the pointer is
  // near the end of a memory page and we have to avoid reading the next page
 7748   __ add(ary1, ary1, len);
 7749   __ subs(len, len, 8);
 7750   __ br(Assembler::GT, LEN_OVER_8);
 7751   __ ldr(rscratch2, Address(ary1, -8));
 7752   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 7753   __ lsrv(rscratch2, rscratch2, rscratch1);
 7754   __ tst(rscratch2, UPPER_BIT_MASK);
 7755   __ csel(result, zr, result, Assembler::NE);
 7756   __ leave();
 7757   __ ret(lr);
 7758   __ bind(LEN_OVER_8);
 7759   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 7760   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
 7761   __ tst(rscratch2, UPPER_BIT_MASK);
 7762   __ br(Assembler::NE, RET_NO_POP);
 7763   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 7764   __ lsrv(rscratch1, rscratch1, rscratch2);
 7765   __ tst(rscratch1, UPPER_BIT_MASK);
 7766   __ bind(RET_NO_POP);
 7767   __ csel(result, zr, result, Assembler::NE);
 7768   __ leave();
 7769   __ ret(lr);
 7770 
 7771   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 7772   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 7773 
 7774   count_positives_long = __ pc(); // 2nd entry point
 7775 
 7776   __ enter();
 7777 
 7778   __ bind(LEN_OVER_15);
 7779     __ push(spilled_regs, sp);
 7780     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 7781     __ cbz(rscratch2, ALIGNED);
 7782     __ ldp(tmp6, tmp1, Address(ary1));
 7783     __ mov(tmp5, 16);
 7784     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 7785     __ add(ary1, ary1, rscratch1);
 7786     __ orr(tmp6, tmp6, tmp1);
 7787     __ tst(tmp6, UPPER_BIT_MASK);
 7788     __ br(Assembler::NE, RET_ADJUST);
 7789     __ sub(len, len, rscratch1);
 7790 
 7791   __ bind(ALIGNED);
 7792     __ cmp(len, large_loop_size);
 7793     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop to handle the
    // situation when an initially aligned large array has negative values at
    // its starting bytes, in which case LARGE_LOOP would do 4 reads instead of
    // 1 (in the worst case), which is slower. Cases with negative bytes further
    // ahead won't be affected much. In fact, they'll be faster due to the early
    // loads, fewer instructions and fewer branches in LARGE_LOOP.
 7800     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 7801     __ sub(len, len, 16);
 7802     __ orr(tmp6, tmp6, tmp1);
 7803     __ tst(tmp6, UPPER_BIT_MASK);
 7804     __ br(Assembler::NE, RET_ADJUST_16);
 7805     __ cmp(len, large_loop_size);
 7806     __ br(Assembler::LT, CHECK_16);
 7807 
 7808     if (SoftwarePrefetchHintDistance >= 0
 7809         && SoftwarePrefetchHintDistance >= dcache_line) {
 7810       // initial prefetch
 7811       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 7812     }
 7813   __ bind(LARGE_LOOP);
 7814     if (SoftwarePrefetchHintDistance >= 0) {
 7815       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 7816     }
    // Issue the load instructions first, since that can save a few CPU/MEM
    // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);"
    // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) +
    // 1 cbnz(...), which saves 3 instructions per cycle and has fewer
    // branches. The downside is that this approach disables early return, so
    // all 64 bytes are loaded and checked every time.
 7822     __ ldp(tmp2, tmp3, Address(ary1));
 7823     __ ldp(tmp4, tmp5, Address(ary1, 16));
 7824     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 7825     __ ldp(tmp6, tmp1, Address(ary1, 48));
 7826     __ add(ary1, ary1, large_loop_size);
 7827     __ sub(len, len, large_loop_size);
 7828     __ orr(tmp2, tmp2, tmp3);
 7829     __ orr(tmp4, tmp4, tmp5);
 7830     __ orr(rscratch1, rscratch1, rscratch2);
 7831     __ orr(tmp6, tmp6, tmp1);
 7832     __ orr(tmp2, tmp2, tmp4);
 7833     __ orr(rscratch1, rscratch1, tmp6);
 7834     __ orr(tmp2, tmp2, rscratch1);
 7835     __ tst(tmp2, UPPER_BIT_MASK);
 7836     __ br(Assembler::NE, RET_ADJUST_LONG);
 7837     __ cmp(len, large_loop_size);
 7838     __ br(Assembler::GE, LARGE_LOOP);
 7839 
 7840   __ bind(CHECK_16); // small 16-byte load pre-loop
 7841     __ cmp(len, (u1)16);
 7842     __ br(Assembler::LT, POST_LOOP16);
 7843 
 7844   __ bind(LOOP16); // small 16-byte load loop
 7845     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 7846     __ sub(len, len, 16);
 7847     __ orr(tmp2, tmp2, tmp3);
 7848     __ tst(tmp2, UPPER_BIT_MASK);
 7849     __ br(Assembler::NE, RET_ADJUST_16);
 7850     __ cmp(len, (u1)16);
 7851     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 7852 
 7853   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 7854     __ cmp(len, (u1)8);
 7855     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 7856     __ ldr(tmp3, Address(__ post(ary1, 8)));
 7857     __ tst(tmp3, UPPER_BIT_MASK);
 7858     __ br(Assembler::NE, RET_ADJUST);
 7859     __ sub(len, len, 8);
 7860 
 7861   __ bind(POST_LOOP16_LOAD_TAIL);
 7862     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 7863     __ ldr(tmp1, Address(ary1));
 7864     __ mov(tmp2, 64);
 7865     __ sub(tmp4, tmp2, len, __ LSL, 3);
 7866     __ lslv(tmp1, tmp1, tmp4);
 7867     __ tst(tmp1, UPPER_BIT_MASK);
 7868     __ br(Assembler::NE, RET_ADJUST);
 7869     // Fallthrough
 7870 
 7871   __ bind(RET_LEN);
 7872     __ pop(spilled_regs, sp);
 7873     __ leave();
 7874     __ ret(lr);
 7875 
    // the difference (result - len) is the count of bytes guaranteed
    // to be positive
 7878 
 7879   __ bind(RET_ADJUST_LONG);
 7880     __ add(len, len, (u1)(large_loop_size - 16));
 7881   __ bind(RET_ADJUST_16);
 7882     __ add(len, len, 16);
 7883   __ bind(RET_ADJUST);
 7884     __ pop(spilled_regs, sp);
 7885     __ leave();
 7886     __ sub(result, result, len);
 7887     __ ret(lr);
 7888 
 7889     return entry;
 7890   }
 7891 
 7892   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 7893         bool usePrefetch, Label &NOT_EQUAL) {
 7894     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 7895         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 7896         tmp7 = r12, tmp8 = r13;
 7897     Label LOOP;
 7898 
 7899     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 7900     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 7901     __ bind(LOOP);
 7902     if (usePrefetch) {
 7903       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 7904       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 7905     }
 7906     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 7907     __ eor(tmp1, tmp1, tmp2);
 7908     __ eor(tmp3, tmp3, tmp4);
 7909     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 7910     __ orr(tmp1, tmp1, tmp3);
 7911     __ cbnz(tmp1, NOT_EQUAL);
 7912     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 7913     __ eor(tmp5, tmp5, tmp6);
 7914     __ eor(tmp7, tmp7, tmp8);
 7915     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 7916     __ orr(tmp5, tmp5, tmp7);
 7917     __ cbnz(tmp5, NOT_EQUAL);
 7918     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 7919     __ eor(tmp1, tmp1, tmp2);
 7920     __ eor(tmp3, tmp3, tmp4);
 7921     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 7922     __ orr(tmp1, tmp1, tmp3);
 7923     __ cbnz(tmp1, NOT_EQUAL);
 7924     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 7925     __ eor(tmp5, tmp5, tmp6);
 7926     __ sub(cnt1, cnt1, 8 * wordSize);
 7927     __ eor(tmp7, tmp7, tmp8);
 7928     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
 7931     __ subs(tmp6, cnt1, loopThreshold);
 7932     __ orr(tmp5, tmp5, tmp7);
 7933     __ cbnz(tmp5, NOT_EQUAL);
 7934     __ br(__ GE, LOOP);
 7935     // post-loop
 7936     __ eor(tmp1, tmp1, tmp2);
 7937     __ eor(tmp3, tmp3, tmp4);
 7938     __ orr(tmp1, tmp1, tmp3);
 7939     __ sub(cnt1, cnt1, 2 * wordSize);
 7940     __ cbnz(tmp1, NOT_EQUAL);
 7941   }
 7942 
 7943   void generate_large_array_equals_loop_simd(int loopThreshold,
 7944         bool usePrefetch, Label &NOT_EQUAL) {
 7945     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 7946         tmp2 = rscratch2;
 7947     Label LOOP;
 7948 
 7949     __ bind(LOOP);
 7950     if (usePrefetch) {
 7951       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 7952       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 7953     }
 7954     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 7955     __ sub(cnt1, cnt1, 8 * wordSize);
 7956     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 7957     __ subs(tmp1, cnt1, loopThreshold);
 7958     __ eor(v0, __ T16B, v0, v4);
 7959     __ eor(v1, __ T16B, v1, v5);
 7960     __ eor(v2, __ T16B, v2, v6);
 7961     __ eor(v3, __ T16B, v3, v7);
 7962     __ orr(v0, __ T16B, v0, v1);
 7963     __ orr(v1, __ T16B, v2, v3);
 7964     __ orr(v0, __ T16B, v0, v1);
 7965     __ umov(tmp1, v0, __ D, 0);
 7966     __ umov(tmp2, v0, __ D, 1);
 7967     __ orr(tmp1, tmp1, tmp2);
 7968     __ cbnz(tmp1, NOT_EQUAL);
 7969     __ br(__ GE, LOOP);
 7970   }
 7971 
 7972   // a1 = r1 - array1 address
 7973   // a2 = r2 - array2 address
 7974   // result = r0 - return value. Already contains "false"
 7975   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 7976   // r3-r5 are reserved temporary registers
 7977   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
 7978   address generate_large_array_equals() {
 7979     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 7980         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 7981         tmp7 = r12, tmp8 = r13;
 7982     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 7983         SMALL_LOOP, POST_LOOP;
 7984     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 7985     // calculate if at least 32 prefetched bytes are used
 7986     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 7987     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 7988     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 7989     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 7990         tmp5, tmp6, tmp7, tmp8);
 7991 
 7992     __ align(CodeEntryAlignment);
 7993 
 7994     StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
 7995     StubCodeMark mark(this, stub_id);
 7996 
 7997     address entry = __ pc();
 7998     __ enter();
 7999     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 8000     // also advance pointers to use post-increment instead of pre-increment
 8001     __ add(a1, a1, wordSize);
 8002     __ add(a2, a2, wordSize);
 8003     if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD/non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a huge penalty (up to 2x exec
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so, if needed, we can do one
      // additional 8-byte load for the 1st address to make it 16-byte aligned.
 8009       Label ALIGNED16;
 8010       __ tbz(a1, 3, ALIGNED16);
 8011       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8012       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8013       __ sub(cnt1, cnt1, wordSize);
 8014       __ eor(tmp1, tmp1, tmp2);
 8015       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 8016       __ bind(ALIGNED16);
 8017     }
 8018     if (UseSIMDForArrayEquals) {
 8019       if (SoftwarePrefetchHintDistance >= 0) {
 8020         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8021         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8022         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 8023             /* prfm = */ true, NOT_EQUAL);
 8024         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8025         __ br(__ LT, TAIL);
 8026       }
 8027       __ bind(NO_PREFETCH_LARGE_LOOP);
 8028       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 8029           /* prfm = */ false, NOT_EQUAL);
 8030     } else {
 8031       __ push(spilled_regs, sp);
 8032       if (SoftwarePrefetchHintDistance >= 0) {
 8033         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8034         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8035         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 8036             /* prfm = */ true, NOT_EQUAL);
 8037         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8038         __ br(__ LT, TAIL);
 8039       }
 8040       __ bind(NO_PREFETCH_LARGE_LOOP);
 8041       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 8042           /* prfm = */ false, NOT_EQUAL);
 8043     }
 8044     __ bind(TAIL);
 8045       __ cbz(cnt1, EQUAL);
 8046       __ subs(cnt1, cnt1, wordSize);
 8047       __ br(__ LE, POST_LOOP);
 8048     __ bind(SMALL_LOOP);
 8049       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8050       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8051       __ subs(cnt1, cnt1, wordSize);
 8052       __ eor(tmp1, tmp1, tmp2);
 8053       __ cbnz(tmp1, NOT_EQUAL);
 8054       __ br(__ GT, SMALL_LOOP);
 8055     __ bind(POST_LOOP);
 8056       __ ldr(tmp1, Address(a1, cnt1));
 8057       __ ldr(tmp2, Address(a2, cnt1));
 8058       __ eor(tmp1, tmp1, tmp2);
 8059       __ cbnz(tmp1, NOT_EQUAL);
 8060     __ bind(EQUAL);
 8061       __ mov(result, true);
 8062     __ bind(NOT_EQUAL);
 8063       if (!UseSIMDForArrayEquals) {
 8064         __ pop(spilled_regs, sp);
 8065       }
 8066     __ bind(NOT_EQUAL_NO_POP);
 8067     __ leave();
 8068     __ ret(lr);
 8069     return entry;
 8070   }
 8071 
 8072   // result = r0 - return value. Contains initial hashcode value on entry.
 8073   // ary = r1 - array address
 8074   // cnt = r2 - elements count
 8075   // Clobbers: v0-v13, rscratch1, rscratch2
 8076   address generate_large_arrays_hashcode(BasicType eltype) {
 8077     const Register result = r0, ary = r1, cnt = r2;
 8078     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 8079     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 8080     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 8081     const FloatRegister vpowm = v13;
 8082 
 8083     ARRAYS_HASHCODE_REGISTERS;
 8084 
 8085     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 8086 
 8087     unsigned int vf; // vectorization factor
 8088     bool multiply_by_halves;
 8089     Assembler::SIMD_Arrangement load_arrangement;
 8090     switch (eltype) {
 8091     case T_BOOLEAN:
 8092     case T_BYTE:
 8093       load_arrangement = Assembler::T8B;
 8094       multiply_by_halves = true;
 8095       vf = 8;
 8096       break;
 8097     case T_CHAR:
 8098     case T_SHORT:
 8099       load_arrangement = Assembler::T8H;
 8100       multiply_by_halves = true;
 8101       vf = 8;
 8102       break;
 8103     case T_INT:
 8104       load_arrangement = Assembler::T4S;
 8105       multiply_by_halves = false;
 8106       vf = 4;
 8107       break;
 8108     default:
 8109       ShouldNotReachHere();
 8110     }
 8111 
 8112     // Unroll factor
 8113     const unsigned uf = 4;
 8114 
 8115     // Effective vectorization factor
 8116     const unsigned evf = vf * uf;
 8117 
 8118     __ align(CodeEntryAlignment);
 8119 
 8120     StubGenStubId stub_id;
 8121     switch (eltype) {
 8122     case T_BOOLEAN:
 8123       stub_id = StubGenStubId::large_arrays_hashcode_boolean_id;
 8124       break;
 8125     case T_BYTE:
 8126       stub_id = StubGenStubId::large_arrays_hashcode_byte_id;
 8127       break;
 8128     case T_CHAR:
 8129       stub_id = StubGenStubId::large_arrays_hashcode_char_id;
 8130       break;
 8131     case T_SHORT:
 8132       stub_id = StubGenStubId::large_arrays_hashcode_short_id;
 8133       break;
 8134     case T_INT:
 8135       stub_id = StubGenStubId::large_arrays_hashcode_int_id;
 8136       break;
 8137     default:
 8138       stub_id = StubGenStubId::NO_STUBID;
 8139       ShouldNotReachHere();
 8140     };
 8141 
 8142     StubCodeMark mark(this, stub_id);
 8143 
 8144     address entry = __ pc();
 8145     __ enter();
 8146 
    // Put the 0th-3rd powers of 31 together into a single SIMD register. The register will be used in
    // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
    // value shouldn't change throughout both loops.
 8150     __ movw(rscratch1, intpow(31U, 3));
 8151     __ mov(vpow, Assembler::S, 0, rscratch1);
 8152     __ movw(rscratch1, intpow(31U, 2));
 8153     __ mov(vpow, Assembler::S, 1, rscratch1);
 8154     __ movw(rscratch1, intpow(31U, 1));
 8155     __ mov(vpow, Assembler::S, 2, rscratch1);
 8156     __ movw(rscratch1, intpow(31U, 0));
 8157     __ mov(vpow, Assembler::S, 3, rscratch1);
 8158 
 8159     __ mov(vmul0, Assembler::T16B, 0);
 8160     __ mov(vmul0, Assembler::S, 3, result);
 8161 
 8162     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8163     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8164 
 8165     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8166     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8167 
 8168     // SMALL LOOP
 8169     __ bind(SMALL_LOOP);
 8170 
 8171     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8172     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8173     __ subsw(rscratch2, rscratch2, vf);
 8174 
 8175     if (load_arrangement == Assembler::T8B) {
 8176       // Extend 8B to 8H to be able to use vector multiply
 8177       // instructions
 8178       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8179       if (is_signed_subword_type(eltype)) {
 8180         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8181       } else {
 8182         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8183       }
 8184     }
 8185 
 8186     switch (load_arrangement) {
 8187     case Assembler::T4S:
 8188       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8189       break;
 8190     case Assembler::T8B:
 8191     case Assembler::T8H:
 8192       assert(is_subword_type(eltype), "subword type expected");
 8193       if (is_signed_subword_type(eltype)) {
 8194         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8195       } else {
 8196         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8197       }
 8198       break;
 8199     default:
 8200       __ should_not_reach_here();
 8201     }
 8202 
 8203     // Process the upper half of a vector
 8204     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8205       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8206       if (is_signed_subword_type(eltype)) {
 8207         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8208       } else {
 8209         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8210       }
 8211     }
 8212 
 8213     __ br(Assembler::HI, SMALL_LOOP);
 8214 
 8215     // SMALL LOOP'S EPILOGUE
 8216     __ lsr(rscratch2, cnt, exact_log2(evf));
 8217     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8218 
 8219     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8220     __ addv(vmul0, Assembler::T4S, vmul0);
 8221     __ umov(result, vmul0, Assembler::S, 0);
 8222 
 8223     // TAIL
 8224     __ bind(TAIL);
 8225 
 8226     // The andr computes cnt % vf. The subtract, shifted by 3 (or 4 on Cortex-A53), moves the branch
 8227     // target past vf - 1 - (cnt % vf) load + madd pairs, i.e. only cnt % vf pairs are executed.
 8228     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 8229     __ andr(rscratch2, cnt, vf - 1);
 8230     __ bind(TAIL_SHORTCUT);
 8231     __ adr(rscratch1, BR_BASE);
 8232     // For Cortex-A53 the shift is 4 because 2 nops are generated, making each pair 4 instructions.
 8233     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 8234     __ movw(rscratch2, 0x1f);
 8235     __ br(rscratch1);
 8236 
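          // In effect this is a computed branch into the unrolled tail below: with
          // r = cnt % vf remaining elements, the target lands r load + madd pairs
          // before BR_BASE, so the scalar recurrence result = 31 * result + ary[i]
          // executes exactly r times before falling through.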
 8237     for (size_t i = 0; i < vf - 1; ++i) {
 8238       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8239                                    eltype);
 8240       __ maddw(result, result, rscratch2, rscratch1);
 8241       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 8242       // Generate 2nd nop to have 4 instructions per iteration.
 8243       if (VM_Version::supports_a53mac()) {
 8244         __ nop();
 8245       }
 8246     }
 8247     __ bind(BR_BASE);
 8248 
 8249     __ leave();
 8250     __ ret(lr);
 8251 
 8252     // LARGE LOOP
 8253     __ bind(LARGE_LOOP_PREHEADER);
 8254 
 8255     __ lsr(rscratch2, cnt, exact_log2(evf));
 8256 
 8257     if (multiply_by_halves) {
 8258       // 31^4 - multiplier between lower and upper parts of a register
 8259       __ movw(rscratch1, intpow(31U, vf / 2));
 8260       __ mov(vpowm, Assembler::S, 1, rscratch1);
 8261       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 8262       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8263       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8264     } else {
 8265       // 31^16
 8266       __ movw(rscratch1, intpow(31U, evf));
 8267       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8268     }
 8269 
 8270     __ mov(vmul3, Assembler::T16B, 0);
 8271     __ mov(vmul2, Assembler::T16B, 0);
 8272     __ mov(vmul1, Assembler::T16B, 0);
 8273 
 8274     __ bind(LARGE_LOOP);
 8275 
 8276     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8277     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8278     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8279     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8280 
 8281     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8282            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8283 
 8284     if (load_arrangement == Assembler::T8B) {
 8285       // Extend 8B to 8H to be able to use vector multiply
 8286       // instructions
 8287       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8288       if (is_signed_subword_type(eltype)) {
 8289         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8290         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8291         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8292         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8293       } else {
 8294         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8295         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8296         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8297         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8298       }
 8299     }
 8300 
 8301     switch (load_arrangement) {
 8302     case Assembler::T4S:
 8303       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8304       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8305       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8306       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8307       break;
 8308     case Assembler::T8B:
 8309     case Assembler::T8H:
 8310       assert(is_subword_type(eltype), "subword type expected");
 8311       if (is_signed_subword_type(eltype)) {
 8312         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8313         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8314         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8315         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8316       } else {
 8317         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8318         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8319         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8320         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8321       }
 8322       break;
 8323     default:
 8324       __ should_not_reach_here();
 8325     }
 8326 
 8327     // Process the upper half of a vector
 8328     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8329       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8330       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8331       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8332       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8333       if (is_signed_subword_type(eltype)) {
 8334         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8335         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8336         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8337         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8338       } else {
 8339         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8340         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8341         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8342         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8343       }
 8344     }
 8345 
 8346     __ subsw(rscratch2, rscratch2, 1);
 8347     __ br(Assembler::HI, LARGE_LOOP);
 8348 
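          // Combine the four accumulators. Writing hK for the horizontal sum of
          // vmulK * <31^3, 31^2, 31^1, 31^0>, the code below computes
          //   result = ((h3 * 31^vf + h2) * 31^vf + h1) * 31^vf + h0
          // using three madd instructions with rscratch2 == 31^vf.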
 8349     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8350     __ addv(vmul3, Assembler::T4S, vmul3);
 8351     __ umov(result, vmul3, Assembler::S, 0);
 8352 
 8353     __ mov(rscratch2, intpow(31U, vf));
 8354 
 8355     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8356     __ addv(vmul2, Assembler::T4S, vmul2);
 8357     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8358     __ maddw(result, result, rscratch2, rscratch1);
 8359 
 8360     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8361     __ addv(vmul1, Assembler::T4S, vmul1);
 8362     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8363     __ maddw(result, result, rscratch2, rscratch1);
 8364 
 8365     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8366     __ addv(vmul0, Assembler::T4S, vmul0);
 8367     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8368     __ maddw(result, result, rscratch2, rscratch1);
 8369 
 8370     __ andr(rscratch2, cnt, vf - 1);
 8371     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8372 
 8373     __ leave();
 8374     __ ret(lr);
 8375 
 8376     return entry;
 8377   }
 8378 
 8379   address generate_dsin_dcos(bool isCos) {
 8380     __ align(CodeEntryAlignment);
 8381     StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id);
 8382     StubCodeMark mark(this, stub_id);
 8383     address start = __ pc();
 8384     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8385         (address)StubRoutines::aarch64::_two_over_pi,
 8386         (address)StubRoutines::aarch64::_pio2,
 8387         (address)StubRoutines::aarch64::_dsin_coef,
 8388         (address)StubRoutines::aarch64::_dcos_coef);
 8389     return start;
 8390   }
 8391 
 8392   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings.
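        // For example, zip1 of the Latin1 bytes 0x41 0x42 ... ('A', 'B', ...) with a
        // zero register yields the byte sequence 0x41 0x00 0x42 0x00 ..., i.e. the
        // same characters inflated to UTF-16LE, which can then be compared 64 bits
        // at a time against the UTF-16 string.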
 8393   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8394       Label &DIFF2) {
 8395     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8396     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8397 
 8398     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8399     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8400     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8401     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 8402 
 8403     __ fmovd(tmpL, vtmp3);
 8404     __ eor(rscratch2, tmp3, tmpL);
 8405     __ cbnz(rscratch2, DIFF2);
 8406 
 8407     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8408     __ umov(tmpL, vtmp3, __ D, 1);
 8409     __ eor(rscratch2, tmpU, tmpL);
 8410     __ cbnz(rscratch2, DIFF1);
 8411 
 8412     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8413     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8414     __ fmovd(tmpL, vtmp);
 8415     __ eor(rscratch2, tmp3, tmpL);
 8416     __ cbnz(rscratch2, DIFF2);
 8417 
 8418     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8419     __ umov(tmpL, vtmp, __ D, 1);
 8420     __ eor(rscratch2, tmpU, tmpL);
 8421     __ cbnz(rscratch2, DIFF1);
 8422   }
 8423 
 8424   // r0  = result
 8425   // r1  = str1
 8426   // r2  = cnt1
 8427   // r3  = str2
 8428   // r4  = cnt2
 8429   // r10 = tmp1
 8430   // r11 = tmp2
 8431   address generate_compare_long_string_different_encoding(bool isLU) {
 8432     __ align(CodeEntryAlignment);
 8433     StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id);
 8434     StubCodeMark mark(this, stub_id);
 8435     address entry = __ pc();
 8436     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8437         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8438         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8439     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8440         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8441     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8442     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8443 
 8444     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8445 
 8446     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
 8447     // cnt2 == number of characters left to compare
 8448     // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
 8449     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8450     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8451     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8452     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 8453     __ subw(cnt2, cnt2, 8); // 4 symbols already loaded; the last 4 are a special case.
 8454     __ eor(rscratch2, tmp1, tmp2);
 8455     __ mov(rscratch1, tmp2);
 8456     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8457     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8458              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8459     __ push(spilled_regs, sp);
 8460     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8461     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8462 
 8463     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8464 
 8465     if (SoftwarePrefetchHintDistance >= 0) {
 8466       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8467       __ br(__ LT, NO_PREFETCH);
 8468       __ bind(LARGE_LOOP_PREFETCH);
 8469         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8470         __ mov(tmp4, 2);
 8471         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8472         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8473           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8474           __ subs(tmp4, tmp4, 1);
 8475           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8476           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8477           __ mov(tmp4, 2);
 8478         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8479           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8480           __ subs(tmp4, tmp4, 1);
 8481           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8482           __ sub(cnt2, cnt2, 64);
 8483           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8484           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8485     }
 8486     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8487     __ bind(NO_PREFETCH);
 8488     __ subs(cnt2, cnt2, 16);
 8489     __ br(__ LT, TAIL);
 8490     __ align(OptoLoopAlignment);
 8491     __ bind(SMALL_LOOP); // smaller loop
 8492       __ subs(cnt2, cnt2, 16);
 8493       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8494       __ br(__ GE, SMALL_LOOP);
 8495       __ cmn(cnt2, (u1)16);
 8496       __ br(__ EQ, LOAD_LAST);
 8497     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8498       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8499       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8500       __ ldr(tmp3, Address(cnt1, -8));
 8501       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8502       __ b(LOAD_LAST);
 8503     __ bind(DIFF2);
 8504       __ mov(tmpU, tmp3);
 8505     __ bind(DIFF1);
 8506       __ pop(spilled_regs, sp);
 8507       __ b(CALCULATE_DIFFERENCE);
 8508     __ bind(LOAD_LAST);
 8509       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
 8510       // No need to load them again
 8511       __ mov(tmpU, tmp3);
 8512       __ pop(spilled_regs, sp);
 8513 
 8514       // tmp2 points to the address of the last 4 Latin1 characters right now
 8515       __ ldrs(vtmp, Address(tmp2));
 8516       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8517       __ fmovd(tmpL, vtmp);
 8518 
 8519       __ eor(rscratch2, tmpU, tmpL);
 8520       __ cbz(rscratch2, DONE);
 8521 
 8522     // Find the first different characters in the longwords and
 8523     // compute their difference.
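          // (rev + clz locates the least significant differing byte; rounding the bit
          // index down to a multiple of 16 gives the bit offset of the first differing
          // UTF-16 character. Shifting both values right by that amount and
          // zero-extending the low 16 bits isolates the two characters.)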
 8524     __ bind(CALCULATE_DIFFERENCE);
 8525       __ rev(rscratch2, rscratch2);
 8526       __ clz(rscratch2, rscratch2);
 8527       __ andr(rscratch2, rscratch2, -16);
 8528       __ lsrv(tmp1, tmp1, rscratch2);
 8529       __ uxthw(tmp1, tmp1);
 8530       __ lsrv(rscratch1, rscratch1, rscratch2);
 8531       __ uxthw(rscratch1, rscratch1);
 8532       __ subw(result, tmp1, rscratch1);
 8533     __ bind(DONE);
 8534       __ ret(lr);
 8535     return entry;
 8536   }
 8537 
 8538   // r0 = input (float16)
 8539   // v0 = result (float)
 8540   // v1 = temporary float register
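        //
        // For example, the float16 bit pattern 0x3C00 (sign 0, exponent 15, mantissa 0)
        // converts to 1.0f, and 0xC000 converts to -2.0f.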
 8541   address generate_float16ToFloat() {
 8542     __ align(CodeEntryAlignment);
 8543     StubGenStubId stub_id = StubGenStubId::hf2f_id;
 8544     StubCodeMark mark(this, stub_id);
 8545     address entry = __ pc();
 8546     BLOCK_COMMENT("Entry:");
 8547     __ flt16_to_flt(v0, r0, v1);
 8548     __ ret(lr);
 8549     return entry;
 8550   }
 8551 
 8552   // v0 = input (float)
 8553   // r0 = result (float16)
 8554   // v1 = temporary float register
 8555   address generate_floatToFloat16() {
 8556     __ align(CodeEntryAlignment);
 8557     StubGenStubId stub_id = StubGenStubId::f2hf_id;
 8558     StubCodeMark mark(this, stub_id);
 8559     address entry = __ pc();
 8560     BLOCK_COMMENT("Entry:");
 8561     __ flt_to_flt16(r0, v0, v1);
 8562     __ ret(lr);
 8563     return entry;
 8564   }
 8565 
 8566   address generate_method_entry_barrier() {
 8567     __ align(CodeEntryAlignment);
 8568     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
 8569     StubCodeMark mark(this, stub_id);
 8570 
 8571     Label deoptimize_label;
 8572 
 8573     address start = __ pc();
 8574 
 8575     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 8576 
 8577     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 8578       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 8579       // We can get here despite the nmethod being good, if we have not
 8580       // yet applied our cross modification fence (or data fence).
 8581       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 8582       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 8583       __ ldrw(rscratch2, rscratch2);
 8584       __ strw(rscratch2, thread_epoch_addr);
 8585       __ isb();
 8586       __ membar(__ LoadLoad);
 8587     }
 8588 
 8589     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 8590 
 8591     __ enter();
 8592     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 8593 
 8594     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 8595 
 8596     __ push_call_clobbered_registers();
 8597 
 8598     __ mov(c_rarg0, rscratch2);
 8599     __ call_VM_leaf
 8600          (CAST_FROM_FN_PTR
 8601           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 8602 
 8603     __ reset_last_Java_frame(true);
 8604 
 8605     __ mov(rscratch1, r0);
 8606 
 8607     __ pop_call_clobbered_registers();
 8608 
 8609     __ cbnz(rscratch1, deoptimize_label);
 8610 
 8611     __ leave();
 8612     __ ret(lr);
 8613 
 8614     __ BIND(deoptimize_label);
 8615 
 8616     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 8617     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 8618 
 8619     __ mov(sp, rscratch1);
 8620     __ br(rscratch2);
 8621 
 8622     return start;
 8623   }
 8624 
 8625   // r0  = result
 8626   // r1  = str1
 8627   // r2  = cnt1
 8628   // r3  = str2
 8629   // r4  = cnt2
 8630   // r10 = tmp1
 8631   // r11 = tmp2
 8632   address generate_compare_long_string_same_encoding(bool isLL) {
 8633     __ align(CodeEntryAlignment);
 8634     StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id);
 8635     StubCodeMark mark(this, stub_id);
 8636     address entry = __ pc();
 8637     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8638         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 8639 
 8640     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 8641 
 8642     // exit from the large loop when fewer than 64 bytes are left to read or we are
 8643     // about to prefetch memory beyond the array border
 8644     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 8645 
 8646     // 8 bytes from each string are already pre-loaded before jumping to the stub, so compare them directly
 8647     __ eor(rscratch2, tmp1, tmp2);
 8648     __ cbnz(rscratch2, CAL_DIFFERENCE);
 8649 
 8650     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 8651     // update pointers, because of previous read
 8652     __ add(str1, str1, wordSize);
 8653     __ add(str2, str2, wordSize);
 8654     if (SoftwarePrefetchHintDistance >= 0) {
 8655       __ align(OptoLoopAlignment);
 8656       __ bind(LARGE_LOOP_PREFETCH);
 8657         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 8658         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 8659 
 8660         for (int i = 0; i < 4; i++) {
 8661           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 8662           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 8663           __ cmp(tmp1, tmp2);
 8664           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 8665           __ br(Assembler::NE, DIFF);
 8666         }
 8667         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 8668         __ add(str1, str1, 64);
 8669         __ add(str2, str2, 64);
 8670         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 8671         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 8672         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 8673     }
 8674 
 8675     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 8676     __ br(Assembler::LE, LESS16);
 8677     __ align(OptoLoopAlignment);
 8678     __ bind(LOOP_COMPARE16);
 8679       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 8680       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 8681       __ cmp(tmp1, tmp2);
 8682       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 8683       __ br(Assembler::NE, DIFF);
 8684       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 8685       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 8686       __ br(Assembler::LT, LESS16);
 8687 
 8688       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 8689       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 8690       __ cmp(tmp1, tmp2);
 8691       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 8692       __ br(Assembler::NE, DIFF);
 8693       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 8694       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 8695       __ br(Assembler::GE, LOOP_COMPARE16);
 8696       __ cbz(cnt2, LENGTH_DIFF);
 8697 
 8698     __ bind(LESS16);
 8699       // compare the next 8 bytes (8 Latin1 or 4 UTF-16 characters)
 8700       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 8701       __ br(Assembler::LE, LESS8);
 8702       __ ldr(tmp1, Address(__ post(str1, 8)));
 8703       __ ldr(tmp2, Address(__ post(str2, 8)));
 8704       __ eor(rscratch2, tmp1, tmp2);
 8705       __ cbnz(rscratch2, CAL_DIFFERENCE);
 8706       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 8707 
 8708     __ bind(LESS8); // directly load last 8 bytes
 8709       if (!isLL) {
 8710         __ add(cnt2, cnt2, cnt2);
 8711       }
 8712       __ ldr(tmp1, Address(str1, cnt2));
 8713       __ ldr(tmp2, Address(str2, cnt2));
 8714       __ eor(rscratch2, tmp1, tmp2);
 8715       __ cbz(rscratch2, LENGTH_DIFF);
 8716       __ b(CAL_DIFFERENCE);
 8717 
 8718     __ bind(DIFF);
 8719       __ cmp(tmp1, tmp2);
 8720       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 8721       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 8722       // reuse rscratch2 register for the result of eor instruction
 8723       __ eor(rscratch2, tmp1, tmp2);
 8724 
 8725     __ bind(CAL_DIFFERENCE);
 8726       __ rev(rscratch2, rscratch2);
 8727       __ clz(rscratch2, rscratch2);
 8728       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 8729       __ lsrv(tmp1, tmp1, rscratch2);
 8730       __ lsrv(tmp2, tmp2, rscratch2);
 8731       if (isLL) {
 8732         __ uxtbw(tmp1, tmp1);
 8733         __ uxtbw(tmp2, tmp2);
 8734       } else {
 8735         __ uxthw(tmp1, tmp1);
 8736         __ uxthw(tmp2, tmp2);
 8737       }
 8738       __ subw(result, tmp1, tmp2);
 8739 
 8740     __ bind(LENGTH_DIFF);
 8741       __ ret(lr);
 8742     return entry;
 8743   }
 8744 
 8745   enum string_compare_mode {
 8746     LL,
 8747     LU,
 8748     UL,
 8749     UU,
 8750   };
 8751 
 8752   // The following registers are declared in aarch64.ad
 8753   // r0  = result
 8754   // r1  = str1
 8755   // r2  = cnt1
 8756   // r3  = str2
 8757   // r4  = cnt2
 8758   // r10 = tmp1
 8759   // r11 = tmp2
 8760   // z0  = ztmp1
 8761   // z1  = ztmp2
 8762   // p0  = pgtmp1
 8763   // p1  = pgtmp2
 8764   address generate_compare_long_string_sve(string_compare_mode mode) {
 8765     StubGenStubId stub_id;
 8766     switch (mode) {
 8767       case LL: stub_id = StubGenStubId::compare_long_string_LL_id;  break;
 8768       case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break;
 8769       case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break;
 8770       case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break;
 8771       default: ShouldNotReachHere();
 8772     }
 8773 
 8774     __ align(CodeEntryAlignment);
 8775     address entry = __ pc();
 8776     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8777              tmp1 = r10, tmp2 = r11;
 8778 
 8779     Label LOOP, DONE, MISMATCH;
 8780     Register vec_len = tmp1;
 8781     Register idx = tmp2;
 8782     // The minimum of the string lengths has been stored in cnt2.
 8783     Register cnt = cnt2;
 8784     FloatRegister ztmp1 = z0, ztmp2 = z1;
 8785     PRegister pgtmp1 = p0, pgtmp2 = p1;
 8786 
 8787 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 8788     switch (mode) {                                                            \
 8789       case LL:                                                                 \
 8790         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 8791         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 8792         break;                                                                 \
 8793       case LU:                                                                 \
 8794         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 8795         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 8796         break;                                                                 \
 8797       case UL:                                                                 \
 8798         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 8799         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 8800         break;                                                                 \
 8801       case UU:                                                                 \
 8802         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 8803         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 8804         break;                                                                 \
 8805       default:                                                                 \
 8806         ShouldNotReachHere();                                                  \
 8807     }
 8808 
 8809     StubCodeMark mark(this, stub_id);
 8810 
 8811     __ mov(idx, 0);
 8812     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 8813 
 8814     if (mode == LL) {
 8815       __ sve_cntb(vec_len);
 8816     } else {
 8817       __ sve_cnth(vec_len);
 8818     }
 8819 
 8820     __ sub(rscratch1, cnt, vec_len);
 8821 
 8822     __ bind(LOOP);
 8823 
 8824       // main loop
 8825       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 8826       __ add(idx, idx, vec_len);
 8827       // Compare strings.
 8828       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 8829       __ br(__ NE, MISMATCH);
 8830       __ cmp(idx, rscratch1);
 8831       __ br(__ LT, LOOP);
 8832 
 8833     // post loop, last iteration
 8834     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 8835 
 8836     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 8837     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 8838     __ br(__ EQ, DONE);
 8839 
 8840     __ bind(MISMATCH);
 8841 
 8842     // Crop the vector to find its location.
 8843     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 8844     // Extract the first different characters of each string.
 8845     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 8846     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 8847 
 8848     // Compute the difference of the first different characters.
 8849     __ sub(result, rscratch1, rscratch2);
 8850 
 8851     __ bind(DONE);
 8852     __ ret(lr);
 8853 #undef LOAD_PAIR
 8854     return entry;
 8855   }
 8856 
 8857   void generate_compare_long_strings() {
 8858     if (UseSVE == 0) {
 8859       StubRoutines::aarch64::_compare_long_string_LL
 8860           = generate_compare_long_string_same_encoding(true);
 8861       StubRoutines::aarch64::_compare_long_string_UU
 8862           = generate_compare_long_string_same_encoding(false);
 8863       StubRoutines::aarch64::_compare_long_string_LU
 8864           = generate_compare_long_string_different_encoding(true);
 8865       StubRoutines::aarch64::_compare_long_string_UL
 8866           = generate_compare_long_string_different_encoding(false);
 8867     } else {
 8868       StubRoutines::aarch64::_compare_long_string_LL
 8869           = generate_compare_long_string_sve(LL);
 8870       StubRoutines::aarch64::_compare_long_string_UU
 8871           = generate_compare_long_string_sve(UU);
 8872       StubRoutines::aarch64::_compare_long_string_LU
 8873           = generate_compare_long_string_sve(LU);
 8874       StubRoutines::aarch64::_compare_long_string_UL
 8875           = generate_compare_long_string_sve(UL);
 8876     }
 8877   }
 8878 
 8879   // R0 = result
 8880   // R1 = str2
 8881   // R2 = cnt1
 8882   // R3 = str1
 8883   // R4 = cnt2
 8884   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 8885   //
 8886   // This generic linear code uses a few additional ideas that make it faster:
 8887   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
 8888   // in order to skip the initial load (helps on systems with 1 load pipeline)
 8889   // 2) we can use the "fast" algorithm for finding the first occurrence of a single
 8890   // character with fewer branches (1 branch per loaded register instead of a
 8891   // branch per symbol); this is where constants like
 8892   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
 8893   // 3) after loading and analyzing the 1st register of the source string, it can be
 8894   // reused to search for every occurrence of the 1st character, saving a few loads
 8895   // compared with a "simpler-but-slower" implementation
 8896   // 4) in order to avoid lots of push/pop operations, the code below heavily
 8897   // re-uses/re-initializes/compresses register values, which makes the code
 8898   // larger and a bit less readable; however, most of the extra operations are
 8899   // issued during loads or branches, so the penalty is minimal
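        //
        // A minimal sketch of the SWAR "first character" test referred to in 2) above,
        // assuming Latin1 (byte) characters; the UTF-16 variant uses the
        // 0x0001...0001 / 0x7fff...7fff constants analogously (names are illustrative,
        // with pattern0 denoting the first pattern character):
        //
        //   uint64_t x = chunk ^ (pattern0 * 0x0101010101010101ULL); // 0x00 where a byte matches
        //   uint64_t m = (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
        //
        // m is non-zero iff some byte of chunk equals pattern0; the least significant
        // set 0x80 bit marks the first match. Higher bits may be false positives, which
        // is why every candidate is re-verified by the comparison loops below.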
 8900   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 8901     StubGenStubId stub_id;
 8902     if (str1_isL) {
 8903       if (str2_isL) {
 8904         stub_id = StubGenStubId::string_indexof_linear_ll_id;
 8905       } else {
 8906         stub_id = StubGenStubId::string_indexof_linear_ul_id;
 8907       }
 8908     } else {
 8909       if (str2_isL) {
 8910         ShouldNotReachHere();
 8911       } else {
 8912         stub_id = StubGenStubId::string_indexof_linear_uu_id;
 8913       }
 8914     }
 8915     __ align(CodeEntryAlignment);
 8916     StubCodeMark mark(this, stub_id);
 8917     address entry = __ pc();
 8918 
 8919     int str1_chr_size = str1_isL ? 1 : 2;
 8920     int str2_chr_size = str2_isL ? 1 : 2;
 8921     int str1_chr_shift = str1_isL ? 0 : 1;
 8922     int str2_chr_shift = str2_isL ? 0 : 1;
 8923     bool isL = str1_isL && str2_isL;
 8924     // parameters
 8925     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 8926     // temporary registers
 8927     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 8928     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 8929     // redefinitions
 8930     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 8931 
 8932     __ push(spilled_regs, sp);
 8933     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 8934         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 8935         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 8936         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 8937         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 8938         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 8939     // Read whole register from str1. It is safe, because length >=8 here
 8940     __ ldr(ch1, Address(str1));
 8941     // Read whole register from str2. It is safe, because length >=8 here
 8942     __ ldr(ch2, Address(str2));
 8943     __ sub(cnt2, cnt2, cnt1);
 8944     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 8945     if (str1_isL != str2_isL) {
 8946       __ eor(v0, __ T16B, v0, v0);
 8947     }
 8948     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 8949     __ mul(first, first, tmp1);
 8950     // check if we have less than 1 register to check
 8951     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 8952     if (str1_isL != str2_isL) {
 8953       __ fmovd(v1, ch1);
 8954     }
 8955     __ br(__ LE, L_SMALL);
 8956     __ eor(ch2, first, ch2);
 8957     if (str1_isL != str2_isL) {
 8958       __ zip1(v1, __ T16B, v1, v0);
 8959     }
 8960     __ sub(tmp2, ch2, tmp1);
 8961     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 8962     __ bics(tmp2, tmp2, ch2);
 8963     if (str1_isL != str2_isL) {
 8964       __ fmovd(ch1, v1);
 8965     }
 8966     __ br(__ NE, L_HAS_ZERO);
 8967     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 8968     __ add(result, result, wordSize/str2_chr_size);
 8969     __ add(str2, str2, wordSize);
 8970     __ br(__ LT, L_POST_LOOP);
 8971     __ BIND(L_LOOP);
 8972       __ ldr(ch2, Address(str2));
 8973       __ eor(ch2, first, ch2);
 8974       __ sub(tmp2, ch2, tmp1);
 8975       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 8976       __ bics(tmp2, tmp2, ch2);
 8977       __ br(__ NE, L_HAS_ZERO);
 8978     __ BIND(L_LOOP_PROCEED);
 8979       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 8980       __ add(str2, str2, wordSize);
 8981       __ add(result, result, wordSize/str2_chr_size);
 8982       __ br(__ GE, L_LOOP);
 8983     __ BIND(L_POST_LOOP);
 8984       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 8985       __ br(__ LE, NOMATCH);
 8986       __ ldr(ch2, Address(str2));
 8987       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 8988       __ eor(ch2, first, ch2);
 8989       __ sub(tmp2, ch2, tmp1);
 8990       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 8991       __ mov(tmp4, -1); // all bits set
 8992       __ b(L_SMALL_PROCEED);
 8993     __ align(OptoLoopAlignment);
 8994     __ BIND(L_SMALL);
 8995       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 8996       __ eor(ch2, first, ch2);
 8997       if (str1_isL != str2_isL) {
 8998         __ zip1(v1, __ T16B, v1, v0);
 8999       }
 9000       __ sub(tmp2, ch2, tmp1);
 9001       __ mov(tmp4, -1); // all bits set
 9002       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9003       if (str1_isL != str2_isL) {
 9004         __ fmovd(ch1, v1); // move converted 4 symbols
 9005       }
 9006     __ BIND(L_SMALL_PROCEED);
 9007       __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the unused bit positions
 9008       __ bic(tmp2, tmp2, ch2);
 9009       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9010       __ rbit(tmp2, tmp2);
 9011       __ br(__ EQ, NOMATCH);
 9012     __ BIND(L_SMALL_HAS_ZERO_LOOP);
 9013       __ clz(tmp4, tmp2); // potentially long; up to 4 cycles on some CPUs
 9014       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9015       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9016       if (str2_isL) { // LL
 9017         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9018         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9019         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9020         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9021         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9022       } else {
 9023         __ mov(ch2, 0xE); // mask to keep an even (char-aligned) byte offset
 9024         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9025         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9026         __ lslv(tmp2, tmp2, tmp4);
 9027         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9028         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9029         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9030         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9031       }
 9032       __ cmp(ch1, ch2);
 9033       __ mov(tmp4, wordSize/str2_chr_size);
 9034       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9035     __ BIND(L_SMALL_CMP_LOOP);
 9036       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9037                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9038       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9039                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9040       __ add(tmp4, tmp4, 1);
 9041       __ cmp(tmp4, cnt1);
 9042       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9043       __ cmp(first, ch2);
 9044       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9045     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9046       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9047       __ clz(tmp4, tmp2);
 9048       __ add(result, result, 1); // advance index
 9049       __ add(str2, str2, str2_chr_size); // advance pointer
 9050       __ b(L_SMALL_HAS_ZERO_LOOP);
 9051     __ align(OptoLoopAlignment);
 9052     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9053       __ cmp(first, ch2);
 9054       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9055       __ b(DONE);
 9056     __ align(OptoLoopAlignment);
 9057     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9058       if (str2_isL) { // LL
 9059         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9060         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9061         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9062         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9063         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9064       } else {
 9065         __ mov(ch2, 0xE); // mask to keep an even (char-aligned) byte offset
 9066         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9067         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9068         __ lslv(tmp2, tmp2, tmp4);
 9069         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9070         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9071         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9072         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9073       }
 9074       __ cmp(ch1, ch2);
 9075       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9076       __ b(DONE);
 9077     __ align(OptoLoopAlignment);
 9078     __ BIND(L_HAS_ZERO);
 9079       __ rbit(tmp2, tmp2);
 9080       __ clz(tmp4, tmp2); // potentially long; up to 4 cycles on some CPUs
 9081       // Now compress both counters (cnt2 and cnt1) into one register. This is
 9082       // fine because both counters are 32-bit and are not changed in this
 9083       // loop; they are just restored on exit. So cnt1 can be re-used in this loop.
 9084       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 9085       __ sub(result, result, 1);
 9086     __ BIND(L_HAS_ZERO_LOOP);
 9087       __ mov(cnt1, wordSize/str2_chr_size);
 9088       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9089       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9090       if (str2_isL) {
 9091         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9092         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9093         __ lslv(tmp2, tmp2, tmp4);
 9094         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9095         __ add(tmp4, tmp4, 1);
 9096         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9097         __ lsl(tmp2, tmp2, 1);
 9098         __ mov(tmp4, wordSize/str2_chr_size);
 9099       } else {
 9100         __ mov(ch2, 0xE);
 9101         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9102         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9103         __ lslv(tmp2, tmp2, tmp4);
 9104         __ add(tmp4, tmp4, 1);
 9105         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9106         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9107         __ lsl(tmp2, tmp2, 1);
 9108         __ mov(tmp4, wordSize/str2_chr_size);
 9109         __ sub(str2, str2, str2_chr_size);
 9110       }
 9111       __ cmp(ch1, ch2);
 9112       __ mov(tmp4, wordSize/str2_chr_size);
 9113       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9114     __ BIND(L_CMP_LOOP);
 9115       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9116                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9117       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9118                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9119       __ add(tmp4, tmp4, 1);
 9120       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9121       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9122       __ cmp(cnt1, ch2);
 9123       __ br(__ EQ, L_CMP_LOOP);
 9124     __ BIND(L_CMP_LOOP_NOMATCH);
 9125       // we have a mismatch here
 9126       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9127       __ clz(tmp4, tmp2);
 9128       __ add(str2, str2, str2_chr_size); // advance pointer
 9129       __ b(L_HAS_ZERO_LOOP);
 9130     __ align(OptoLoopAlignment);
 9131     __ BIND(L_CMP_LOOP_LAST_CMP);
 9132       __ cmp(cnt1, ch2);
 9133       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9134       __ b(DONE);
 9135     __ align(OptoLoopAlignment);
 9136     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9137       if (str2_isL) {
 9138         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9139         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9140         __ lslv(tmp2, tmp2, tmp4);
 9141         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9142         __ add(tmp4, tmp4, 1);
 9143         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9144         __ lsl(tmp2, tmp2, 1);
 9145       } else {
 9146         __ mov(ch2, 0xE);
 9147         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9148         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9149         __ lslv(tmp2, tmp2, tmp4);
 9150         __ add(tmp4, tmp4, 1);
 9151         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9152         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9153         __ lsl(tmp2, tmp2, 1);
 9154         __ sub(str2, str2, str2_chr_size);
 9155       }
 9156       __ cmp(ch1, ch2);
 9157       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9158       __ b(DONE);
 9159     __ align(OptoLoopAlignment);
 9160     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
 9161       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until
 9162       // the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
 9163       // so result was increased by at most wordSize/str2_chr_size - 1 and the
 9164       // respective high bits weren't changed. L_LOOP_PROCEED will increase
 9165       // result by the number of analyzed characters, so we can just reset the lower
 9166       // bits of result here: clear the 2 lower bits for UU/UL and 3 bits for LL.
 9167       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
 9168       // 3) Advance str2 to the next str2 octet. result & 7 (LL) or & 3 (UU/UL) is the
 9169       // index of the last analyzed substring inside the current octet, so str2 is at
 9170       // the respective start address; we need to advance it to the next octet.
 9171       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9172       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9173       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9174       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9175       __ movw(cnt2, cnt2);
 9176       __ b(L_LOOP_PROCEED);
 9177     __ align(OptoLoopAlignment);
 9178     __ BIND(NOMATCH);
 9179       __ mov(result, -1);
 9180     __ BIND(DONE);
 9181       __ pop(spilled_regs, sp);
 9182       __ ret(lr);
 9183     return entry;
 9184   }
 9185 
 9186   void generate_string_indexof_stubs() {
 9187     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9188     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9189     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9190   }
 9191 
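        // Helper for the inflate stub below: zip1/zip2 against the zero register v0
        // interleave each Latin1 byte with a zero byte, inflating the 16 Latin1
        // characters in src1 (and the 16 in src2) to UTF-16LE, and the 64 resulting
        // bytes are written with a single st1 of four vector registers.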
 9192   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9193       FloatRegister src1, FloatRegister src2) {
 9194     Register dst = r1;
 9195     __ zip1(v1, __ T16B, src1, v0);
 9196     __ zip2(v2, __ T16B, src1, v0);
 9197     if (generatePrfm) {
 9198       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9199     }
 9200     __ zip1(v3, __ T16B, src2, v0);
 9201     __ zip2(v4, __ T16B, src2, v0);
 9202     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9203   }
 9204 
 9205   // R0 = src
 9206   // R1 = dst
 9207   // R2 = len
 9208   // R3 = len >> 3
 9209   // V0 = 0
 9210   // v1 = loaded 8 bytes
 9211   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 9212   address generate_large_byte_array_inflate() {
 9213     __ align(CodeEntryAlignment);
 9214     StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id;
 9215     StubCodeMark mark(this, stub_id);
 9216     address entry = __ pc();
 9217     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9218     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9219     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9220 
 9221     // do one more 8-byte read so that the address is 16-byte aligned in most cases;
 9222     // this also lets us use a single store instruction
 9223     __ ldrd(v2, __ post(src, 8));
 9224     __ sub(octetCounter, octetCounter, 2);
 9225     __ zip1(v1, __ T16B, v1, v0);
 9226     __ zip1(v2, __ T16B, v2, v0);
 9227     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9228     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9229     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9230     __ br(__ LE, LOOP_START);
 9231     __ b(LOOP_PRFM_START);
 9232     __ bind(LOOP_PRFM);
 9233       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9234     __ bind(LOOP_PRFM_START);
 9235       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9236       __ sub(octetCounter, octetCounter, 8);
 9237       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9238       inflate_and_store_2_fp_registers(true, v3, v4);
 9239       inflate_and_store_2_fp_registers(true, v5, v6);
 9240       __ br(__ GT, LOOP_PRFM);
 9241       __ cmp(octetCounter, (u1)8);
 9242       __ br(__ LT, DONE);
 9243     __ bind(LOOP);
 9244       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9245       __ bind(LOOP_START);
 9246       __ sub(octetCounter, octetCounter, 8);
 9247       __ cmp(octetCounter, (u1)8);
 9248       inflate_and_store_2_fp_registers(false, v3, v4);
 9249       inflate_and_store_2_fp_registers(false, v5, v6);
 9250       __ br(__ GE, LOOP);
 9251     __ bind(DONE);
 9252       __ ret(lr);
 9253     return entry;
 9254   }
 9255 
 9256   /**
 9257    *  Arguments:
 9258    *
 9259    *  Input:
 9260    *  c_rarg0   - current state address
 9261    *  c_rarg1   - H key address
 9262    *  c_rarg2   - data address
 9263    *  c_rarg3   - number of blocks
 9264    *
 9265    *  Output:
 9266    *  Updated state at c_rarg0
 9267    */
 9268   address generate_ghash_processBlocks() {
 9269     // Bafflingly, GCM uses little-endian for the byte order, but
 9270     // big-endian for the bit order.  For example, the polynomial 1 is
 9271     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9272     //
 9273     // So, we must either reverse the bytes in each word and do
 9274     // everything big-endian or reverse the bits in each byte and do
 9275     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9276     // the bits in each byte (we have an instruction, RBIT, to do
 9277     // that) and keep the data in little-endian bit order through the
 9278     // calculation, bit-reversing the inputs and outputs.
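          //
          // For reference, GCM works in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1;
          // the reduction constant emitted below, 0x87 = 0b10000111, encodes the
          // low-order terms x^7 + x^2 + x + 1, replicated into both 64-bit halves
          // of a 128-bit vector.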
 9279 
 9280     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
 9281     StubCodeMark mark(this, stub_id);
 9282     __ align(wordSize * 2);
 9283     address p = __ pc();
 9284     __ emit_int64(0x87);  // The low-order bits of the field
 9285                           // polynomial (i.e. p = z^7+z^2+z+1)
 9286                           // repeated in the low and high parts of a
 9287                           // 128-bit vector
 9288     __ emit_int64(0x87);
 9289 
 9290     __ align(CodeEntryAlignment);
 9291     address start = __ pc();
 9292 
 9293     Register state   = c_rarg0;
 9294     Register subkeyH = c_rarg1;
 9295     Register data    = c_rarg2;
 9296     Register blocks  = c_rarg3;
 9297 
 9298     FloatRegister vzr = v30;
 9299     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9300 
 9301     __ ldrq(v24, p);    // The field polynomial
 9302 
 9303     __ ldrq(v0, Address(state));
 9304     __ ldrq(v1, Address(subkeyH));
 9305 
 9306     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9307     __ rbit(v0, __ T16B, v0);
 9308     __ rev64(v1, __ T16B, v1);
 9309     __ rbit(v1, __ T16B, v1);
 9310 
 9311     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
 9312     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 9313 
 9314     {
 9315       Label L_ghash_loop;
 9316       __ bind(L_ghash_loop);
 9317 
 9318       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9319                                                  // reversing each byte
 9320       __ rbit(v2, __ T16B, v2);
 9321       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9322 
 9323       // Multiply state in v2 by subkey in v1
 9324       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9325                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9326                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9327       // Reduce v7:v5 by the field polynomial
 9328       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9329 
 9330       __ sub(blocks, blocks, 1);
 9331       __ cbnz(blocks, L_ghash_loop);
 9332     }
 9333 
 9334     // The bit-reversed result is at this point in v0
 9335     __ rev64(v0, __ T16B, v0);
 9336     __ rbit(v0, __ T16B, v0);
 9337 
 9338     __ st1(v0, __ T16B, state);
 9339     __ ret(lr);
 9340 
 9341     return start;
 9342   }
 9343 
 9344   address generate_ghash_processBlocks_wide() {
 9345     address small = generate_ghash_processBlocks();
 9346 
 9347     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id;
 9348     StubCodeMark mark(this, stub_id);
 9349     __ align(wordSize * 2);
 9350     address p = __ pc();
 9351     __ emit_int64(0x87);  // The low-order bits of the field
 9352                           // polynomial (i.e. p = z^7+z^2+z+1)
 9353                           // repeated in the low and high parts of a
 9354                           // 128-bit vector
 9355     __ emit_int64(0x87);
 9356 
 9357     __ align(CodeEntryAlignment);
 9358     address start = __ pc();
 9359 
 9360     Register state   = c_rarg0;
 9361     Register subkeyH = c_rarg1;
 9362     Register data    = c_rarg2;
 9363     Register blocks  = c_rarg3;
 9364 
 9365     const int unroll = 4;
 9366 
 9367     __ cmp(blocks, (unsigned char)(unroll * 2));
 9368     __ br(__ LT, small);
 9369 
 9370     if (unroll > 1) {
 9371       // Save state before entering routine
 9372       __ sub(sp, sp, 4 * 16);
 9373       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9374       __ sub(sp, sp, 4 * 16);
 9375       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9376     }
 9377 
 9378     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
 9379 
 9380     if (unroll > 1) {
 9381       // And restore state
 9382       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9383       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9384     }
 9385 
 9386     __ cmp(blocks, (unsigned char)0);
 9387     __ br(__ GT, small);
 9388 
 9389     __ ret(lr);
 9390 
 9391     return start;
 9392   }
 9393 
 9394   void generate_base64_encode_simdround(Register src, Register dst,
 9395         FloatRegister codec, u8 size) {
 9396 
 9397     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9398     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9399     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 9400 
 9401     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9402 
 9403     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 9404 
 9405     __ ushr(ind0, arrangement, in0,  2);
 9406 
 9407     __ ushr(ind1, arrangement, in1,  2);
 9408     __ shl(in0,   arrangement, in0,  6);
 9409     __ orr(ind1,  arrangement, ind1, in0);
 9410     __ ushr(ind1, arrangement, ind1, 2);
 9411 
 9412     __ ushr(ind2, arrangement, in2,  4);
 9413     __ shl(in1,   arrangement, in1,  4);
 9414     __ orr(ind2,  arrangement, in1,  ind2);
 9415     __ ushr(ind2, arrangement, ind2, 2);
 9416 
 9417     __ shl(ind3,  arrangement, in2,  2);
 9418     __ ushr(ind3, arrangement, ind3, 2);
 9419 
 9420     __ tbl(out0,  arrangement, codec,  4, ind0);
 9421     __ tbl(out1,  arrangement, codec,  4, ind1);
 9422     __ tbl(out2,  arrangement, codec,  4, ind2);
 9423     __ tbl(out3,  arrangement, codec,  4, ind3);
 9424 
 9425     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9426   }
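
        // Worked example of the 3-byte -> 4-sextet split performed by the SIMD round
        // above (and by the scalar Process3B tail further below): the input "abc"
        // (0x61 0x62 0x63) forms the 24-bit group 011000 010110 001001 100011,
        // i.e. indices 24, 22, 9, 35, which the standard codec maps to "YWJj".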
 9427 
 9428    /**
 9429    *  Arguments:
 9430    *
 9431    *  Input:
 9432    *  c_rarg0   - src_start
 9433    *  c_rarg1   - src_offset
 9434    *  c_rarg2   - src_length
 9435    *  c_rarg3   - dest_start
 9436    *  c_rarg4   - dest_offset
 9437    *  c_rarg5   - isURL
 9438    *
 9439    */
 9440   address generate_base64_encodeBlock() {
 9441 
 9442     static const char toBase64[64] = {
 9443       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9444       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9445       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9446       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9447       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9448     };
 9449 
 9450     static const char toBase64URL[64] = {
 9451       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9452       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9453       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9454       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9455       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9456     };
 9457 
 9458     __ align(CodeEntryAlignment);
 9459     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
 9460     StubCodeMark mark(this, stub_id);
 9461     address start = __ pc();
 9462 
 9463     Register src   = c_rarg0;  // source array
 9464     Register soff  = c_rarg1;  // source start offset
 9465     Register send  = c_rarg2;  // source end offset
 9466     Register dst   = c_rarg3;  // dest array
 9467     Register doff  = c_rarg4;  // position for writing to dest array
 9468     Register isURL = c_rarg5;  // Base64 or URL character set
 9469 
 9470     // c_rarg6 and c_rarg7 are free to use as temps
 9471     Register codec  = c_rarg6;
 9472     Register length = c_rarg7;
 9473 
 9474     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9475 
 9476     __ add(src, src, soff);
 9477     __ add(dst, dst, doff);
 9478     __ sub(length, send, soff);
 9479 
 9480     // load the codec base address
 9481     __ lea(codec, ExternalAddress((address) toBase64));
 9482     __ cbz(isURL, ProcessData);
 9483     __ lea(codec, ExternalAddress((address) toBase64URL));
 9484 
 9485     __ BIND(ProcessData);
 9486 
    // too short to form a SIMD loop; fall back to the scalar 3-byte loop
 9488     __ cmp(length, (u1)24);
 9489     __ br(Assembler::LT, Process3B);
 9490 
 9491     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9492 
 9493     __ BIND(Process48B);
 9494     __ cmp(length, (u1)48);
 9495     __ br(Assembler::LT, Process24B);
 9496     generate_base64_encode_simdround(src, dst, v0, 16);
 9497     __ sub(length, length, 48);
 9498     __ b(Process48B);
 9499 
 9500     __ BIND(Process24B);
 9501     __ cmp(length, (u1)24);
 9502     __ br(Assembler::LT, SIMDExit);
 9503     generate_base64_encode_simdround(src, dst, v0, 8);
 9504     __ sub(length, length, 24);
 9505 
 9506     __ BIND(SIMDExit);
 9507     __ cbz(length, Exit);
 9508 
 9509     __ BIND(Process3B);
 9510     //  3 src bytes, 24 bits
 9511     __ ldrb(r10, __ post(src, 1));
 9512     __ ldrb(r11, __ post(src, 1));
 9513     __ ldrb(r12, __ post(src, 1));
 9514     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9515     __ orrw(r12, r12, r11, Assembler::LSL, 8);
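    // r12 now holds the 24-bit group (b0 << 16) | (b1 << 8) | b2;
    // the four 6-bit codec indices are extracted from it below.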
 9516     // codec index
 9517     __ ubfmw(r15, r12, 18, 23);
 9518     __ ubfmw(r14, r12, 12, 17);
 9519     __ ubfmw(r13, r12, 6,  11);
 9520     __ andw(r12,  r12, 63);
 9521     // get the code based on the codec
 9522     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9523     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9524     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9525     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9526     __ strb(r15, __ post(dst, 1));
 9527     __ strb(r14, __ post(dst, 1));
 9528     __ strb(r13, __ post(dst, 1));
 9529     __ strb(r12, __ post(dst, 1));
 9530     __ sub(length, length, 3);
 9531     __ cbnz(length, Process3B);
 9532 
 9533     __ BIND(Exit);
 9534     __ ret(lr);
 9535 
 9536     return start;
 9537   }
 9538 
 9539   void generate_base64_decode_simdround(Register src, Register dst,
 9540         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9541 
 9542     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9543     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9544 
 9545     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9546     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9547 
 9548     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9549 
 9550     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9551 
 9552     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9553 
    // We need an unsigned saturating subtract here, so that every input value
    // in the range [0, 63] saturates to index 0, whose higher-half table entry is 0.
 9556     __ uqsubv(decH0, __ T16B, in0, v27);
 9557     __ uqsubv(decH1, __ T16B, in1, v27);
 9558     __ uqsubv(decH2, __ T16B, in2, v27);
 9559     __ uqsubv(decH3, __ T16B, in3, v27);
 9560 
 9561     // lower half lookup
 9562     __ tbl(decL0, arrangement, codecL, 4, in0);
 9563     __ tbl(decL1, arrangement, codecL, 4, in1);
 9564     __ tbl(decL2, arrangement, codecL, 4, in2);
 9565     __ tbl(decL3, arrangement, codecL, 4, in3);
 9566 
 9567     // higher half lookup
 9568     __ tbx(decH0, arrangement, codecH, 4, decH0);
 9569     __ tbx(decH1, arrangement, codecH, 4, decH1);
 9570     __ tbx(decH2, arrangement, codecH, 4, decH2);
 9571     __ tbx(decH3, arrangement, codecH, 4, decH3);
 9572 
 9573     // combine lower and higher
 9574     __ orr(decL0, arrangement, decL0, decH0);
 9575     __ orr(decL1, arrangement, decL1, decH1);
 9576     __ orr(decL2, arrangement, decL2, decH2);
 9577     __ orr(decL3, arrangement, decL3, decH3);
 9578 
    // Check for illegal inputs: any decoded value larger than 63 (the maximum of 6 bits).
 9580     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 9581     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 9582     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 9583     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 9584     __ orr(in0, arrangement, decH0, decH1);
 9585     __ orr(in1, arrangement, decH2, decH3);
 9586     __ orr(in2, arrangement, in0,   in1);
 9587     __ umaxv(in3, arrangement, in2);
 9588     __ umov(rscratch2, in3, __ B, 0);
 9589 
 9590     // get the data to output
 9591     __ shl(out0,  arrangement, decL0, 2);
 9592     __ ushr(out1, arrangement, decL1, 4);
 9593     __ orr(out0,  arrangement, out0,  out1);
 9594     __ shl(out1,  arrangement, decL1, 4);
 9595     __ ushr(out2, arrangement, decL2, 2);
 9596     __ orr(out1,  arrangement, out1,  out2);
 9597     __ shl(out2,  arrangement, decL2, 6);
 9598     __ orr(out2,  arrangement, out2,  decL3);
 9599 
 9600     __ cbz(rscratch2, NoIllegalData);
 9601 
 9602     // handle illegal input
 9603     __ umov(r10, in2, __ D, 0);
 9604     if (size == 16) {
 9605       __ cbnz(r10, ErrorInLowerHalf);
 9606 
 9607       // illegal input is in higher half, store the lower half now.
 9608       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 9609 
 9610       __ umov(r10, in2,  __ D, 1);
 9611       __ umov(r11, out0, __ D, 1);
 9612       __ umov(r12, out1, __ D, 1);
 9613       __ umov(r13, out2, __ D, 1);
 9614       __ b(StoreLegalData);
 9615 
 9616       __ BIND(ErrorInLowerHalf);
 9617     }
 9618     __ umov(r11, out0, __ D, 0);
 9619     __ umov(r12, out1, __ D, 0);
 9620     __ umov(r13, out2, __ D, 0);
 9621 
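    // Store the decoded output three bytes (one input group) at a time,
    // stepping through the per-group error mask in r10 one byte per
    // iteration and stopping at the first group flagged 0xff (illegal input).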
 9622     __ BIND(StoreLegalData);
 9623     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
 9624     __ strb(r11, __ post(dst, 1));
 9625     __ strb(r12, __ post(dst, 1));
 9626     __ strb(r13, __ post(dst, 1));
 9627     __ lsr(r10, r10, 8);
 9628     __ lsr(r11, r11, 8);
 9629     __ lsr(r12, r12, 8);
 9630     __ lsr(r13, r13, 8);
 9631     __ b(StoreLegalData);
 9632 
 9633     __ BIND(NoIllegalData);
 9634     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
 9635   }
 9636 
 9637 
  /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - src_start
   *  c_rarg1   - src_offset
   *  c_rarg2   - src_end (exclusive end offset into the source array)
   *  c_rarg3   - dest_start
   *  c_rarg4   - dest_offset
   *  c_rarg5   - isURL
   *  c_rarg6   - isMIME
   *
   */
 9651   address generate_base64_decodeBlock() {
 9652 
    // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
    // at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the
    // section titled "Base64 decoding".
 9656 
    // The non-SIMD lookup tables are mostly copied from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] == -2,
    // while fromBase(URL)64ForNoSIMD['='] == 255 here.
 9660     static const uint8_t fromBase64ForNoSIMD[256] = {
 9661       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9662       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9663       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 9664        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9665       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 9666        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
 9667       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 9668        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 9669       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9670       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9671       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9672       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9673       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9674       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9675       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9676       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9677     };
 9678 
 9679     static const uint8_t fromBase64URLForNoSIMD[256] = {
 9680       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9681       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9682       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 9683        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9684       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 9685        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
 9686       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 9687        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 9688       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9689       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9690       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9691       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9692       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9693       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9694       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9695       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9696     };
 9697 
    // A legal Base64 code value is in the range [0, 127].  We need two table
    // lookups with tbl/tbx and combine the results to get the decoded data.
    // The 1st table vector lookup uses tbl, so out-of-range indices produce 0
    // in the destination. The 2nd table vector lookup uses tbx, so out-of-range
    // indices leave the destination unchanged. Inputs [64..126] are mapped to
    // table indices [65, 127] in the second lookup. The entry at index 64 is
    // set to 0, so that we know the decoded data already came from the 1st lookup.
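    // For example, decoding 'a' (0x61 == 97): the 1st lookup index 97 is out of
    // range for tbl and contributes 0; uqsub(97, 63) == 34, and the 2nd (tbx)
    // lookup selects entry 64 + 34 == 98 of the table below, i.e. 26, the
    // Base64 value of 'a'.  For '+' (43) the 1st lookup already yields 62, and
    // the saturated index 0 selects the 0 entry at position 64, so the OR of
    // the two halves leaves 62 unchanged.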
 9705     static const uint8_t fromBase64ForSIMD[128] = {
 9706       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9707       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9708       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 9709        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9710         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 9711        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 9712       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 9713        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 9714     };
 9715 
 9716     static const uint8_t fromBase64URLForSIMD[128] = {
 9717       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9718       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9719       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 9720        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9721         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 9722        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 9723        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 9724        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 9725     };
 9726 
 9727     __ align(CodeEntryAlignment);
 9728     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
 9729     StubCodeMark mark(this, stub_id);
 9730     address start = __ pc();
 9731 
 9732     Register src    = c_rarg0;  // source array
 9733     Register soff   = c_rarg1;  // source start offset
 9734     Register send   = c_rarg2;  // source end offset
 9735     Register dst    = c_rarg3;  // dest array
 9736     Register doff   = c_rarg4;  // position for writing to dest array
 9737     Register isURL  = c_rarg5;  // Base64 or URL character set
 9738     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
 9739 
 9740     Register length = send;    // reuse send as length of source data to process
 9741 
 9742     Register simd_codec   = c_rarg6;
 9743     Register nosimd_codec = c_rarg7;
 9744 
 9745     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
 9746 
 9747     __ enter();
 9748 
 9749     __ add(src, src, soff);
 9750     __ add(dst, dst, doff);
 9751 
 9752     __ mov(doff, dst);
 9753 
 9754     __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1);  // clear the two low bits: round length down to a multiple of 4
 9756 
 9757     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
 9758     __ cbz(isURL, ProcessData);
 9759     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
 9760 
 9761     __ BIND(ProcessData);
 9762     __ mov(rscratch1, length);
 9763     __ cmp(length, (u1)144); // 144 = 80 + 64
 9764     __ br(Assembler::LT, Process4B);
 9765 
 9766     // In the MIME case, the line length cannot be more than 76
 9767     // bytes (see RFC 2045). This is too short a block for SIMD
 9768     // to be worthwhile, so we use non-SIMD here.
 9769     __ movw(rscratch1, 79);
 9770 
 9771     __ BIND(Process4B);
 9772     __ ldrw(r14, __ post(src, 4));
 9773     __ ubfxw(r10, r14, 0,  8);
 9774     __ ubfxw(r11, r14, 8,  8);
 9775     __ ubfxw(r12, r14, 16, 8);
 9776     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
 9778     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
 9779     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
 9780     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
 9781     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
 9782     // error detection, 255u indicates an illegal input
 9783     __ orrw(r14, r10, r11);
 9784     __ orrw(r15, r12, r13);
 9785     __ orrw(r14, r14, r15);
 9786     __ tbnz(r14, 7, Exit);
 9787     // recover the data
 9788     __ lslw(r14, r10, 10);
 9789     __ bfiw(r14, r11, 4, 6);
 9790     __ bfmw(r14, r12, 2, 5);
 9791     __ rev16w(r14, r14);
 9792     __ bfiw(r13, r12, 6, 2);
 9793     __ strh(r14, __ post(dst, 2));
 9794     __ strb(r13, __ post(dst, 1));
 9795     // non-simd loop
 9796     __ subsw(rscratch1, rscratch1, 4);
 9797     __ br(Assembler::GT, Process4B);
 9798 
    // If we are exiting from the 80-byte pre-processing pass above (the
    // length >= 144 case), rscratch1 == -1; otherwise, rscratch1 == 0.
 9801     __ cbzw(rscratch1, Exit);
 9802     __ sub(length, length, 80);
 9803 
 9804     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
 9805     __ cbz(isURL, SIMDEnter);
 9806     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
 9807 
 9808     __ BIND(SIMDEnter);
 9809     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
 9810     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
 9811     __ mov(rscratch1, 63);
 9812     __ dup(v27, __ T16B, rscratch1);
 9813 
 9814     __ BIND(Process64B);
 9815     __ cmp(length, (u1)64);
 9816     __ br(Assembler::LT, Process32B);
 9817     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
 9818     __ sub(length, length, 64);
 9819     __ b(Process64B);
 9820 
 9821     __ BIND(Process32B);
 9822     __ cmp(length, (u1)32);
 9823     __ br(Assembler::LT, SIMDExit);
 9824     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
 9825     __ sub(length, length, 32);
 9826     __ b(Process32B);
 9827 
 9828     __ BIND(SIMDExit);
 9829     __ cbz(length, Exit);
 9830     __ movw(rscratch1, length);
 9831     __ b(Process4B);
 9832 
 9833     __ BIND(Exit);
 9834     __ sub(c_rarg0, dst, doff);
 9835 
 9836     __ leave();
 9837     __ ret(lr);
 9838 
 9839     return start;
 9840   }
 9841 
 9842   // Support for spin waits.
 9843   address generate_spin_wait() {
 9844     __ align(CodeEntryAlignment);
 9845     StubGenStubId stub_id = StubGenStubId::spin_wait_id;
 9846     StubCodeMark mark(this, stub_id);
 9847     address start = __ pc();
 9848 
 9849     __ spin_wait();
 9850     __ ret(lr);
 9851 
 9852     return start;
 9853   }
 9854 
 9855   void generate_lookup_secondary_supers_table_stub() {
 9856     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
 9857     StubCodeMark mark(this, stub_id);
 9858 
 9859     const Register
 9860       r_super_klass  = r0,
 9861       r_array_base   = r1,
 9862       r_array_length = r2,
 9863       r_array_index  = r3,
 9864       r_sub_klass    = r4,
 9865       r_bitmap       = rscratch2,
 9866       result         = r5;
 9867     const FloatRegister
 9868       vtemp          = v0;
 9869 
 9870     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
 9871       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
 9872       Label L_success;
 9873       __ enter();
 9874       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
 9875                                              r_array_base, r_array_length, r_array_index,
 9876                                              vtemp, result, slot,
 9877                                              /*stub_is_near*/true);
 9878       __ leave();
 9879       __ ret(lr);
 9880     }
 9881   }
 9882 
 9883   // Slow path implementation for UseSecondarySupersTable.
 9884   address generate_lookup_secondary_supers_table_slow_path_stub() {
 9885     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
 9886     StubCodeMark mark(this, stub_id);
 9887 
 9888     address start = __ pc();
 9889     const Register
 9890       r_super_klass  = r0,        // argument
 9891       r_array_base   = r1,        // argument
 9892       temp1          = r2,        // temp
 9893       r_array_index  = r3,        // argument
 9894       r_bitmap       = rscratch2, // argument
 9895       result         = r5;        // argument
 9896 
 9897     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
 9898     __ ret(lr);
 9899 
 9900     return start;
 9901   }
 9902 
 9903 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 9904 
 9905   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
 9906   //
 9907   // If LSE is in use, generate LSE versions of all the stubs. The
 9908   // non-LSE versions are in atomic_aarch64.S.
 9909 
 9910   // class AtomicStubMark records the entry point of a stub and the
 9911   // stub pointer which will point to it. The stub pointer is set to
 9912   // the entry point when ~AtomicStubMark() is called, which must be
 9913   // after ICache::invalidate_range. This ensures safe publication of
 9914   // the generated code.
 9915   class AtomicStubMark {
 9916     address _entry_point;
 9917     aarch64_atomic_stub_t *_stub;
 9918     MacroAssembler *_masm;
 9919   public:
 9920     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
 9921       _masm = masm;
 9922       __ align(32);
 9923       _entry_point = __ pc();
 9924       _stub = stub;
 9925     }
 9926     ~AtomicStubMark() {
 9927       *_stub = (aarch64_atomic_stub_t)_entry_point;
 9928     }
 9929   };
 9930 
 9931   // NB: For memory_order_conservative we need a trailing membar after
 9932   // LSE atomic operations but not a leading membar.
 9933   //
 9934   // We don't need a leading membar because a clause in the Arm ARM
 9935   // says:
 9936   //
 9937   //   Barrier-ordered-before
 9938   //
 9939   //   Barrier instructions order prior Memory effects before subsequent
 9940   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
 9944   //   instruction with both Acquire and Release semantics.
 9945   //
 9946   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
 9947   // and Release semantics, therefore we don't need a leading
 9948   // barrier. However, there is no corresponding Barrier-ordered-after
 9949   // relationship, therefore we need a trailing membar to prevent a
 9950   // later store or load from being reordered with the store in an
 9951   // atomic instruction.
 9952   //
 9953   // This was checked by using the herd7 consistency model simulator
 9954   // (http://diy.inria.fr/) with this test case:
 9955   //
 9956   // AArch64 LseCas
 9957   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
 9958   // P0 | P1;
 9959   // LDR W4, [X2] | MOV W3, #0;
 9960   // DMB LD       | MOV W4, #1;
 9961   // LDR W3, [X1] | CASAL W3, W4, [X1];
 9962   //              | DMB ISH;
 9963   //              | STR W4, [X2];
 9964   // exists
 9965   // (0:X3=0 /\ 0:X4=1)
 9966   //
 9967   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
 9968   // with the store to x in P1. Without the DMB in P1 this may happen.
 9969   //
 9970   // At the time of writing we don't know of any AArch64 hardware that
 9971   // reorders stores in this way, but the Reference Manual permits it.
 9972 
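
  // Illustrative C-like sketch of what each CAS stub does, for the given
  // operand size and with acquire/release semantics selected by `order`:
  //
  //   prev = *ptr;
  //   if (prev == compare_val)
  //     *ptr = exchange_val;
  //   return prev;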
 9973   void gen_cas_entry(Assembler::operand_size size,
 9974                      atomic_memory_order order) {
 9975     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
 9976       exchange_val = c_rarg2;
 9977     bool acquire, release;
 9978     switch (order) {
 9979       case memory_order_relaxed:
 9980         acquire = false;
 9981         release = false;
 9982         break;
 9983       case memory_order_release:
 9984         acquire = false;
 9985         release = true;
 9986         break;
 9987       default:
 9988         acquire = true;
 9989         release = true;
 9990         break;
 9991     }
 9992     __ mov(prev, compare_val);
 9993     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
 9994     if (order == memory_order_conservative) {
 9995       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 9996     }
 9997     if (size == Assembler::xword) {
 9998       __ mov(r0, prev);
 9999     } else {
10000       __ movw(r0, prev);
10001     }
10002     __ ret(lr);
10003   }
10004 
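  // Illustrative C-like sketch: an atomic fetch-and-add that returns the old value.
  //
  //   prev = *addr;
  //   *addr = prev + incr;
  //   return prev;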
10005   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10006     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10007     // If not relaxed, then default to conservative.  Relaxed is the only
10008     // case we use enough to be worth specializing.
10009     if (order == memory_order_relaxed) {
10010       __ ldadd(size, incr, prev, addr);
10011     } else {
10012       __ ldaddal(size, incr, prev, addr);
10013       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10014     }
10015     if (size == Assembler::xword) {
10016       __ mov(r0, prev);
10017     } else {
10018       __ movw(r0, prev);
10019     }
10020     __ ret(lr);
10021   }
10022 
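  // Illustrative C-like sketch: an atomic exchange that returns the old value
  // (the new value arrives in the register named `incr` here).
  //
  //   prev = *addr;
  //   *addr = incr;
  //   return prev;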
10023   void gen_swpal_entry(Assembler::operand_size size) {
10024     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10025     __ swpal(size, incr, prev, addr);
10026     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10027     if (size == Assembler::xword) {
10028       __ mov(r0, prev);
10029     } else {
10030       __ movw(r0, prev);
10031     }
10032     __ ret(lr);
10033   }
10034 
10035   void generate_atomic_entry_points() {
10036     if (! UseLSE) {
10037       return;
10038     }
10039     __ align(CodeEntryAlignment);
10040     StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id;
10041     StubCodeMark mark(this, stub_id);
10042     address first_entry = __ pc();
10043 
10044     // ADD, memory_order_conservative
10045     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10046     gen_ldadd_entry(Assembler::word, memory_order_conservative);
10047     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10048     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10049 
10050     // ADD, memory_order_relaxed
10051     AtomicStubMark mark_fetch_add_4_relaxed
10052       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10053     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10054     AtomicStubMark mark_fetch_add_8_relaxed
10055       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10056     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10057 
10058     // XCHG, memory_order_conservative
10059     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10060     gen_swpal_entry(Assembler::word);
10061     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10062     gen_swpal_entry(Assembler::xword);
10063 
10064     // CAS, memory_order_conservative
10065     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10066     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10067     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10068     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10069     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10070     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10071 
10072     // CAS, memory_order_relaxed
10073     AtomicStubMark mark_cmpxchg_1_relaxed
10074       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10075     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10076     AtomicStubMark mark_cmpxchg_4_relaxed
10077       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10078     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10079     AtomicStubMark mark_cmpxchg_8_relaxed
10080       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10081     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10082 
10083     AtomicStubMark mark_cmpxchg_4_release
10084       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10085     gen_cas_entry(MacroAssembler::word, memory_order_release);
10086     AtomicStubMark mark_cmpxchg_8_release
10087       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10088     gen_cas_entry(MacroAssembler::xword, memory_order_release);
10089 
10090     AtomicStubMark mark_cmpxchg_4_seq_cst
10091       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10092     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10093     AtomicStubMark mark_cmpxchg_8_seq_cst
10094       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10095     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10096 
10097     ICache::invalidate_range(first_entry, __ pc() - first_entry);
10098   }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
10100 
10101   address generate_cont_thaw(Continuation::thaw_kind kind) {
10102     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10103     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10104 
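    // Rough flow: ask the runtime (Continuation::prepare_thaw) how much stack
    // the frames to be thawed need, make room for them below the current sp,
    // then call the thaw entry to copy the frames back from the heap.  The
    // returned value is the sp of the yielding frame, which we then "return"
    // into (or, for the return-barrier-exception kind, dispatch from to the
    // exception handler).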
10105     address start = __ pc();
10106 
10107     if (return_barrier) {
10108       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10109       __ mov(sp, rscratch1);
10110     }
10111     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10112 
10113     if (return_barrier) {
10114       // preserve possible return value from a method returning to the return barrier
10115       __ fmovd(rscratch1, v0);
10116       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10117     }
10118 
10119     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10120     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10121     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10122 
10123     if (return_barrier) {
10124       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10125       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10126       __ fmovd(v0, rscratch1);
10127     }
10128     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10129 
10130 
10131     Label thaw_success;
10132     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10133     __ cbnz(rscratch2, thaw_success);
10134     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10135     __ br(rscratch1);
10136     __ bind(thaw_success);
10137 
10138     // make room for the thawed frames
10139     __ sub(rscratch1, sp, rscratch2);
10140     __ andr(rscratch1, rscratch1, -16); // align
10141     __ mov(sp, rscratch1);
10142 
10143     if (return_barrier) {
10144       // save original return value -- again
10145       __ fmovd(rscratch1, v0);
10146       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10147     }
10148 
10149     // If we want, we can templatize thaw by kind, and have three different entries
10150     __ movw(c_rarg1, (uint32_t)kind);
10151 
10152     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10153     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10154 
10155     if (return_barrier) {
10156       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10157       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10158       __ fmovd(v0, rscratch1);
10159     } else {
10160       __ mov(r0, zr); // return 0 (success) from doYield
10161     }
10162 
    // We're now on the yield frame (which is at an address above us because the sp has been pushed down).
10164     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10165     __ mov(rfp, sp);
10166 
10167     if (return_barrier_exception) {
10168       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10169       __ authenticate_return_address(c_rarg1);
10170       __ verify_oop(r0);
10171       // save return value containing the exception oop in callee-saved R19
10172       __ mov(r19, r0);
10173 
10174       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10175 
10176       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10177       // __ reinitialize_ptrue();
10178 
10179       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10180 
10181       __ mov(r1, r0); // the exception handler
10182       __ mov(r0, r19); // restore return value containing the exception oop
10183       __ verify_oop(r0);
10184 
10185       __ leave();
10186       __ mov(r3, lr);
10187       __ br(r1); // the exception handler
10188     } else {
10189       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10190       __ leave();
10191       __ ret(lr);
10192     }
10193 
10194     return start;
10195   }
10196 
10197   address generate_cont_thaw() {
10198     if (!Continuations::enabled()) return nullptr;
10199 
10200     StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
10201     StubCodeMark mark(this, stub_id);
10202     address start = __ pc();
10203     generate_cont_thaw(Continuation::thaw_top);
10204     return start;
10205   }
10206 
10207   address generate_cont_returnBarrier() {
10208     if (!Continuations::enabled()) return nullptr;
10209 
10210     // TODO: will probably need multiple return barriers depending on return type
10211     StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
10212     StubCodeMark mark(this, stub_id);
10213     address start = __ pc();
10214 
10215     generate_cont_thaw(Continuation::thaw_return_barrier);
10216 
10217     return start;
10218   }
10219 
10220   address generate_cont_returnBarrier_exception() {
10221     if (!Continuations::enabled()) return nullptr;
10222 
10223     StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
10224     StubCodeMark mark(this, stub_id);
10225     address start = __ pc();
10226 
10227     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10228 
10229     return start;
10230   }
10231 
10232   address generate_cont_preempt_stub() {
10233     if (!Continuations::enabled()) return nullptr;
10234     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
10235     StubCodeMark mark(this, stub_id);
10236     address start = __ pc();
10237 
10238     __ reset_last_Java_frame(true);
10239 
10240     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10241     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10242     __ mov(sp, rscratch2);
10243 
10244     Label preemption_cancelled;
10245     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10246     __ cbnz(rscratch1, preemption_cancelled);
10247 
10248     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10249     SharedRuntime::continuation_enter_cleanup(_masm);
10250     __ leave();
10251     __ ret(lr);
10252 
10253     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10254     __ bind(preemption_cancelled);
10255     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10256     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10257     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10258     __ ldr(rscratch1, Address(rscratch1));
10259     __ br(rscratch1);
10260 
10261     return start;
10262   }
10263 
10264   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10265   // are represented as long[5], with BITS_PER_LIMB = 26.
10266   // Pack five 26-bit limbs into three 64-bit registers.
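  // The resulting layout, for limbs l0..l4 of 26 bits each, is roughly:
  //   dest0 = l0 | (l1 << 26) | (l2 << 52)            (low 12 bits of l2)
  //   dest1 = (l2 >> 12) | (l3 << 14) | (l4 << 40)    (low 24 bits of l4)
  //   dest2 = l4 >> 24                                (remaining 2 bits)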
10267   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10268     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10269     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10270     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10271     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10272 
10273     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10274     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10275     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10276     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10277 
10278     if (dest2->is_valid()) {
10279       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10280     } else {
10281 #ifdef ASSERT
10282       Label OK;
10283       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10284       __ br(__ EQ, OK);
10285       __ stop("high bits of Poly1305 integer should be zero");
10286       __ should_not_reach_here();
10287       __ bind(OK);
10288 #endif
10289     }
10290   }
10291 
10292   // As above, but return only a 128-bit integer, packed into two
10293   // 64-bit registers.
10294   void pack_26(Register dest0, Register dest1, Register src) {
10295     pack_26(dest0, dest1, noreg, src);
10296   }
10297 
10298   // Multiply and multiply-accumulate unsigned 64-bit registers.
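  // In effect:
  //   wide_mul:   prod_hi:prod_lo  = n * m
  //   wide_madd:  sum_hi:sum_lo   += n * m   (using rscratch1/rscratch2 as temps)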
10299   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10300     __ mul(prod_lo, n, m);
10301     __ umulh(prod_hi, n, m);
10302   }
10303   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10304     wide_mul(rscratch1, rscratch2, n, m);
10305     __ adds(sum_lo, sum_lo, rscratch1);
10306     __ adc(sum_hi, sum_hi, rscratch2);
10307   }
10308 
10309   // Poly1305, RFC 7539
10310 
10311   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10312   // description of the tricks used to simplify and accelerate this
10313   // computation.
10314 
10315   address generate_poly1305_processBlocks() {
10316     __ align(CodeEntryAlignment);
10317     StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
10318     StubCodeMark mark(this, stub_id);
10319     address start = __ pc();
10320     Label here;
10321     __ enter();
10322     RegSet callee_saved = RegSet::range(r19, r28);
10323     __ push(callee_saved, sp);
10324 
10325     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10326 
10327     // Arguments
10328     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10329 
10330     // R_n is the 128-bit randomly-generated key, packed into two
10331     // registers.  The caller passes this key to us as long[5], with
10332     // BITS_PER_LIMB = 26.
10333     const Register R_0 = *++regs, R_1 = *++regs;
10334     pack_26(R_0, R_1, r_start);
10335 
10336     // RR_n is (R_n >> 2) * 5
10337     const Register RR_0 = *++regs, RR_1 = *++regs;
10338     __ lsr(RR_0, R_0, 2);
10339     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10340     __ lsr(RR_1, R_1, 2);
10341     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10342 
10343     // U_n is the current checksum
10344     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10345     pack_26(U_0, U_1, U_2, acc_start);
10346 
10347     static constexpr int BLOCK_LENGTH = 16;
10348     Label DONE, LOOP;
10349 
10350     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10351     __ br(Assembler::LT, DONE); {
10352       __ bind(LOOP);
10353 
10354       // S_n is to be the sum of U_n and the next block of data
10355       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10356       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10357       __ adds(S_0, U_0, S_0);
10358       __ adcs(S_1, U_1, S_1);
10359       __ adc(S_2, U_2, zr);
10360       __ add(S_2, S_2, 1);
10361 
10362       const Register U_0HI = *++regs, U_1HI = *++regs;
10363 
10364       // NB: this logic depends on some of the special properties of
10365       // Poly1305 keys. In particular, because we know that the top
10366       // four bits of R_0 and R_1 are zero, we can add together
10367       // partial products without any risk of needing to propagate a
10368       // carry out.
10369       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10370       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10371       __ andr(U_2, R_0, 3);
10372       __ mul(U_2, S_2, U_2);
10373 
10374       // Recycle registers S_0, S_1, S_2
10375       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10376 
10377       // Partial reduction mod 2**130 - 5
10378       __ adds(U_1, U_0HI, U_1);
10379       __ adc(U_2, U_1HI, U_2);
10380       // Sum now in U_2:U_1:U_0.
10381       // Dead: U_0HI, U_1HI.
10382       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10383 
10384       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
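      // (Since 2^130 == 5 (mod 2^130 - 5), the bits above bit 129, i.e.
      // U_2 >> 2, can be folded back in multiplied by 5; the multiply by 5
      // is done as x + (x << 2) by the two additions below.)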
10385 
10386       // First, U_2:U_1:U_0 += (U_2 >> 2)
10387       __ lsr(rscratch1, U_2, 2);
10388       __ andr(U_2, U_2, (u8)3);
10389       __ adds(U_0, U_0, rscratch1);
10390       __ adcs(U_1, U_1, zr);
10391       __ adc(U_2, U_2, zr);
10392       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10393       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10394       __ adcs(U_1, U_1, zr);
10395       __ adc(U_2, U_2, zr);
10396 
10397       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10398       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10399       __ br(~ Assembler::LT, LOOP);
10400     }
10401 
10402     // Further reduce modulo 2^130 - 5
10403     __ lsr(rscratch1, U_2, 2);
10404     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10405     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10406     __ adcs(U_1, U_1, zr);
10407     __ andr(U_2, U_2, (u1)3);
10408     __ adc(U_2, U_2, zr);
10409 
10410     // Unpack the sum into five 26-bit limbs and write to memory.
10411     __ ubfiz(rscratch1, U_0, 0, 26);
10412     __ ubfx(rscratch2, U_0, 26, 26);
10413     __ stp(rscratch1, rscratch2, Address(acc_start));
10414     __ ubfx(rscratch1, U_0, 52, 12);
10415     __ bfi(rscratch1, U_1, 12, 14);
10416     __ ubfx(rscratch2, U_1, 14, 26);
10417     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10418     __ ubfx(rscratch1, U_1, 40, 24);
10419     __ bfi(rscratch1, U_2, 24, 3);
10420     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10421 
10422     __ bind(DONE);
10423     __ pop(callee_saved, sp);
10424     __ leave();
10425     __ ret(lr);
10426 
10427     return start;
10428   }
10429 
10430   // exception handler for upcall stubs
10431   address generate_upcall_stub_exception_handler() {
10432     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
10433     StubCodeMark mark(this, stub_id);
10434     address start = __ pc();
10435 
    // The native caller has no idea how to handle exceptions,
    // so we just crash here. It is up to the callee to catch exceptions.
10438     __ verify_oop(r0);
10439     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10440     __ blr(rscratch1);
10441     __ should_not_reach_here();
10442 
10443     return start;
10444   }
10445 
10446   // load Method* target of MethodHandle
10447   // j_rarg0 = jobject receiver
10448   // rmethod = result
10449   address generate_upcall_stub_load_target() {
10450     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
10451     StubCodeMark mark(this, stub_id);
10452     address start = __ pc();
10453 
10454     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
    // Load target method from receiver
10456     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10457     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10458     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10459     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10460                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10461                       noreg, noreg);
10462     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10463 
10464     __ ret(lr);
10465 
10466     return start;
10467   }
10468 
10469 #undef __
10470 #define __ masm->
10471 
10472   class MontgomeryMultiplyGenerator : public MacroAssembler {
10473 
10474     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10475       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10476 
10477     RegSet _toSave;
10478     bool _squaring;
10479 
10480   public:
10481     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10482       : MacroAssembler(as->code()), _squaring(squaring) {
10483 
10484       // Register allocation
10485 
10486       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10487       Pa_base = *regs;       // Argument registers
10488       if (squaring)
10489         Pb_base = Pa_base;
10490       else
10491         Pb_base = *++regs;
10492       Pn_base = *++regs;
10493       Rlen= *++regs;
10494       inv = *++regs;
10495       Pm_base = *++regs;
10496 
10497                           // Working registers:
10498       Ra =  *++regs;        // The current digit of a, b, n, and m.
10499       Rb =  *++regs;
10500       Rm =  *++regs;
10501       Rn =  *++regs;
10502 
10503       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10504       Pb =  *++regs;
10505       Pm =  *++regs;
10506       Pn =  *++regs;
10507 
10508       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
10510       t2 =  *++regs;
10511 
10512       Ri =  *++regs;        // Inner and outer loop indexes.
10513       Rj =  *++regs;
10514 
10515       Rhi_ab = *++regs;     // Product registers: low and high parts
10516       Rlo_ab = *++regs;     // of a*b and m*n.
10517       Rhi_mn = *++regs;
10518       Rlo_mn = *++regs;
10519 
10520       // r19 and up are callee-saved.
10521       _toSave = RegSet::range(r19, *regs) + Pm_base;
10522     }
10523 
10524   private:
10525     void save_regs() {
10526       push(_toSave, sp);
10527     }
10528 
10529     void restore_regs() {
10530       pop(_toSave, sp);
10531     }
10532 
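    // Invoke `block` exactly `count` times; an odd count is handled by
    // first branching into the second copy of the 2x-unrolled loop body.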
10533     template <typename T>
10534     void unroll_2(Register count, T block) {
10535       Label loop, end, odd;
10536       tbnz(count, 0, odd);
10537       cbz(count, end);
10538       align(16);
10539       bind(loop);
10540       (this->*block)();
10541       bind(odd);
10542       (this->*block)();
10543       subs(count, count, 2);
10544       br(Assembler::GT, loop);
10545       bind(end);
10546     }
10547 
10548     template <typename T>
10549     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10550       Label loop, end, odd;
10551       tbnz(count, 0, odd);
10552       cbz(count, end);
10553       align(16);
10554       bind(loop);
10555       (this->*block)(d, s, tmp);
10556       bind(odd);
10557       (this->*block)(d, s, tmp);
10558       subs(count, count, 2);
10559       br(Assembler::GT, loop);
10560       bind(end);
10561     }
10562 
10563     void pre1(RegisterOrConstant i) {
10564       block_comment("pre1");
10565       // Pa = Pa_base;
10566       // Pb = Pb_base + i;
10567       // Pm = Pm_base;
10568       // Pn = Pn_base + i;
10569       // Ra = *Pa;
10570       // Rb = *Pb;
10571       // Rm = *Pm;
10572       // Rn = *Pn;
10573       ldr(Ra, Address(Pa_base));
10574       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10575       ldr(Rm, Address(Pm_base));
10576       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10577       lea(Pa, Address(Pa_base));
10578       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10579       lea(Pm, Address(Pm_base));
10580       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10581 
10582       // Zero the m*n result.
10583       mov(Rhi_mn, zr);
10584       mov(Rlo_mn, zr);
10585     }
10586 
10587     // The core multiply-accumulate step of a Montgomery
10588     // multiplication.  The idea is to schedule operations as a
10589     // pipeline so that instructions with long latencies (loads and
10590     // multiplies) have time to complete before their results are
    // used.  This benefits in-order implementations of the architecture
    // the most, but out-of-order ones also benefit.
10593     void step() {
10594       block_comment("step");
10595       // MACC(Ra, Rb, t0, t1, t2);
10596       // Ra = *++Pa;
10597       // Rb = *--Pb;
10598       umulh(Rhi_ab, Ra, Rb);
10599       mul(Rlo_ab, Ra, Rb);
10600       ldr(Ra, pre(Pa, wordSize));
10601       ldr(Rb, pre(Pb, -wordSize));
10602       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
10603                                        // previous iteration.
10604       // MACC(Rm, Rn, t0, t1, t2);
10605       // Rm = *++Pm;
10606       // Rn = *--Pn;
10607       umulh(Rhi_mn, Rm, Rn);
10608       mul(Rlo_mn, Rm, Rn);
10609       ldr(Rm, pre(Pm, wordSize));
10610       ldr(Rn, pre(Pn, -wordSize));
10611       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10612     }
10613 
10614     void post1() {
10615       block_comment("post1");
10616 
10617       // MACC(Ra, Rb, t0, t1, t2);
10618       // Ra = *++Pa;
10619       // Rb = *--Pb;
10620       umulh(Rhi_ab, Ra, Rb);
10621       mul(Rlo_ab, Ra, Rb);
10622       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10623       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10624 
10625       // *Pm = Rm = t0 * inv;
10626       mul(Rm, t0, inv);
10627       str(Rm, Address(Pm));
10628 
10629       // MACC(Rm, Rn, t0, t1, t2);
10630       // t0 = t1; t1 = t2; t2 = 0;
10631       umulh(Rhi_mn, Rm, Rn);
10632 
10633 #ifndef PRODUCT
10634       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
10635       {
10636         mul(Rlo_mn, Rm, Rn);
10637         add(Rlo_mn, t0, Rlo_mn);
10638         Label ok;
10639         cbz(Rlo_mn, ok); {
10640           stop("broken Montgomery multiply");
10641         } bind(ok);
10642       }
10643 #endif
10644       // We have very carefully set things up so that
10645       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
10646       // the lower half of Rm * Rn because we know the result already:
10647       // it must be -t0.  t0 + (-t0) must generate a carry iff
10648       // t0 != 0.  So, rather than do a mul and an adds we just set
10649       // the carry flag iff t0 is nonzero.
10650       //
10651       // mul(Rlo_mn, Rm, Rn);
10652       // adds(zr, t0, Rlo_mn);
10653       subs(zr, t0, 1); // Set carry iff t0 is nonzero
10654       adcs(t0, t1, Rhi_mn);
10655       adc(t1, t2, zr);
10656       mov(t2, zr);
10657     }
10658 
10659     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
10660       block_comment("pre2");
10661       // Pa = Pa_base + i-len;
10662       // Pb = Pb_base + len;
10663       // Pm = Pm_base + i-len;
10664       // Pn = Pn_base + len;
10665 
10666       if (i.is_register()) {
10667         sub(Rj, i.as_register(), len);
10668       } else {
10669         mov(Rj, i.as_constant());
10670         sub(Rj, Rj, len);
10671       }
10672       // Rj == i-len
10673 
10674       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
10675       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
10676       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
10677       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
10678 
10679       // Ra = *++Pa;
10680       // Rb = *--Pb;
10681       // Rm = *++Pm;
10682       // Rn = *--Pn;
10683       ldr(Ra, pre(Pa, wordSize));
10684       ldr(Rb, pre(Pb, -wordSize));
10685       ldr(Rm, pre(Pm, wordSize));
10686       ldr(Rn, pre(Pn, -wordSize));
10687 
10688       mov(Rhi_mn, zr);
10689       mov(Rlo_mn, zr);
10690     }
10691 
10692     void post2(RegisterOrConstant i, RegisterOrConstant len) {
10693       block_comment("post2");
10694       if (i.is_constant()) {
10695         mov(Rj, i.as_constant()-len.as_constant());
10696       } else {
10697         sub(Rj, i.as_register(), len);
10698       }
10699 
10700       adds(t0, t0, Rlo_mn); // The pending m*n, low part
10701 
10702       // As soon as we know the least significant digit of our result,
10703       // store it.
10704       // Pm_base[i-len] = t0;
10705       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
10706 
10707       // t0 = t1; t1 = t2; t2 = 0;
10708       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
10709       adc(t1, t2, zr);
10710       mov(t2, zr);
10711     }
10712 
10713     // A carry in t0 after Montgomery multiplication means that we
10714     // should subtract multiples of n from our result in m.  We'll
10715     // keep doing that until there is no carry.
10716     void normalize(RegisterOrConstant len) {
10717       block_comment("normalize");
10718       // while (t0)
10719       //   t0 = sub(Pm_base, Pn_base, t0, len);
10720       Label loop, post, again;
10721       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
10722       cbz(t0, post); {
10723         bind(again); {
10724           mov(i, zr);
10725           mov(cnt, len);
10726           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
10727           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10728           subs(zr, zr, zr); // set carry flag, i.e. no borrow
10729           align(16);
10730           bind(loop); {
10731             sbcs(Rm, Rm, Rn);
10732             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
10733             add(i, i, 1);
10734             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
10735             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10736             sub(cnt, cnt, 1);
10737           } cbnz(cnt, loop);
10738           sbc(t0, t0, zr);
10739         } cbnz(t0, again);
10740       } bind(post);
10741     }
10742 
10743     // Move memory at s to d, reversing words.
10744     //    Increments d to end of copied memory
10745     //    Destroys tmp1, tmp2
10746     //    Preserves len
10747     //    Leaves s pointing to the address which was in d at start
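    //    (Each 64-bit word is also rotated by 32 bits, so the sequence of
    //    32-bit ints is fully reversed, not just the 64-bit words.)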
10748     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
10749       assert(tmp1->encoding() < r19->encoding(), "register corruption");
10750       assert(tmp2->encoding() < r19->encoding(), "register corruption");
10751 
10752       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
10753       mov(tmp1, len);
10754       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
10755       sub(s, d, len, ext::uxtw, LogBytesPerWord);
10756     }
10757     // where
10758     void reverse1(Register d, Register s, Register tmp) {
10759       ldr(tmp, pre(s, -wordSize));
10760       ror(tmp, tmp, 32);
10761       str(tmp, post(d, wordSize));
10762     }
10763 
10764     void step_squaring() {
10765       // An extra ACC
10766       step();
10767       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10768     }
10769 
10770     void last_squaring(RegisterOrConstant i) {
10771       Label dont;
10772       // if ((i & 1) == 0) {
10773       tbnz(i.as_register(), 0, dont); {
10774         // MACC(Ra, Rb, t0, t1, t2);
10775         // Ra = *++Pa;
10776         // Rb = *--Pb;
10777         umulh(Rhi_ab, Ra, Rb);
10778         mul(Rlo_ab, Ra, Rb);
10779         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10780       } bind(dont);
10781     }
10782 
10783     void extra_step_squaring() {
10784       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10785 
10786       // MACC(Rm, Rn, t0, t1, t2);
10787       // Rm = *++Pm;
10788       // Rn = *--Pn;
10789       umulh(Rhi_mn, Rm, Rn);
10790       mul(Rlo_mn, Rm, Rn);
10791       ldr(Rm, pre(Pm, wordSize));
10792       ldr(Rn, pre(Pn, -wordSize));
10793     }
10794 
10795     void post1_squaring() {
10796       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10797 
10798       // *Pm = Rm = t0 * inv;
10799       mul(Rm, t0, inv);
10800       str(Rm, Address(Pm));
10801 
10802       // MACC(Rm, Rn, t0, t1, t2);
10803       // t0 = t1; t1 = t2; t2 = 0;
10804       umulh(Rhi_mn, Rm, Rn);
10805 
10806 #ifndef PRODUCT
10807       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
10808       {
10809         mul(Rlo_mn, Rm, Rn);
10810         add(Rlo_mn, t0, Rlo_mn);
10811         Label ok;
10812         cbz(Rlo_mn, ok); {
10813           stop("broken Montgomery multiply");
10814         } bind(ok);
10815       }
10816 #endif
10817       // We have very carefully set things up so that
10818       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
10819       // the lower half of Rm * Rn because we know the result already:
10820       // it must be -t0.  t0 + (-t0) must generate a carry iff
10821       // t0 != 0.  So, rather than do a mul and an adds we just set
10822       // the carry flag iff t0 is nonzero.
10823       //
10824       // mul(Rlo_mn, Rm, Rn);
10825       // adds(zr, t0, Rlo_mn);
10826       subs(zr, t0, 1); // Set carry iff t0 is nonzero
10827       adcs(t0, t1, Rhi_mn);
10828       adc(t1, t2, zr);
10829       mov(t2, zr);
10830     }
10831 
10832     void acc(Register Rhi, Register Rlo,
10833              Register t0, Register t1, Register t2) {
10834       adds(t0, t0, Rlo);
10835       adcs(t1, t1, Rhi);
10836       adc(t2, t2, zr);
10837     }
10838 
10839   public:
10840     /**
10841      * Fast Montgomery multiplication.  The derivation of the
10842      * algorithm is in A Cryptographic Library for the Motorola
10843      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
10844      *
10845      * Arguments:
10846      *
10847      * Inputs for multiplication:
10848      *   c_rarg0   - int array elements a
10849      *   c_rarg1   - int array elements b
10850      *   c_rarg2   - int array elements n (the modulus)
10851      *   c_rarg3   - int length
10852      *   c_rarg4   - int inv
10853      *   c_rarg5   - int array elements m (the result)
10854      *
10855      * Inputs for squaring:
10856      *   c_rarg0   - int array elements a
10857      *   c_rarg1   - int array elements n (the modulus)
10858      *   c_rarg2   - int length
10859      *   c_rarg3   - int inv
10860      *   c_rarg4   - int array elements m (the result)
10861      *
10862      */
10863     address generate_multiply() {
10864       Label argh, nothing;
10865       bind(argh);
10866       stop("MontgomeryMultiply total_allocation must be <= 8192");
10867 
10868       align(CodeEntryAlignment);
10869       address entry = pc();
10870 
10871       cbzw(Rlen, nothing);
10872 
10873       enter();
10874 
10875       // Make room.
10876       cmpw(Rlen, 512);
10877       br(Assembler::HI, argh);
10878       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
10879       andr(sp, Ra, -2 * wordSize);
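      // This allocates Rlen * 4 * sizeof(jint) bytes of stack (at most
      // 8192 bytes for Rlen <= 512, hence the check above); the andr
      // keeps sp 16-byte aligned.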
10880 
10881       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
10882 
10883       {
10884         // Copy input args, reversing as we go.  We use Ra as a
10885         // temporary variable.
10886         reverse(Ra, Pa_base, Rlen, t0, t1);
10887         if (!_squaring)
10888           reverse(Ra, Pb_base, Rlen, t0, t1);
10889         reverse(Ra, Pn_base, Rlen, t0, t1);
10890       }
10891 
10892       // Push all call-saved registers and also Pm_base which we'll need
10893       // at the end.
10894       save_regs();
10895 
10896 #ifndef PRODUCT
10897       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
10898       {
10899         ldr(Rn, Address(Pn_base, 0));
10900         mul(Rlo_mn, Rn, inv);
10901         subs(zr, Rlo_mn, -1);
10902         Label ok;
10903         br(EQ, ok); {
10904           stop("broken inverse in Montgomery multiply");
10905         } bind(ok);
10906       }
10907 #endif
10908 
10909       mov(Pm_base, Ra);
10910 
10911       mov(t0, zr);
10912       mov(t1, zr);
10913       mov(t2, zr);
10914 
10915       block_comment("for (int i = 0; i < len; i++) {");
10916       mov(Ri, zr); {
10917         Label loop, end;
10918         cmpw(Ri, Rlen);
10919         br(Assembler::GE, end);
10920 
10921         bind(loop);
10922         pre1(Ri);
10923 
10924         block_comment("  for (j = i; j; j--) {"); {
10925           movw(Rj, Ri);
10926           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
10927         } block_comment("  } // j");
10928 
10929         post1();
10930         addw(Ri, Ri, 1);
10931         cmpw(Ri, Rlen);
10932         br(Assembler::LT, loop);
10933         bind(end);
10934         block_comment("} // i");
10935       }
10936 
10937       block_comment("for (int i = len; i < 2*len; i++) {");
10938       mov(Ri, Rlen); {
10939         Label loop, end;
10940         cmpw(Ri, Rlen, Assembler::LSL, 1);
10941         br(Assembler::GE, end);
10942 
10943         bind(loop);
10944         pre2(Ri, Rlen);
10945 
10946         block_comment("  for (j = len*2-i-1; j; j--) {"); {
10947           lslw(Rj, Rlen, 1);
10948           subw(Rj, Rj, Ri);
10949           subw(Rj, Rj, 1);
10950           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
10951         } block_comment("  } // j");
10952 
10953         post2(Ri, Rlen);
10954         addw(Ri, Ri, 1);
10955         cmpw(Ri, Rlen, Assembler::LSL, 1);
10956         br(Assembler::LT, loop);
10957         bind(end);
10958       }
10959       block_comment("} // i");
10960 
10961       normalize(Rlen);
10962 
10963       mov(Ra, Pm_base);  // Save Pm_base in Ra
10964       restore_regs();  // Restore caller's Pm_base
10965 
10966       // Copy our result into caller's Pm_base
10967       reverse(Pm_base, Ra, Rlen, t0, t1);
10968 
10969       leave();
10970       bind(nothing);
10971       ret(lr);
10972 
10973       return entry;
10974     }
10975     // In C, approximately:
10976 
10977     // void
10978     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
10979     //                     julong Pn_base[], julong Pm_base[],
10980     //                     julong inv, int len) {
10981     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
10982     //   julong *Pa, *Pb, *Pn, *Pm;
10983     //   julong Ra, Rb, Rn, Rm;
10984 
10985     //   int i;
10986 
10987     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
10988 
10989     //   for (i = 0; i < len; i++) {
10990     //     int j;
10991 
10992     //     Pa = Pa_base;
10993     //     Pb = Pb_base + i;
10994     //     Pm = Pm_base;
10995     //     Pn = Pn_base + i;
10996 
10997     //     Ra = *Pa;
10998     //     Rb = *Pb;
10999     //     Rm = *Pm;
11000     //     Rn = *Pn;
11001 
11002     //     int iters = i;
11003     //     for (j = 0; iters--; j++) {
11004     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11005     //       MACC(Ra, Rb, t0, t1, t2);
11006     //       Ra = *++Pa;
11007     //       Rb = *--Pb;
11008     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11009     //       MACC(Rm, Rn, t0, t1, t2);
11010     //       Rm = *++Pm;
11011     //       Rn = *--Pn;
11012     //     }
11013 
11014     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11015     //     MACC(Ra, Rb, t0, t1, t2);
11016     //     *Pm = Rm = t0 * inv;
11017     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11018     //     MACC(Rm, Rn, t0, t1, t2);
11019 
11020     //     assert(t0 == 0, "broken Montgomery multiply");
11021 
11022     //     t0 = t1; t1 = t2; t2 = 0;
11023     //   }
11024 
11025     //   for (i = len; i < 2*len; i++) {
11026     //     int j;
11027 
11028     //     Pa = Pa_base + i-len;
11029     //     Pb = Pb_base + len;
11030     //     Pm = Pm_base + i-len;
11031     //     Pn = Pn_base + len;
11032 
11033     //     Ra = *++Pa;
11034     //     Rb = *--Pb;
11035     //     Rm = *++Pm;
11036     //     Rn = *--Pn;
11037 
11038     //     int iters = len*2-i-1;
11039     //     for (j = i-len+1; iters--; j++) {
11040     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11041     //       MACC(Ra, Rb, t0, t1, t2);
11042     //       Ra = *++Pa;
11043     //       Rb = *--Pb;
11044     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11045     //       MACC(Rm, Rn, t0, t1, t2);
11046     //       Rm = *++Pm;
11047     //       Rn = *--Pn;
11048     //     }
11049 
11050     //     Pm_base[i-len] = t0;
11051     //     t0 = t1; t1 = t2; t2 = 0;
11052     //   }
11053 
11054     //   while (t0)
11055     //     t0 = sub(Pm_base, Pn_base, t0, len);
11056     // }
11057 
11058     /**
11059      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
11060      * multiplies than Montgomery multiplication so it should be up to
11061      * 25% faster.  However, its loop control is more complex and it
11062      * may actually run slower on some machines.
11063      *
11064      * Arguments:
11065      *
11066      * Inputs:
11067      *   c_rarg0   - int array elements a
11068      *   c_rarg1   - int array elements n (the modulus)
11069      *   c_rarg2   - int length
11070      *   c_rarg3   - int inv
11071      *   c_rarg4   - int array elements m (the result)
11072      *
11073      */
11074     address generate_square() {
11075       Label argh;
11076       bind(argh);
11077       stop("MontgomeryMultiply total_allocation must be <= 8192");
11078 
11079       align(CodeEntryAlignment);
11080       address entry = pc();
11081 
11082       enter();
11083 
11084       // Make room.
11085       cmpw(Rlen, 512);
11086       br(Assembler::HI, argh);
11087       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11088       andr(sp, Ra, -2 * wordSize);
11089 
11090       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11091 
11092       {
11093         // Copy input args, reversing as we go.  We use Ra as a
11094         // temporary variable.
11095         reverse(Ra, Pa_base, Rlen, t0, t1);
11096         reverse(Ra, Pn_base, Rlen, t0, t1);
11097       }
11098 
11099       // Push all call-saved registers and also Pm_base which we'll need
11100       // at the end.
11101       save_regs();
11102 
11103       mov(Pm_base, Ra);
11104 
11105       mov(t0, zr);
11106       mov(t1, zr);
11107       mov(t2, zr);
11108 
11109       block_comment("for (int i = 0; i < len; i++) {");
11110       mov(Ri, zr); {
11111         Label loop, end;
11112         bind(loop);
11113         cmp(Ri, Rlen);
11114         br(Assembler::GE, end);
11115 
11116         pre1(Ri);
11117 
11118         block_comment("for (j = (i+1)/2; j; j--) {"); {
11119           add(Rj, Ri, 1);
11120           lsr(Rj, Rj, 1);
11121           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11122         } block_comment("  } // j");
11123 
11124         last_squaring(Ri);
11125 
11126         block_comment("  for (j = i/2; j; j--) {"); {
11127           lsr(Rj, Ri, 1);
11128           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11129         } block_comment("  } // j");
11130 
11131         post1_squaring();
11132         add(Ri, Ri, 1);
11133         cmp(Ri, Rlen);
11134         br(Assembler::LT, loop);
11135 
11136         bind(end);
11137         block_comment("} // i");
11138       }
11139 
11140       block_comment("for (int i = len; i < 2*len; i++) {");
11141       mov(Ri, Rlen); {
11142         Label loop, end;
11143         bind(loop);
11144         cmp(Ri, Rlen, Assembler::LSL, 1);
11145         br(Assembler::GE, end);
11146 
11147         pre2(Ri, Rlen);
11148 
11149         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11150           lsl(Rj, Rlen, 1);
11151           sub(Rj, Rj, Ri);
11152           sub(Rj, Rj, 1);
11153           lsr(Rj, Rj, 1);
11154           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11155         } block_comment("  } // j");
11156 
11157         last_squaring(Ri);
11158 
11159         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11160           lsl(Rj, Rlen, 1);
11161           sub(Rj, Rj, Ri);
11162           lsr(Rj, Rj, 1);
11163           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11164         } block_comment("  } // j");
11165 
11166         post2(Ri, Rlen);
11167         add(Ri, Ri, 1);
11168         cmp(Ri, Rlen, Assembler::LSL, 1);
11169 
11170         br(Assembler::LT, loop);
11171         bind(end);
11172         block_comment("} // i");
11173       }
11174 
11175       normalize(Rlen);
11176 
11177       mov(Ra, Pm_base);  // Save Pm_base in Ra
11178       restore_regs();  // Restore caller's Pm_base
11179 
11180       // Copy our result into caller's Pm_base
11181       reverse(Pm_base, Ra, Rlen, t0, t1);
11182 
11183       leave();
11184       ret(lr);
11185 
11186       return entry;
11187     }
11188     // In C, approximately:
11189 
11190     // void
11191     // montgomery_square(julong Pa_base[], julong Pn_base[],
11192     //                   julong Pm_base[], julong inv, int len) {
11193     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11194     //   julong *Pa, *Pb, *Pn, *Pm;
11195     //   julong Ra, Rb, Rn, Rm;
11196 
11197     //   int i;
11198 
11199     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11200 
11201     //   for (i = 0; i < len; i++) {
11202     //     int j;
11203 
11204     //     Pa = Pa_base;
11205     //     Pb = Pa_base + i;
11206     //     Pm = Pm_base;
11207     //     Pn = Pn_base + i;
11208 
11209     //     Ra = *Pa;
11210     //     Rb = *Pb;
11211     //     Rm = *Pm;
11212     //     Rn = *Pn;
11213 
11214     //     int iters = (i+1)/2;
11215     //     for (j = 0; iters--; j++) {
11216     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11217     //       MACC2(Ra, Rb, t0, t1, t2);
11218     //       Ra = *++Pa;
11219     //       Rb = *--Pb;
11220     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11221     //       MACC(Rm, Rn, t0, t1, t2);
11222     //       Rm = *++Pm;
11223     //       Rn = *--Pn;
11224     //     }
11225     //     if ((i & 1) == 0) {
11226     //       assert(Ra == Pa_base[j], "must be");
11227     //       MACC(Ra, Ra, t0, t1, t2);
11228     //     }
11229     //     iters = i/2;
11230     //     assert(iters == i-j, "must be");
11231     //     for (; iters--; j++) {
11232     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11233     //       MACC(Rm, Rn, t0, t1, t2);
11234     //       Rm = *++Pm;
11235     //       Rn = *--Pn;
11236     //     }
11237 
11238     //     *Pm = Rm = t0 * inv;
11239     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11240     //     MACC(Rm, Rn, t0, t1, t2);
11241 
11242     //     assert(t0 == 0, "broken Montgomery multiply");
11243 
11244     //     t0 = t1; t1 = t2; t2 = 0;
11245     //   }
11246 
11247     //   for (i = len; i < 2*len; i++) {
11248     //     int start = i-len+1;
11249     //     int end = start + (len - start)/2;
11250     //     int j;
11251 
11252     //     Pa = Pa_base + i-len;
11253     //     Pb = Pa_base + len;
11254     //     Pm = Pm_base + i-len;
11255     //     Pn = Pn_base + len;
11256 
11257     //     Ra = *++Pa;
11258     //     Rb = *--Pb;
11259     //     Rm = *++Pm;
11260     //     Rn = *--Pn;
11261 
11262     //     int iters = (2*len-i-1)/2;
11263     //     assert(iters == end-start, "must be");
11264     //     for (j = start; iters--; j++) {
11265     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11266     //       MACC2(Ra, Rb, t0, t1, t2);
11267     //       Ra = *++Pa;
11268     //       Rb = *--Pb;
11269     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11270     //       MACC(Rm, Rn, t0, t1, t2);
11271     //       Rm = *++Pm;
11272     //       Rn = *--Pn;
11273     //     }
11274     //     if ((i & 1) == 0) {
11275     //       assert(Ra == Pa_base[j], "must be");
11276     //       MACC(Ra, Ra, t0, t1, t2);
11277     //     }
11278     //     iters =  (2*len-i)/2;
11279     //     assert(iters == len-j, "must be");
11280     //     for (; iters--; j++) {
11281     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11282     //       MACC(Rm, Rn, t0, t1, t2);
11283     //       Rm = *++Pm;
11284     //       Rn = *--Pn;
11285     //     }
11286     //     Pm_base[i-len] = t0;
11287     //     t0 = t1; t1 = t2; t2 = 0;
11288     //   }
11289 
11290     //   while (t0)
11291     //     t0 = sub(Pm_base, Pn_base, t0, len);
11292     // }
11293   };
11294 
11295   // Initialization
11296   void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points.
11298 
    // Entry points that exist on all platforms.  Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
11304 
11305     StubRoutines::_forward_exception_entry = generate_forward_exception();
11306 
11307     StubRoutines::_call_stub_entry =
11308       generate_call_stub(StubRoutines::_call_stub_return_address);
11309 
    // This entry is referenced by megamorphic calls.
11311     StubRoutines::_catch_exception_entry = generate_catch_exception();
11312 
11313     // Initialize table for copy memory (arraycopy) check.
11314     if (UnsafeMemoryAccess::_table == nullptr) {
11315       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11316     }
11317 
11318     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
11320       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
11321       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11322     }
11323 
11324     if (UseCRC32CIntrinsics) {
11325       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11326     }
11327 
11328     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11329       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11330     }
11331 
11332     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11333       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11334     }
11335 
11336     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11337         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11338       StubRoutines::_hf2f = generate_float16ToFloat();
11339       StubRoutines::_f2hf = generate_floatToFloat16();
11340     }
11341   }
11342 
11343   void generate_continuation_stubs() {
11344     // Continuation stubs:
11345     StubRoutines::_cont_thaw          = generate_cont_thaw();
11346     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11347     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11348     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11349   }
11350 
11351   void generate_final_stubs() {
11352     // support for verify_oop (must happen after universe_init)
11353     if (VerifyOops) {
11354       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11355     }
11356 
11357     // arraycopy stubs used by compilers
11358     generate_arraycopy_stubs();
11359 
11360     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11361 
11362     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11363 
11364     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11365     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11366 
11367 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11368 
11369     generate_atomic_entry_points();
11370 
11371 #endif // LINUX
11372 
11373 #ifdef COMPILER2
11374     if (UseSecondarySupersTable) {
11375       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11376       if (! InlineSecondarySupersTest) {
11377         generate_lookup_secondary_supers_table_stub();
11378       }
11379     }
11380 #endif
11381 
11382     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11383 
    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11385   }
11386 
11387   void generate_compiler_stubs() {
11388 #if COMPILER2_OR_JVMCI
11389 
11390     if (UseSVE == 0) {
11391       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
11392     }
11393 
11394     // array equals stub for large arrays.
11395     if (!UseSimpleArrayEquals) {
11396       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11397     }
11398 
    // arrays_hashcode stubs for large arrays.
11400     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11401     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11402     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11403     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11404     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11405 
11406     // byte_array_inflate stub for large arrays.
11407     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11408 
11409     // countPositives stub for large arrays.
11410     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11411 
11412     generate_compare_long_strings();
11413 
11414     generate_string_indexof_stubs();
11415 
11416 #ifdef COMPILER2
11417     if (UseMultiplyToLenIntrinsic) {
11418       StubRoutines::_multiplyToLen = generate_multiplyToLen();
11419     }
11420 
11421     if (UseSquareToLenIntrinsic) {
11422       StubRoutines::_squareToLen = generate_squareToLen();
11423     }
11424 
11425     if (UseMulAddIntrinsic) {
11426       StubRoutines::_mulAdd = generate_mulAdd();
11427     }
11428 
11429     if (UseSIMDForBigIntegerShiftIntrinsics) {
11430       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11431       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
11432     }
11433 
11434     if (UseMontgomeryMultiplyIntrinsic) {
11435       StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
11436       StubCodeMark mark(this, stub_id);
11437       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11438       StubRoutines::_montgomeryMultiply = g.generate_multiply();
11439     }
11440 
11441     if (UseMontgomerySquareIntrinsic) {
11442       StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
11443       StubCodeMark mark(this, stub_id);
11444       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11445       // We use generate_multiply() rather than generate_square()
11446       // because it's faster for the sizes of modulus we care about.
11447       StubRoutines::_montgomerySquare = g.generate_multiply();
11448     }
11449 
11450 #endif // COMPILER2
11451 
11452     if (UseChaCha20Intrinsics) {
11453       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11454     }
11455 
11456     if (UseKyberIntrinsics) {
11457       StubRoutines::_kyberNtt = generate_kyberNtt();
11458       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11459       StubRoutines::_kyberNttMult = generate_kyberNttMult();
11460       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11461       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11462       StubRoutines::_kyber12To16 = generate_kyber12To16();
11463       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11464     }
11465 
11466     if (UseDilithiumIntrinsics) {
11467       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11468       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11469       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11470       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11471       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11472     }
11473 
11474     if (UseBASE64Intrinsics) {
11475         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11476         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11477     }
11478 
11479     // data cache line writeback
11480     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11481     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11482 
11483     if (UseAESIntrinsics) {
11484       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11485       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11486       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11487       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11488       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11489     }
11490     if (UseGHASHIntrinsics) {
11491       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11492       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11493     }
11494     if (UseAESIntrinsics && UseGHASHIntrinsics) {
11495       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11496     }
11497 
11498     if (UseMD5Intrinsics) {
11499       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
11500       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
11501     }
11502     if (UseSHA1Intrinsics) {
11503       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
11504       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
11505     }
11506     if (UseSHA256Intrinsics) {
11507       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
11508       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
11509     }
11510     if (UseSHA512Intrinsics) {
11511       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
11512       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
11513     }
11514     if (UseSHA3Intrinsics) {
11515       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
11516       StubRoutines::_double_keccak         = generate_double_keccak();
11517       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
11518     }
11519 
11520     if (UsePoly1305Intrinsics) {
11521       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11522     }
11523 
11524     // generate Adler32 intrinsics code
11525     if (UseAdler32Intrinsics) {
11526       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11527     }
11528 
11529 #endif // COMPILER2_OR_JVMCI
11530   }
11531 
11532  public:
11533   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
11534     switch(blob_id) {
11535     case initial_id:
11536       generate_initial_stubs();
11537       break;
    case continuation_id:
11539       generate_continuation_stubs();
11540       break;
11541     case compiler_id:
11542       generate_compiler_stubs();
11543       break;
11544     case final_id:
11545       generate_final_stubs();
11546       break;
11547     default:
11548       fatal("unexpected blob id: %d", blob_id);
11549       break;
11550     };
11551   }
11552 }; // end class declaration
11553 
11554 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
11555   StubGenerator g(code, blob_id);
11556 }
11557 
11558 
11559 #if defined (LINUX)
11560 
11561 // Define pointers to atomic stubs and initialize them to point to the
11562 // code in atomic_aarch64.S.
11563 
11564 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
11565   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
11566     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
11567   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
11568     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
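
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) expands,
// modulo whitespace, to approximately:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_relaxed_impl
//     = aarch64_atomic_fetch_add_4_relaxed_default_impl;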
11569 
11570 DEFAULT_ATOMIC_OP(fetch_add, 4, )
11571 DEFAULT_ATOMIC_OP(fetch_add, 8, )
11572 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
11573 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
11574 DEFAULT_ATOMIC_OP(xchg, 4, )
11575 DEFAULT_ATOMIC_OP(xchg, 8, )
11576 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
11577 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
11578 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
11579 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
11580 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
11581 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
11582 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
11583 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
11584 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
11585 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
11586 
11587 #undef DEFAULT_ATOMIC_OP
11588 
11589 #endif // LINUX