1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomicAccess.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
   87 #ifdef PRODUCT
   88 #define inc_counter_np(counter) ((void)0)
   89 #else
   90   void inc_counter_np_(uint& counter) {
   91     __ incrementw(ExternalAddress((address)&counter));
   92   }
   93 #define inc_counter_np(counter) \
   94   BLOCK_COMMENT("inc_counter " #counter); \
   95   inc_counter_np_(counter);
   96 #endif
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
  112   //
  113   // we save r30 (lr) as the return PC at the base of the frame and
  114   // link r29 (fp) below it as the frame pointer, installing sp (r31)
  115   // into fp.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  124   // we don't need to save r8 which C uses as an indirect result location
  125   // return register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
  130   // we don't need to save r16-18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
  144   // -28 [ saved Floating-point Control Register ] <--- sp_after_call
  145   // -26 [ saved v15            ]
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
  173 
  174   // Call stub stack layout word offsets from fp
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
  199 
  200   address generate_call_stub(address& return_address) {
  201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  203            "adjust this code");
  204 
  205     StubId stub_id = StubId::stubgen_call_stub_id;
  206     StubCodeMark mark(this, stub_id);
  207     address start = __ pc();
  208 
  209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  210 
  211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  213     const Address result        (rfp, result_off         * wordSize);
  214     const Address result_type   (rfp, result_type_off    * wordSize);
  215     const Address method        (rfp, method_off         * wordSize);
  216     const Address entry_point   (rfp, entry_point_off    * wordSize);
  217     const Address parameter_size(rfp, parameter_size_off * wordSize);
  218 
  219     const Address thread        (rfp, thread_off         * wordSize);
  220 
  221     const Address d15_save      (rfp, d15_off * wordSize);
  222     const Address d13_save      (rfp, d13_off * wordSize);
  223     const Address d11_save      (rfp, d11_off * wordSize);
  224     const Address d9_save       (rfp, d9_off * wordSize);
  225 
  226     const Address r28_save      (rfp, r28_off * wordSize);
  227     const Address r26_save      (rfp, r26_off * wordSize);
  228     const Address r24_save      (rfp, r24_off * wordSize);
  229     const Address r22_save      (rfp, r22_off * wordSize);
  230     const Address r20_save      (rfp, r20_off * wordSize);
  231 
  232     // stub code
  233 
  234     address aarch64_entry = __ pc();
  235 
  236     // set up frame and move sp to end of save area
  237     __ enter();
  238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  239 
  240     // save register parameters and Java scratch/global registers
  241     // n.b. we save thread even though it gets installed in
  242     // rthread because we want to sanity check rthread later
  243     __ str(c_rarg7,  thread);
  244     __ strw(c_rarg6, parameter_size);
  245     __ stp(c_rarg4, c_rarg5,  entry_point);
  246     __ stp(c_rarg2, c_rarg3,  result_type);
  247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  248 
  249     __ stp(r20, r19,   r20_save);
  250     __ stp(r22, r21,   r22_save);
  251     __ stp(r24, r23,   r24_save);
  252     __ stp(r26, r25,   r26_save);
  253     __ stp(r28, r27,   r28_save);
  254 
  255     __ stpd(v9,  v8,   d9_save);
  256     __ stpd(v11, v10,  d11_save);
  257     __ stpd(v13, v12,  d13_save);
  258     __ stpd(v15, v14,  d15_save);
  259 
  260     __ get_fpcr(rscratch1);
  261     __ str(rscratch1, fpcr_save);
  262     // Set FPCR to the state we need. We do want Round to Nearest. We
  263     // don't want non-IEEE rounding modes or floating-point traps.
  264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  266     __ set_fpcr(rscratch1);
  267 
  268     // install Java thread in global register now we have saved
  269     // whatever value it held
  270     __ mov(rthread, c_rarg7);
  271     // And method
  272     __ mov(rmethod, c_rarg3);
  273 
  274     // set up the heapbase register
  275     __ reinit_heapbase();
  276 
  277 #ifdef ASSERT
  278     // make sure we have no pending exceptions
  279     {
  280       Label L;
  281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  282       __ cmp(rscratch1, (u1)NULL_WORD);
  283       __ br(Assembler::EQ, L);
  284       __ stop("StubRoutines::call_stub: entered with pending exception");
  285       __ BIND(L);
  286     }
  287 #endif
  288     // pass parameters if any
  289     __ mov(esp, sp);
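          // reserve space below sp for the incoming parameters, keeping sp 16-byte aligned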
  290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  291     __ andr(sp, rscratch1, -2 * wordSize);
  292 
  293     BLOCK_COMMENT("pass parameters if any");
  294     Label parameters_done;
  295     // parameter count is still in c_rarg6
  296     // and parameter pointer identifying param 1 is in c_rarg5
  297     __ cbzw(c_rarg6, parameters_done);
  298 
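          // copy the parameters, one word at a time, from the parameter array onto the stack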
  299     address loop = __ pc();
  300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  301     __ subsw(c_rarg6, c_rarg6, 1);
  302     __ push(rscratch1);
  303     __ br(Assembler::GT, loop);
  304 
  305     __ BIND(parameters_done);
  306 
  307     // call Java entry -- passing Method* and current sp
  308     //      rmethod: Method*
  309     //      r19_sender_sp: sender sp
  310     BLOCK_COMMENT("call Java function");
  311     __ mov(r19_sender_sp, sp);
  312     __ blr(c_rarg4);
  313 
  314     // we do this here because the notify will already have been done
  315     // if we get to the next instruction via an exception
  316     //
  317     // n.b. adding this instruction here affects the calculation of
  318     // whether or not a routine returns to the call stub (used when
  319     // doing stack walks) since the normal test is to check the return
  320     // pc against the address saved below. so we may need to allow for
  321     // this extra instruction in the check.
  322 
  323     // save current address for use by exception handling code
  324 
  325     return_address = __ pc();
  326 
  327     // store result depending on type (everything that is not
  328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  329     // n.b. this assumes Java returns an integral result in r0
  330     // and a floating result in j_farg0
  331     __ ldr(j_rarg2, result);
  332     Label is_long, is_float, is_double, exit;
  333     __ ldr(j_rarg1, result_type);
  334     __ cmp(j_rarg1, (u1)T_OBJECT);
  335     __ br(Assembler::EQ, is_long);
  336     __ cmp(j_rarg1, (u1)T_LONG);
  337     __ br(Assembler::EQ, is_long);
  338     __ cmp(j_rarg1, (u1)T_FLOAT);
  339     __ br(Assembler::EQ, is_float);
  340     __ cmp(j_rarg1, (u1)T_DOUBLE);
  341     __ br(Assembler::EQ, is_double);
  342 
  343     // handle T_INT case
  344     __ strw(r0, Address(j_rarg2));
  345 
  346     __ BIND(exit);
  347 
  348     // pop parameters
  349     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  350 
  351 #ifdef ASSERT
  352     // verify that threads correspond
  353     {
  354       Label L, S;
  355       __ ldr(rscratch1, thread);
  356       __ cmp(rthread, rscratch1);
  357       __ br(Assembler::NE, S);
  358       __ get_thread(rscratch1);
  359       __ cmp(rthread, rscratch1);
  360       __ br(Assembler::EQ, L);
  361       __ BIND(S);
  362       __ stop("StubRoutines::call_stub: threads must correspond");
  363       __ BIND(L);
  364     }
  365 #endif
  366 
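          // we are leaving the last Java frame; update the thread's continuation
          // fast-path state (see MacroAssembler::pop_cont_fastpath)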
  367     __ pop_cont_fastpath(rthread);
  368 
  369     // restore callee-save registers
  370     __ ldpd(v15, v14,  d15_save);
  371     __ ldpd(v13, v12,  d13_save);
  372     __ ldpd(v11, v10,  d11_save);
  373     __ ldpd(v9,  v8,   d9_save);
  374 
  375     __ ldp(r28, r27,   r28_save);
  376     __ ldp(r26, r25,   r26_save);
  377     __ ldp(r24, r23,   r24_save);
  378     __ ldp(r22, r21,   r22_save);
  379     __ ldp(r20, r19,   r20_save);
  380 
  381     // restore fpcr
  382     __ ldr(rscratch1,  fpcr_save);
  383     __ set_fpcr(rscratch1);
  384 
  385     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  386     __ ldrw(c_rarg2, result_type);
  387     __ ldr(c_rarg3,  method);
  388     __ ldp(c_rarg4, c_rarg5,  entry_point);
  389     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  390 
  391     // leave frame and return to caller
  392     __ leave();
  393     __ ret(lr);
  394 
  395     // handle return types different from T_INT
  396 
  397     __ BIND(is_long);
  398     __ str(r0, Address(j_rarg2, 0));
  399     __ br(Assembler::AL, exit);
  400 
  401     __ BIND(is_float);
  402     __ strs(j_farg0, Address(j_rarg2, 0));
  403     __ br(Assembler::AL, exit);
  404 
  405     __ BIND(is_double);
  406     __ strd(j_farg0, Address(j_rarg2, 0));
  407     __ br(Assembler::AL, exit);
  408 
  409     return start;
  410   }
  411 
  412   // Return point for a Java call if there's an exception thrown in
  413   // Java code.  The exception is caught and transformed into a
  414   // pending exception stored in JavaThread that can be tested from
  415   // within the VM.
  416   //
  417   // Note: Usually the parameters are removed by the callee. In case
  418   // of an exception crossing an activation frame boundary, that is
  419   // not the case if the callee is compiled code => we need to set up
  420   // the sp ourselves.
  421   //
  422   // r0: exception oop
  423 
  424   address generate_catch_exception() {
  425     StubId stub_id = StubId::stubgen_catch_exception_id;
  426     StubCodeMark mark(this, stub_id);
  427     address start = __ pc();
  428 
  429     // same as in generate_call_stub():
  430     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  431     const Address thread        (rfp, thread_off         * wordSize);
  432 
  433 #ifdef ASSERT
  434     // verify that threads correspond
  435     {
  436       Label L, S;
  437       __ ldr(rscratch1, thread);
  438       __ cmp(rthread, rscratch1);
  439       __ br(Assembler::NE, S);
  440       __ get_thread(rscratch1);
  441       __ cmp(rthread, rscratch1);
  442       __ br(Assembler::EQ, L);
  443       __ bind(S);
  444       __ stop("StubRoutines::catch_exception: threads must correspond");
  445       __ bind(L);
  446     }
  447 #endif
  448 
  449     // set pending exception
  450     __ verify_oop(r0);
  451 
  452     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  453     __ mov(rscratch1, (address)__FILE__);
  454     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  455     __ movw(rscratch1, (int)__LINE__);
  456     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  457 
  458     // complete return to VM
  459     assert(StubRoutines::_call_stub_return_address != nullptr,
  460            "_call_stub_return_address must have been generated before");
  461     __ b(StubRoutines::_call_stub_return_address);
  462 
  463     return start;
  464   }
  465 
  466   // Continuation point for runtime calls returning with a pending
  467   // exception.  The pending exception check happened in the runtime
  468   // or native call stub.  The pending exception in Thread is
  469   // converted into a Java-level exception.
  470   //
  471   // Contract with Java-level exception handlers:
  472   // r0: exception
  473   // r3: throwing pc
  474   //
  475   // NOTE: At entry of this stub, exception-pc must be in LR !!
  476 
  477   // NOTE: this is always used as a jump target within generated code
  478   // so it just needs to be generated code with no prolog
  479 
  480   address generate_forward_exception() {
  481     StubId stub_id = StubId::stubgen_forward_exception_id;
  482     StubCodeMark mark(this, stub_id);
  483     address start = __ pc();
  484 
  485     // Upon entry, LR points to the return address returning into
  486     // Java (interpreted or compiled) code; i.e., the return address
  487     // becomes the throwing pc.
  488     //
  489     // Arguments pushed before the runtime call are still on the stack
  490     // but the exception handler will reset the stack pointer ->
  491     // ignore them.  A potential result in registers can be ignored as
  492     // well.
  493 
  494 #ifdef ASSERT
  495     // make sure this code is only executed if there is a pending exception
  496     {
  497       Label L;
  498       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  499       __ cbnz(rscratch1, L);
  500       __ stop("StubRoutines::forward exception: no pending exception (1)");
  501       __ bind(L);
  502     }
  503 #endif
  504 
  505     // compute exception handler into r19
  506 
  507     // call the VM to find the handler address associated with the
  508     // caller address. pass thread in r0 and caller pc (ret address)
  509     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  510     // the stack.
  511     __ mov(c_rarg1, lr);
  512     // lr will be trashed by the VM call so we move it to R19
  513     // (callee-saved) because we also need to pass it to the handler
  514     // returned by this call.
  515     __ mov(r19, lr);
  516     BLOCK_COMMENT("call exception_handler_for_return_address");
  517     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  518                          SharedRuntime::exception_handler_for_return_address),
  519                     rthread, c_rarg1);
  520     // Reinitialize the ptrue predicate register, in case the external runtime
  521     // call clobbers ptrue reg, as we may return to SVE compiled code.
  522     __ reinitialize_ptrue();
  523 
  524     // we should not really care that lr is no longer the callee
  525     // address. we saved the value the handler needs in r19 so we can
  526     // just copy it to r3. however, the C2 handler will push its own
  527     // frame and then call into the VM, and the VM code asserts that
  528     // the PC for the frame above the handler belongs to a compiled
  529     // Java method. So, we restore lr here to satisfy that assert.
  530     __ mov(lr, r19);
  531     // setup r0 & r3 & clear pending exception
  532     __ mov(r3, r19);
  533     __ mov(r19, r0);
  534     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  535     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  536 
  537 #ifdef ASSERT
  538     // make sure exception is set
  539     {
  540       Label L;
  541       __ cbnz(r0, L);
  542       __ stop("StubRoutines::forward exception: no pending exception (2)");
  543       __ bind(L);
  544     }
  545 #endif
  546 
  547     // continue at exception handler
  548     // r0: exception
  549     // r3: throwing pc
  550     // r19: exception handler
  551     __ verify_oop(r0);
  552     __ br(r19);
  553 
  554     return start;
  555   }
  556 
  557   // Non-destructive plausibility checks for oops
  558   //
  559   // Arguments:
  560   //    r0: oop to verify
  561   //    rscratch1: error message
  562   //
  563   // Stack after saving c_rarg3:
  564   //    [tos + 0]: saved c_rarg3
  565   //    [tos + 1]: saved c_rarg2
  566   //    [tos + 2]: saved lr
  567   //    [tos + 3]: saved rscratch2
  568   //    [tos + 4]: saved r0
  569   //    [tos + 5]: saved rscratch1
  570   address generate_verify_oop() {
  571     StubId stub_id = StubId::stubgen_verify_oop_id;
  572     StubCodeMark mark(this, stub_id);
  573     address start = __ pc();
  574 
  575     Label exit, error;
  576 
  577     // save c_rarg2 and c_rarg3
  578     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  579 
  580     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  581     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  582     __ ldr(c_rarg3, Address(c_rarg2));
  583     __ add(c_rarg3, c_rarg3, 1);
  584     __ str(c_rarg3, Address(c_rarg2));
  585 
  586     // object is in r0
  587     // make sure object is 'reasonable'
  588     __ cbz(r0, exit); // if obj is null it is OK
  589 
  590     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  591     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  592 
  593     // return if everything seems ok
  594     __ bind(exit);
  595 
  596     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  597     __ ret(lr);
  598 
  599     // handle errors
  600     __ bind(error);
  601     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  602 
  603     __ push(RegSet::range(r0, r29), sp);
  604     // debug(char* msg, int64_t pc, int64_t regs[])
  605     __ mov(c_rarg0, rscratch1);      // pass address of error message
  606     __ mov(c_rarg1, lr);             // pass return address
  607     __ mov(c_rarg2, sp);             // pass address of regs on stack
  608 #ifndef PRODUCT
  609     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  610 #endif
  611     BLOCK_COMMENT("call MacroAssembler::debug");
  612     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  613     __ blr(rscratch1);
  614     __ hlt(0);
  615 
  616     return start;
  617   }
  618 
  619   // Generate indices for iota vector.
  620   address generate_iota_indices(StubId stub_id) {
  621     __ align(CodeEntryAlignment);
  622     StubCodeMark mark(this, stub_id);
  623     address start = __ pc();
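          // a table of consecutive lane indices for each element size (B, H, S, D),
          // followed by the same indices encoded as float and double constants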
  624     // B
  625     __ emit_data64(0x0706050403020100, relocInfo::none);
  626     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  627     // H
  628     __ emit_data64(0x0003000200010000, relocInfo::none);
  629     __ emit_data64(0x0007000600050004, relocInfo::none);
  630     // S
  631     __ emit_data64(0x0000000100000000, relocInfo::none);
  632     __ emit_data64(0x0000000300000002, relocInfo::none);
  633     // D
  634     __ emit_data64(0x0000000000000000, relocInfo::none);
  635     __ emit_data64(0x0000000000000001, relocInfo::none);
  636     // S - FP
  637     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  638     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  639     // D - FP
  640     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  641     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  642     return start;
  643   }
  644 
  645   // The inner part of zero_words().  This is the bulk operation,
  646   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  647   // caller is responsible for zeroing the last few words.
  648   //
  649   // Inputs:
  650   // r10: the HeapWord-aligned base address of an array to zero.
  651   // r11: the count in HeapWords, r11 > 0.
  652   //
  653   // Returns r10 and r11, adjusted for the caller to clear.
  654   // r10: the base address of the tail of words left to clear.
  655   // r11: the number of words in the tail.
  656   //      r11 < MacroAssembler::zero_words_block_size.
  657 
  658   address generate_zero_blocks() {
  659     Label done;
  660     Label base_aligned;
  661 
  662     Register base = r10, cnt = r11;
  663 
  664     __ align(CodeEntryAlignment);
  665     StubId stub_id = StubId::stubgen_zero_blocks_id;
  666     StubCodeMark mark(this, stub_id);
  667     address start = __ pc();
  668 
  669     if (UseBlockZeroing) {
  670       int zva_length = VM_Version::zva_length();
  671 
  672       // Ensure ZVA length can be divided by 16. This is required by
  673       // the subsequent operations.
  674       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  675 
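            // if base is only 8-byte aligned, zero one word so that the block
            // zeroing below sees a 16-byte aligned base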
  676       __ tbz(base, 3, base_aligned);
  677       __ str(zr, Address(__ post(base, 8)));
  678       __ sub(cnt, cnt, 1);
  679       __ bind(base_aligned);
  680 
  681       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  682       // alignment.
  683       Label small;
  684       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
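            // low_limit is in bytes but cnt is in words, hence the shift by 3 in the comparison below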
  685       __ subs(rscratch1, cnt, low_limit >> 3);
  686       __ br(Assembler::LT, small);
  687       __ zero_dcache_blocks(base, cnt);
  688       __ bind(small);
  689     }
  690 
  691     {
  692       // Number of stp instructions we'll unroll
  693       const int unroll =
  694         MacroAssembler::zero_words_block_size / 2;
  695       // Clear the remaining blocks.
  696       Label loop;
  697       __ subs(cnt, cnt, unroll * 2);
  698       __ br(Assembler::LT, done);
  699       __ bind(loop);
  700       for (int i = 0; i < unroll; i++)
  701         __ stp(zr, zr, __ post(base, 16));
  702       __ subs(cnt, cnt, unroll * 2);
  703       __ br(Assembler::GE, loop);
  704       __ bind(done);
  705       __ add(cnt, cnt, unroll * 2);
  706     }
  707 
  708     __ ret(lr);
  709 
  710     return start;
  711   }
  712 
  713 
  714   typedef enum {
  715     copy_forwards = 1,
  716     copy_backwards = -1
  717   } copy_direction;
  718 
  719   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  720   // for arraycopy stubs.
  721   class ArrayCopyBarrierSetHelper : StackObj {
  722     BarrierSetAssembler* _bs_asm;
  723     MacroAssembler* _masm;
  724     DecoratorSet _decorators;
  725     BasicType _type;
  726     Register _gct1;
  727     Register _gct2;
  728     Register _gct3;
  729     FloatRegister _gcvt1;
  730     FloatRegister _gcvt2;
  731     FloatRegister _gcvt3;
  732 
  733   public:
  734     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  735                               DecoratorSet decorators,
  736                               BasicType type,
  737                               Register gct1,
  738                               Register gct2,
  739                               Register gct3,
  740                               FloatRegister gcvt1,
  741                               FloatRegister gcvt2,
  742                               FloatRegister gcvt3)
  743       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  744         _masm(masm),
  745         _decorators(decorators),
  746         _type(type),
  747         _gct1(gct1),
  748         _gct2(gct2),
  749         _gct3(gct3),
  750         _gcvt1(gcvt1),
  751         _gcvt2(gcvt2),
  752         _gcvt3(gcvt3) {
  753     }
  754 
  755     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  756       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  757                             dst1, dst2, src,
  758                             _gct1, _gct2, _gcvt1);
  759     }
  760 
  761     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  762       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  763                              dst, src1, src2,
  764                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  765     }
  766 
  767     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  768       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  769                             dst1, dst2, src,
  770                             _gct1);
  771     }
  772 
  773     void copy_store_at_16(Address dst, Register src1, Register src2) {
  774       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  775                              dst, src1, src2,
  776                              _gct1, _gct2, _gct3);
  777     }
  778 
  779     void copy_load_at_8(Register dst, Address src) {
  780       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  781                             dst, noreg, src,
  782                             _gct1);
  783     }
  784 
  785     void copy_store_at_8(Address dst, Register src) {
  786       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  787                              dst, src, noreg,
  788                              _gct1, _gct2, _gct3);
  789     }
  790   };
  791 
  792   // Bulk copy of blocks of 8 words.
  793   //
  794   // count is a count of words.
  795   //
  796   // Precondition: count >= 8
  797   //
  798   // Postconditions:
  799   //
  800   // The least significant bit of count contains the remaining count
  801   // of words to copy.  The rest of count is trash.
  802   //
  803   // s and d are adjusted to point to the remaining words to copy
  804   //
  805   address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
  806     BasicType type;
  807     copy_direction direction;
  808 
  809     switch (stub_id) {
  810     case StubId::stubgen_copy_byte_f_id:
  811       direction = copy_forwards;
  812       type = T_BYTE;
  813       break;
  814     case StubId::stubgen_copy_byte_b_id:
  815       direction = copy_backwards;
  816       type = T_BYTE;
  817       break;
  818     case StubId::stubgen_copy_oop_f_id:
  819       direction = copy_forwards;
  820       type = T_OBJECT;
  821       break;
  822     case StubId::stubgen_copy_oop_b_id:
  823       direction = copy_backwards;
  824       type = T_OBJECT;
  825       break;
  826     case StubId::stubgen_copy_oop_uninit_f_id:
  827       direction = copy_forwards;
  828       type = T_OBJECT;
  829       break;
  830     case StubId::stubgen_copy_oop_uninit_b_id:
  831       direction = copy_backwards;
  832       type = T_OBJECT;
  833       break;
  834     default:
  835       ShouldNotReachHere();
  836     }
  837 
  838     int unit = wordSize * direction;
  839     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
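          // unit is the signed per-word step in bytes; for a forwards copy s and d are pre-biased
          // downwards by the size of the first paired access so that the same positive multiples
          // of unit used below work for both copy directions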
  840 
  841     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  842       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  843     const Register stride = r14;
  844     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  845     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  846     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  847 
  848     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  849     assert_different_registers(s, d, count, rscratch1, rscratch2);
  850 
  851     Label again, drain;
  852 
  853     __ align(CodeEntryAlignment);
  854 
  855     StubCodeMark mark(this, stub_id);
  856 
  857     address start = __ pc();
  858 
  859     Label unaligned_copy_long;
  860     if (AvoidUnalignedAccesses) {
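            // bit 3 of d set means the destination is 8- but not 16-byte aligned;
            // take the variant below, which keeps every paired store 16-byte aligned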
  861       __ tbnz(d, 3, unaligned_copy_long);
  862     }
  863 
  864     if (direction == copy_forwards) {
  865       __ sub(s, s, bias);
  866       __ sub(d, d, bias);
  867     }
  868 
  869 #ifdef ASSERT
  870     // Make sure we are never given < 8 words
  871     {
  872       Label L;
  873       __ cmp(count, (u1)8);
  874       __ br(Assembler::GE, L);
  875       __ stop("generate_copy_longs called with < 8 words");
  876       __ bind(L);
  877     }
  878 #endif
  879 
  880     // Fill 8 registers
  881     if (UseSIMDForMemoryOps) {
  882       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  883       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  884     } else {
  885       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  886       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  887       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  888       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  889     }
  890 
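          // 8 words have been loaded; if fewer than 16 words remain in total,
          // skip the main loop and just drain what we have loaded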
  891     __ subs(count, count, 16);
  892     __ br(Assembler::LO, drain);
  893 
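          // prefetch well ahead of the copy; for a backwards copy the offset is negative,
          // and if its magnitude exceeds the 256-byte immediate range it is kept in a
          // register (stride) instead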
  894     int prefetch = PrefetchCopyIntervalInBytes;
  895     bool use_stride = false;
  896     if (direction == copy_backwards) {
  897       use_stride = prefetch > 256;
  898       prefetch = -prefetch;
  899       if (use_stride) __ mov(stride, prefetch);
  900     }
  901 
  902     __ bind(again);
  903 
  904     if (PrefetchCopyIntervalInBytes > 0)
  905       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  906 
  907     if (UseSIMDForMemoryOps) {
  908       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  909       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  910       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  911       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  912     } else {
  913       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  914       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  915       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  916       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  917       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  918       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  919       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  920       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  921     }
  922 
  923     __ subs(count, count, 8);
  924     __ br(Assembler::HS, again);
  925 
  926     // Drain
  927     __ bind(drain);
  928     if (UseSIMDForMemoryOps) {
  929       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  930       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  931     } else {
  932       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  933       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  934       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936     }
  937 
  938     {
  939       Label L1, L2;
  940       __ tbz(count, exact_log2(4), L1);
  941       if (UseSIMDForMemoryOps) {
  942         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  943         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  944       } else {
  945         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  946         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  947         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  948         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  949       }
  950       __ bind(L1);
  951 
  952       if (direction == copy_forwards) {
  953         __ add(s, s, bias);
  954         __ add(d, d, bias);
  955       }
  956 
  957       __ tbz(count, 1, L2);
  958       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  959       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  960       __ bind(L2);
  961     }
  962 
  963     __ ret(lr);
  964 
  965     if (AvoidUnalignedAccesses) {
  966       Label drain, again;
  967       // Register order for storing. Order is different for backward copy.
  968 
  969       __ bind(unaligned_copy_long);
  970 
  971       // source address is even aligned, target odd aligned
  972       //
  973       // when forward copying word pairs we read long pairs at offsets
  974       // {0, 2, 4, 6} (in long words). when backwards copying we read
  975       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  976       // address by -2 in the forwards case so we can compute the
  977       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  978       // or -1.
  979       //
  980       // when forward copying we need to store 1 word, 3 pairs and
  981       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
  982       // zero offset we adjust the destination by -1 which means we
  983       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
  984       //
  985       // when backwards copying we need to store 1 word, 3 pairs and
  986       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
  987       // offsets {1, 3, 5, 7, 8} * unit.
  988 
  989       if (direction == copy_forwards) {
  990         __ sub(s, s, 16);
  991         __ sub(d, d, 8);
  992       }
  993 
  994       // Fill 8 registers
  995       //
  996       // for forwards copy s was offset by -16 from the original input
  997       // value of s so the register contents are at these offsets
  998       // relative to the 64 byte block addressed by that original input
  999       // and so on for each successive 64 byte block when s is updated
 1000       //
 1001       // t0 at offset 0,  t1 at offset 8
 1002       // t2 at offset 16, t3 at offset 24
 1003       // t4 at offset 32, t5 at offset 40
 1004       // t6 at offset 48, t7 at offset 56
 1005 
 1006       // for backwards copy s was not offset so the register contents
 1007       // are at these offsets into the preceding 64 byte block
 1008       // relative to that original input and so on for each successive
 1009       // preceding 64 byte block when s is updated. this explains the
 1010       // slightly counter-intuitive looking pattern of register usage
 1011       // in the stp instructions for backwards copy.
 1012       //
 1013       // t0 at offset -16, t1 at offset -8
 1014       // t2 at offset -32, t3 at offset -24
 1015       // t4 at offset -48, t5 at offset -40
 1016       // t6 at offset -64, t7 at offset -56
 1017 
 1018       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1019       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1020       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1021       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1022 
 1023       __ subs(count, count, 16);
 1024       __ br(Assembler::LO, drain);
 1025 
 1026       int prefetch = PrefetchCopyIntervalInBytes;
 1027       bool use_stride = false;
 1028       if (direction == copy_backwards) {
 1029         use_stride = prefetch > 256;
 1030         prefetch = -prefetch;
 1031         if (use_stride) __ mov(stride, prefetch);
 1032       }
 1033 
 1034       __ bind(again);
 1035 
 1036       if (PrefetchCopyIntervalInBytes > 0)
 1037         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1038 
 1039       if (direction == copy_forwards) {
 1040         // allowing for the offset of -8 the store instructions place
 1041         // registers into the target 64 byte block at the following
 1042         // offsets
 1043         //
 1044         // t0 at offset 0
 1045         // t1 at offset 8,  t2 at offset 16
 1046         // t3 at offset 24, t4 at offset 32
 1047         // t5 at offset 40, t6 at offset 48
 1048         // t7 at offset 56
 1049 
 1050         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1051         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1052         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1053         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1054         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1055         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1056         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1057         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1058         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1059       } else {
 1060         // d was not offset when we started so the registers are
 1061         // written into the 64 byte block preceding d with the following
 1062         // offsets
 1063         //
 1064         // t1 at offset -8
 1065         // t3 at offset -24, t0 at offset -16
 1066         // t5 at offset -40, t2 at offset -32
 1067         // t7 at offset -56, t4 at offset -48
 1068         //                   t6 at offset -64
 1069         //
 1070         // note that this matches the offsets previously noted for the
 1071         // loads
 1072 
 1073         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1074         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1075         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1076         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1077         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1078         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1079         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1080         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1081         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1082       }
 1083 
 1084       __ subs(count, count, 8);
 1085       __ br(Assembler::HS, again);
 1086 
 1087       // Drain
 1088       //
 1089       // this uses the same pattern of offsets and register arguments
 1090       // as above
 1091       __ bind(drain);
 1092       if (direction == copy_forwards) {
 1093         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1094         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1095         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1096         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1097         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1098       } else {
 1099         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1100         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1101         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1102         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1103         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1104       }
 1105       // now we need to copy any remaining part block which may
 1106       // include a 4 word subblock and/or a 2 word subblock.
 1107       // bits 2 and 1 in the count are the tell-tale for whether we
 1108       // have each such subblock
 1109       {
 1110         Label L1, L2;
 1111         __ tbz(count, exact_log2(4), L1);
 1112         // this is the same as above but copying only 4 longs hence
 1113         // with only one intervening stp between the str instructions
 1114         // but note that the offsets and registers still follow the
 1115         // same pattern
 1116         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1117         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1118         if (direction == copy_forwards) {
 1119           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1120           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1121           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1122         } else {
 1123           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1124           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1125           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1126         }
 1127         __ bind(L1);
 1128 
 1129         __ tbz(count, 1, L2);
 1130         // this is the same as above but copying only 2 longs hence
 1131         // there is no intervening stp between the str instructions
 1132         // but note that the offset and register patterns are still
 1133         // the same
 1134         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1135         if (direction == copy_forwards) {
 1136           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1137           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1141         }
 1142         __ bind(L2);
 1143 
 1144         // for forwards copy we need to re-adjust the offsets we
 1145       // applied so that s and d follow the last words written
 1146 
 1147         if (direction == copy_forwards) {
 1148           __ add(s, s, 16);
 1149           __ add(d, d, 8);
 1150         }
 1151 
 1152       }
 1153 
 1154       __ ret(lr);
 1155     }
 1156 
 1157     return start;
 1158   }
 1159 
 1160   // Small copy: less than 16 bytes.
 1161   //
 1162   // NB: Ignores all of the bits of count which represent more than 15
 1163   // bytes, so a caller doesn't have to mask them.
 1164 
 1165   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1166     bool is_backwards = step < 0;
 1167     size_t granularity = g_uabs(step);
 1168     int direction = is_backwards ? -1 : 1;
 1169 
 1170     Label Lword, Lint, Lshort, Lbyte;
 1171 
 1172     assert(granularity
 1173            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1174 
 1175     const Register t0 = r3;
 1176     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1177     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1178 
 1179     // ??? I don't know if this bit-test-and-branch is the right thing
 1180     // to do.  It does a lot of jumping, resulting in several
 1181     // mispredicted branches.  It might make more sense to do this
 1182     // with something like Duff's device with a single computed branch.
 1183 
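          // test bits of count from the 8-byte bit downwards, copying 8, 4, 2 and then 1
          // bytes as each bit dictates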
 1184     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1185     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1186     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1187     __ bind(Lword);
 1188 
 1189     if (granularity <= sizeof (jint)) {
 1190       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1191       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1192       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1193       __ bind(Lint);
 1194     }
 1195 
 1196     if (granularity <= sizeof (jshort)) {
 1197       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1198       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1199       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1200       __ bind(Lshort);
 1201     }
 1202 
 1203     if (granularity <= sizeof (jbyte)) {
 1204       __ tbz(count, 0, Lbyte);
 1205       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1206       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1207       __ bind(Lbyte);
 1208     }
 1209   }
 1210 
 1211   // All-singing all-dancing memory copy.
 1212   //
 1213   // Copy count units of memory from s to d.  The size of a unit is
 1214   // step, which can be positive or negative depending on the direction
 1215   // of copy.  If is_aligned is false, we align the source address.
 1216   //
 1217 
 1218   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1219                    Register s, Register d, Register count, int step) {
 1220     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1221     bool is_backwards = step < 0;
 1222     unsigned int granularity = g_uabs(step);
 1223     const Register t0 = r3, t1 = r4;
 1224 
 1225     // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't matter
 1226     // because we always load all the data before writing anything
 1227     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1228     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1229     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1230     const Register send = r17, dend = r16;
 1231     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1232     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1233     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1234 
 1235     if (PrefetchCopyIntervalInBytes > 0)
 1236       __ prfm(Address(s, 0), PLDL1KEEP);
 1237     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1238     __ br(Assembler::HI, copy_big);
 1239 
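          // send and dend point just past the last source and destination elements;
          // the small-copy cases below address the tail via negative offsets from them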
 1240     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1241     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1242 
 1243     __ cmp(count, u1(16/granularity));
 1244     __ br(Assembler::LS, copy16);
 1245 
 1246     __ cmp(count, u1(64/granularity));
 1247     __ br(Assembler::HI, copy80);
 1248 
 1249     __ cmp(count, u1(32/granularity));
 1250     __ br(Assembler::LS, copy32);
 1251 
 1252     // 33..64 bytes
 1253     if (UseSIMDForMemoryOps) {
 1254       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1255       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1256       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1257       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1258     } else {
 1259       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1260       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1261       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1262       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1263 
 1264       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1265       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1266       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1267       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1268     }
 1269     __ b(finish);
 1270 
 1271     // 17..32 bytes
 1272     __ bind(copy32);
 1273     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1274     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1275 
 1276     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1277     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1278     __ b(finish);
 1279 
 1280     // 65..80/96 bytes
 1281     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1282     __ bind(copy80);
 1283     if (UseSIMDForMemoryOps) {
 1284       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1285       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1286       // Unaligned pointers can be an issue for copying.
 1287       // The issue is more likely when the granularity of the data is
 1288       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
 1289       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1290       // The largest performance drop has been seen for the range 65-80 bytes.
 1291       // For such cases using the pair of ldp/stp instead of the third pair of
 1292       // ldpq/stpq fixes the performance issue.
 1293       if (granularity < sizeof (jint)) {
 1294         Label copy96;
 1295         __ cmp(count, u1(80/granularity));
 1296         __ br(Assembler::HI, copy96);
 1297         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1298 
 1299         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1300         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1301 
 1302         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1303         __ b(finish);
 1304 
 1305         __ bind(copy96);
 1306       }
 1307       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1308 
 1309       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1310       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1311 
 1312       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1313     } else {
 1314       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1315       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1316       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1317       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1318       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1319 
 1320       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1321       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1322       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1323       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1324       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1325     }
 1326     __ b(finish);
 1327 
 1328     // 0..16 bytes
 1329     __ bind(copy16);
 1330     __ cmp(count, u1(8/granularity));
 1331     __ br(Assembler::LO, copy8);
 1332 
 1333     // 8..16 bytes
 1334     bs.copy_load_at_8(t0, Address(s, 0));
 1335     bs.copy_load_at_8(t1, Address(send, -8));
 1336     bs.copy_store_at_8(Address(d, 0), t0);
 1337     bs.copy_store_at_8(Address(dend, -8), t1);
 1338     __ b(finish);
 1339 
 1340     if (granularity < 8) {
 1341       // 4..7 bytes
 1342       __ bind(copy8);
 1343       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1344       __ ldrw(t0, Address(s, 0));
 1345       __ ldrw(t1, Address(send, -4));
 1346       __ strw(t0, Address(d, 0));
 1347       __ strw(t1, Address(dend, -4));
 1348       __ b(finish);
 1349       if (granularity < 4) {
 1350         // 0..3 bytes
 1351         __ bind(copy4);
 1352         __ cbz(count, finish); // get rid of 0 case
 1353         if (granularity == 2) {
 1354           __ ldrh(t0, Address(s, 0));
 1355           __ strh(t0, Address(d, 0));
 1356         } else { // granularity == 1
 1357           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1358           // the first and last byte.
 1359           // Handle the 3 byte case by loading and storing base + count/2
 1360           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1361           // This does mean that in the 1 byte case we load/store the same
 1362           // byte 3 times.
 1363           __ lsr(count, count, 1);
 1364           __ ldrb(t0, Address(s, 0));
 1365           __ ldrb(t1, Address(send, -1));
 1366           __ ldrb(t2, Address(s, count));
 1367           __ strb(t0, Address(d, 0));
 1368           __ strb(t1, Address(dend, -1));
 1369           __ strb(t2, Address(d, count));
 1370         }
 1371         __ b(finish);
 1372       }
 1373     }
 1374 
 1375     __ bind(copy_big);
 1376     if (is_backwards) {
 1377       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1378       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1379     }
 1380 
 1381     // Now that we've got the small case out of the way, we can align the
 1382     // source address on a 2-word boundary.
 1383 
 1384     // Here we will materialize a count in r15, which is used by copy_memory_small
 1385     // and the various generate_copy_longs stubs that we use once s is 2-word aligned.
 1386     // Up until here we have used t9, which aliases r15, but from here on that register
 1387     // cannot be used as a temp register, as it contains the count.
 1388 
 1389     Label aligned;
 1390 
 1391     if (is_aligned) {
 1392       // We may have to adjust by 1 word to get s 2-word-aligned.
 1393       __ tbz(s, exact_log2(wordSize), aligned);
 1394       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1395       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1396       __ sub(count, count, wordSize/granularity);
 1397     } else {
 1398       if (is_backwards) {
 1399         __ andr(r15, s, 2 * wordSize - 1);
 1400       } else {
 1401         __ neg(r15, s);
 1402         __ andr(r15, r15, 2 * wordSize - 1);
 1403       }
 1404       // r15 is the byte adjustment needed to align s.
 1405       __ cbz(r15, aligned);
 1406       int shift = exact_log2(granularity);
 1407       if (shift > 0) {
 1408         __ lsr(r15, r15, shift);
 1409       }
 1410       __ sub(count, count, r15);
 1411 
 1412 #if 0
 1413       // ?? This code is only correct for a disjoint copy.  It may or
 1414       // may not make sense to use it in that case.
 1415 
 1416       // Copy the first pair; s and d may not be aligned.
 1417       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1418       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1419 
 1420       // Align s and d, adjust count
 1421       if (is_backwards) {
 1422         __ sub(s, s, r15);
 1423         __ sub(d, d, r15);
 1424       } else {
 1425         __ add(s, s, r15);
 1426         __ add(d, d, r15);
 1427       }
 1428 #else
 1429       copy_memory_small(decorators, type, s, d, r15, step);
 1430 #endif
 1431     }
 1432 
 1433     __ bind(aligned);
 1434 
 1435     // s is now 2-word-aligned.
 1436 
 1437     // We have a count of units and some trailing bytes. Adjust the
 1438     // count and do a bulk copy of words. If the shift is zero
 1439     // perform a move instead to benefit from zero latency moves.
 1440     int shift = exact_log2(wordSize/granularity);
 1441     if (shift > 0) {
 1442       __ lsr(r15, count, shift);
 1443     } else {
 1444       __ mov(r15, count);
 1445     }
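          // r15 now holds the number of 8-byte words to bulk-copy
          // (e.g. count >> 3 for a byte copy, where shift == 3).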
 1446     if (direction == copy_forwards) {
 1447       if (type != T_OBJECT) {
 1448         __ bl(StubRoutines::aarch64::copy_byte_f());
 1449       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1450         __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
 1451       } else {
 1452         __ bl(StubRoutines::aarch64::copy_oop_f());
 1453       }
 1454     } else {
 1455       if (type != T_OBJECT) {
 1456         __ bl(StubRoutines::aarch64::copy_byte_b());
 1457       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1458         __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
 1459       } else {
 1460         __ bl(StubRoutines::aarch64::copy_oop_b());
 1461       }
 1462     }
 1463 
 1464     // And the tail.
 1465     copy_memory_small(decorators, type, s, d, count, step);
 1466 
 1467     if (granularity >= 8) __ bind(copy8);
 1468     if (granularity >= 4) __ bind(copy4);
 1469     __ bind(finish);
 1470   }
 1471 
 1472 
 1473   void clobber_registers() {
 1474 #ifdef ASSERT
 1475     RegSet clobbered
 1476       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1477     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1478     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1479     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1480       __ mov(*it, rscratch1);
 1481     }
 1482 #endif
 1483 
 1484   }
 1485 
 1486   // Scan over array at a for count oops, verifying each one.
 1487   // Preserves a and count, clobbers rscratch1 and rscratch2.
 1488   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1489     Label loop, end;
 1490     __ mov(rscratch1, a);
 1491     __ mov(rscratch2, zr);
 1492     __ bind(loop);
 1493     __ cmp(rscratch2, count);
 1494     __ br(Assembler::HS, end);
 1495     if (size == wordSize) {
 1496       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1497       __ verify_oop(temp);
 1498     } else {
 1499       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1500       __ decode_heap_oop(temp); // calls verify_oop
 1501     }
 1502     __ add(rscratch2, rscratch2, 1);
 1503     __ b(loop);
 1504     __ bind(end);
 1505   }
 1506 
 1507   // Arguments:
 1508   //   stub_id - is used to name the stub and identify all details of
 1509   //             how to perform the copy.
 1510   //
 1511   //   nopush_entry - is assigned the stub's post-push entry point
 1512   //           unless it is null
 1513   //
 1514   // Inputs:
 1515   //   c_rarg0   - source array address
 1516   //   c_rarg1   - destination array address
 1517   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1518   //
 1519   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1520   // the hardware handle it.  The two dwords within qwords that span
 1521   // cache line boundaries will still be loaded and stored atomically.
 1522   //
 1523   // Side Effects: nopush_entry is set to the (post push) entry point
 1524   //               so it can be used by the corresponding conjoint
 1525   //               copy method
 1526   //
 1527   address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
 1528     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1529     RegSet saved_reg = RegSet::of(s, d, count);
 1530     int size;
 1531     bool aligned;
 1532     bool is_oop;
 1533     bool dest_uninitialized;
 1534     switch (stub_id) {
 1535     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1536       size = sizeof(jbyte);
 1537       aligned = false;
 1538       is_oop = false;
 1539       dest_uninitialized = false;
 1540       break;
 1541     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1542       size = sizeof(jbyte);
 1543       aligned = true;
 1544       is_oop = false;
 1545       dest_uninitialized = false;
 1546       break;
 1547     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1548       size = sizeof(jshort);
 1549       aligned = false;
 1550       is_oop = false;
 1551       dest_uninitialized = false;
 1552       break;
 1553     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1554       size = sizeof(jshort);
 1555       aligned = true;
 1556       is_oop = false;
 1557       dest_uninitialized = false;
 1558       break;
 1559     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1560       size = sizeof(jint);
 1561       aligned = false;
 1562       is_oop = false;
 1563       dest_uninitialized = false;
 1564       break;
 1565     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1566       size = sizeof(jint);
 1567       aligned = true;
 1568       is_oop = false;
 1569       dest_uninitialized = false;
 1570       break;
 1571     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1572       // since this is always aligned we can (should!) use the same
 1573       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1574       ShouldNotReachHere();
 1575       break;
 1576     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1577       size = sizeof(jlong);
 1578       aligned = true;
 1579       is_oop = false;
 1580       dest_uninitialized = false;
 1581       break;
 1582     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1583       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1584       aligned = !UseCompressedOops;
 1585       is_oop = true;
 1586       dest_uninitialized = false;
 1587       break;
 1588     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1589       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1590       aligned = !UseCompressedOops;
 1591       is_oop = true;
 1592       dest_uninitialized = false;
 1593       break;
 1594     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1595       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1596       aligned = !UseCompressedOops;
 1597       is_oop = true;
 1598       dest_uninitialized = true;
 1599       break;
 1600     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1601       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1602       aligned = !UseCompressedOops;
 1603       is_oop = true;
 1604       dest_uninitialized = true;
 1605       break;
 1606     default:
 1607       ShouldNotReachHere();
 1608       break;
 1609     }
 1610 
 1611     __ align(CodeEntryAlignment);
 1612     StubCodeMark mark(this, stub_id);
 1613     address start = __ pc();
 1614     __ enter();
 1615 
 1616     if (nopush_entry != nullptr) {
 1617       *nopush_entry = __ pc();
 1618       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1619       BLOCK_COMMENT("Entry:");
 1620     }
 1621 
 1622     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1623     if (dest_uninitialized) {
 1624       decorators |= IS_DEST_UNINITIALIZED;
 1625     }
 1626     if (aligned) {
 1627       decorators |= ARRAYCOPY_ALIGNED;
 1628     }
 1629 
 1630     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1631     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1632 
 1633     if (is_oop) {
 1634       // save regs before copy_memory
 1635       __ push(RegSet::of(d, count), sp);
 1636     }
 1637     {
 1638       // UnsafeMemoryAccess page error: continue after unsafe access
 1639       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1640       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1641       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1642     }
 1643 
 1644     if (is_oop) {
 1645       __ pop(RegSet::of(d, count), sp);
 1646       if (VerifyOops)
 1647         verify_oop_array(size, d, count, r16);
 1648     }
 1649 
 1650     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1651 
 1652     __ leave();
 1653     __ mov(r0, zr); // return 0
 1654     __ ret(lr);
 1655     return start;
 1656   }
 1657 
 1658   // Arguments:
 1659   //   stub_id - is used to name the stub and identify all details of
 1660   //             how to perform the copy.
 1661   //
 1662   //   nooverlap_target - identifies the (post-push) entry for the
 1663   //             corresponding disjoint copy routine which can be
 1664   //             jumped to if the ranges do not actually overlap
 1665   //
 1666   //   nopush_entry - is assigned the stub's post-push entry point
 1667   //           unless it is null
 1668   //
 1669   //
 1670   // Inputs:
 1671   //   c_rarg0   - source array address
 1672   //   c_rarg1   - destination array address
 1673   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1674   //
 1675   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1676   // the hardware handle it.  The two dwords within qwords that span
 1677   // cache line boundaries will still be loaded and stored atomically.
 1678   //
 1679   // Side Effects:
 1680   //   nopush_entry is set to the no-overlap entry point so it can be
 1681   //   used by some other conjoint copy method
 1682   //
 1683   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
 1684     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1685     RegSet saved_regs = RegSet::of(s, d, count);
 1686     int size;
 1687     bool aligned;
 1688     bool is_oop;
 1689     bool dest_uninitialized;
 1690     switch (stub_id) {
 1691     case StubId::stubgen_jbyte_arraycopy_id:
 1692       size = sizeof(jbyte);
 1693       aligned = false;
 1694       is_oop = false;
 1695       dest_uninitialized = false;
 1696       break;
 1697     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 1698       size = sizeof(jbyte);
 1699       aligned = true;
 1700       is_oop = false;
 1701       dest_uninitialized = false;
 1702       break;
 1703     case StubId::stubgen_jshort_arraycopy_id:
 1704       size = sizeof(jshort);
 1705       aligned = false;
 1706       is_oop = false;
 1707       dest_uninitialized = false;
 1708       break;
 1709     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 1710       size = sizeof(jshort);
 1711       aligned = true;
 1712       is_oop = false;
 1713       dest_uninitialized = false;
 1714       break;
 1715     case StubId::stubgen_jint_arraycopy_id:
 1716       size = sizeof(jint);
 1717       aligned = false;
 1718       is_oop = false;
 1719       dest_uninitialized = false;
 1720       break;
 1721     case StubId::stubgen_arrayof_jint_arraycopy_id:
 1722       size = sizeof(jint);
 1723       aligned = true;
 1724       is_oop = false;
 1725       dest_uninitialized = false;
 1726       break;
 1727     case StubId::stubgen_jlong_arraycopy_id:
 1728       // since this is always aligned we can (should!) use the same
 1729       // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
 1730       ShouldNotReachHere();
 1731       break;
 1732     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 1733       size = sizeof(jlong);
 1734       aligned = true;
 1735       is_oop = false;
 1736       dest_uninitialized = false;
 1737       break;
 1738     case StubId::stubgen_oop_arraycopy_id:
 1739       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1740       aligned = !UseCompressedOops;
 1741       is_oop = true;
 1742       dest_uninitialized = false;
 1743       break;
 1744     case StubId::stubgen_arrayof_oop_arraycopy_id:
 1745       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1746       aligned = !UseCompressedOops;
 1747       is_oop = true;
 1748       dest_uninitialized = false;
 1749       break;
 1750     case StubId::stubgen_oop_arraycopy_uninit_id:
 1751       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1752       aligned = !UseCompressedOops;
 1753       is_oop = true;
 1754       dest_uninitialized = true;
 1755       break;
 1756     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 1757       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1758       aligned = !UseCompressedOops;
 1759       is_oop = true;
 1760       dest_uninitialized = true;
 1761       break;
 1762     default:
 1763       ShouldNotReachHere();
 1764     }
 1765 
 1766     StubCodeMark mark(this, stub_id);
 1767     address start = __ pc();
 1768     __ enter();
 1769 
 1770     if (nopush_entry != nullptr) {
 1771       *nopush_entry = __ pc();
 1772       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1773       BLOCK_COMMENT("Entry:");
 1774     }
 1775 
 1776     // use fwd copy when (d-s) above_equal (count*size)
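          // The comparison below is unsigned, so if d is below s the subtraction
          // wraps to a large value and we still take the forward (disjoint) path.
          // In effect (a C sketch):
          //   if ((uint64_t)(d - s) >= ((uint64_t)count << exact_log2(size))) goto nooverlap_target;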
 1777     Label L_overlapping;
 1778     __ sub(rscratch1, d, s);
 1779     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1780     __ br(Assembler::LO, L_overlapping);
 1781     __ b(RuntimeAddress(nooverlap_target));
 1782     __ bind(L_overlapping);
 1783 
 1784     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1785     if (dest_uninitialized) {
 1786       decorators |= IS_DEST_UNINITIALIZED;
 1787     }
 1788     if (aligned) {
 1789       decorators |= ARRAYCOPY_ALIGNED;
 1790     }
 1791 
 1792     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1793     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1794 
 1795     if (is_oop) {
 1796       // save regs before copy_memory
 1797       __ push(RegSet::of(d, count), sp);
 1798     }
 1799     {
 1800       // UnsafeMemoryAccess page error: continue after unsafe access
 1801       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1802       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1803       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1804     }
 1805     if (is_oop) {
 1806       __ pop(RegSet::of(d, count), sp);
 1807       if (VerifyOops)
 1808         verify_oop_array(size, d, count, r16);
 1809     }
 1810     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1811     __ leave();
 1812     __ mov(r0, zr); // return 0
 1813     __ ret(lr);
 1814     return start;
 1815   }
 1816 
 1817   // Helper for generating a dynamic type check.
 1818   // Smashes rscratch1, rscratch2.
 1819   void generate_type_check(Register sub_klass,
 1820                            Register super_check_offset,
 1821                            Register super_klass,
 1822                            Register temp1,
 1823                            Register temp2,
 1824                            Register result,
 1825                            Label& L_success) {
 1826     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1827 
 1828     BLOCK_COMMENT("type_check:");
 1829 
 1830     Label L_miss;
 1831 
 1832     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1833                                      super_check_offset);
 1834     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1835 
 1836     // Fall through on failure!
 1837     __ BIND(L_miss);
 1838   }
 1839 
 1840   //
 1841   //  Generate checkcasting array copy stub
 1842   //
 1843   //  Input:
 1844   //    c_rarg0   - source array address
 1845   //    c_rarg1   - destination array address
 1846   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1847   //    c_rarg3   - size_t ckoff (super_check_offset)
 1848   //    c_rarg4   - oop ckval (super_klass)
 1849   //
 1850   //  Output:
 1851   //    r0 ==  0  -  success
 1852   //    r0 == -1^K - failure, where K is partial transfer count
 1853   //
 1854   address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
 1855     bool dest_uninitialized;
 1856     switch (stub_id) {
 1857     case StubId::stubgen_checkcast_arraycopy_id:
 1858       dest_uninitialized = false;
 1859       break;
 1860     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 1861       dest_uninitialized = true;
 1862       break;
 1863     default:
 1864       ShouldNotReachHere();
 1865     }
 1866 
 1867     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1868 
 1869     // Input registers (after setup_arg_regs)
 1870     const Register from        = c_rarg0;   // source array address
 1871     const Register to          = c_rarg1;   // destination array address
 1872     const Register count       = c_rarg2;   // elements count
 1873     const Register ckoff       = c_rarg3;   // super_check_offset
 1874     const Register ckval       = c_rarg4;   // super_klass
 1875 
 1876     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1877 
 1878     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1879     const Register copied_oop  = r22;       // actual oop copied
 1880     const Register count_save  = r21;       // orig elements count
 1881     const Register start_to    = r20;       // destination array start address
 1882     const Register r19_klass   = r19;       // oop._klass
 1883 
 1884     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1885     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1886 
 1887     //---------------------------------------------------------------
 1888     // Assembler stub will be used for this call to arraycopy
 1889     // if the two arrays are subtypes of Object[] but the
 1890     // destination array type is not equal to or a supertype
 1891     // of the source type.  Each element must be separately
 1892     // checked.
 1893 
 1894     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1895                                copied_oop, r19_klass, count_save);
 1896 
 1897     __ align(CodeEntryAlignment);
 1898     StubCodeMark mark(this, stub_id);
 1899     address start = __ pc();
 1900 
 1901     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1902 
 1903 #ifdef ASSERT
 1904     // caller guarantees that the arrays really are different
 1905     // otherwise, we would have to make conjoint checks
 1906     { Label L;
 1907       __ b(L);                  // conjoint check not yet implemented
 1908       __ stop("checkcast_copy within a single array");
 1909       __ bind(L);
 1910     }
 1911 #endif //ASSERT
 1912 
 1913     // Caller of this entry point must set up the argument registers.
 1914     if (nopush_entry != nullptr) {
 1915       *nopush_entry = __ pc();
 1916       BLOCK_COMMENT("Entry:");
 1917     }
 1918 
 1919     // Empty array: nothing to do.
 1920     __ cbz(count, L_done);
 1921     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1922 
 1923 #ifdef ASSERT
 1924     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1925     // The ckoff and ckval must be mutually consistent,
 1926     // even though caller generates both.
 1927     { Label L;
 1928       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1929       __ ldrw(start_to, Address(ckval, sco_offset));
 1930       __ cmpw(ckoff, start_to);
 1931       __ br(Assembler::EQ, L);
 1932       __ stop("super_check_offset inconsistent");
 1933       __ bind(L);
 1934     }
 1935 #endif //ASSERT
 1936 
 1937     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1938     bool is_oop = true;
 1939     int element_size = UseCompressedOops ? 4 : 8;
 1940     if (dest_uninitialized) {
 1941       decorators |= IS_DEST_UNINITIALIZED;
 1942     }
 1943 
 1944     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1945     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1946 
 1947     // save the original count
 1948     __ mov(count_save, count);
 1949 
 1950     // Copy from low to high addresses
 1951     __ mov(start_to, to);              // Save destination array start address
 1952     __ b(L_load_element);
 1953 
 1954     // ======== begin loop ========
 1955     // (Loop is rotated; its entry is L_load_element.)
 1956     // Loop control:
 1957     //   for (; count != 0; count--) {
 1958     //     copied_oop = load_heap_oop(from++);
 1959     //     ... generate_type_check ...;
 1960     //     store_heap_oop(to++, copied_oop);
 1961     //   }
 1962     __ align(OptoLoopAlignment);
 1963 
 1964     __ BIND(L_store_element);
 1965     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1966                       __ post(to, element_size), copied_oop, noreg,
 1967                       gct1, gct2, gct3);
 1968     __ sub(count, count, 1);
 1969     __ cbz(count, L_do_card_marks);
 1970 
 1971     // ======== loop entry is here ========
 1972     __ BIND(L_load_element);
 1973     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1974                      copied_oop, noreg, __ post(from, element_size),
 1975                      gct1);
 1976     __ cbz(copied_oop, L_store_element);
 1977 
 1978     __ load_klass(r19_klass, copied_oop);// query the object klass
 1979 
 1980     BLOCK_COMMENT("type_check:");
 1981     generate_type_check(/*sub_klass*/r19_klass,
 1982                         /*super_check_offset*/ckoff,
 1983                         /*super_klass*/ckval,
 1984                         /*r_array_base*/gct1,
 1985                         /*temp2*/gct2,
 1986                         /*result*/r10, L_store_element);
 1987 
 1988     // Fall through on failure!
 1989 
 1990     // ======== end loop ========
 1991 
 1992     // It was a real error; we must depend on the caller to finish the job.
 1993     // Register count = remaining oops, count_orig = total oops.
 1994     // Emit GC store barriers for the oops we have copied and report
 1995     // their number to the caller.
 1996 
 1997     __ subs(count, count_save, count);     // K = partially copied oop count
 1998     __ eon(count, count, zr);              // report (-1^K) to caller
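          // (eon with zr is bitwise NOT, so count now holds ~K, i.e. -1 ^ K)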
 1999     __ br(Assembler::EQ, L_done_pop);
 2000 
 2001     __ BIND(L_do_card_marks);
 2002     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
 2003 
 2004     __ bind(L_done_pop);
 2005     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2006     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2007 
 2008     __ bind(L_done);
 2009     __ mov(r0, count);
 2010     __ leave();
 2011     __ ret(lr);
 2012 
 2013     return start;
 2014   }
 2015 
 2016   // Perform range checks on the proposed arraycopy.
 2017   // Kills temp, but nothing else.
 2018   // Also, clean the sign bits of src_pos and dst_pos.
 2019   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2020                               Register src_pos, // source position (c_rarg1)
 2021                               Register dst,     // destination array oop (c_rarg2)
 2022                               Register dst_pos, // destination position (c_rarg3)
 2023                               Register length,
 2024                               Register temp,
 2025                               Label& L_failed) {
 2026     BLOCK_COMMENT("arraycopy_range_checks:");
 2027 
 2028     assert_different_registers(rscratch1, temp);
 2029 
 2030     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2031     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2032     __ addw(temp, length, src_pos);
 2033     __ cmpw(temp, rscratch1);
 2034     __ br(Assembler::HI, L_failed);
 2035 
 2036     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2037     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2038     __ addw(temp, length, dst_pos);
 2039     __ cmpw(temp, rscratch1);
 2040     __ br(Assembler::HI, L_failed);
 2041 
 2042     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2043     __ movw(src_pos, src_pos);
 2044     __ movw(dst_pos, dst_pos);
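          // (a 32-bit register-to-itself move zero-extends, clearing bits 63:32)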
 2045 
 2046     BLOCK_COMMENT("arraycopy_range_checks done");
 2047   }
 2048 
 2049   // These stubs get called from some dumb test routine.
 2050   // I'll write them properly when they're called from
 2051   // something that's actually doing something.
 2052   static void fake_arraycopy_stub(address src, address dst, int count) {
 2053     assert(count == 0, "huh?");
 2054   }
 2055 
 2056 
 2057   //
 2058   //  Generate 'unsafe' array copy stub
 2059   //  Though just as safe as the other stubs, it takes an unscaled
 2060   //  size_t argument instead of an element count.
 2061   //
 2062   //  Input:
 2063   //    c_rarg0   - source array address
 2064   //    c_rarg1   - destination array address
 2065   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2066   //
 2067   // Examines the alignment of the operands and dispatches
 2068   // to a long, int, short, or byte copy loop.
 2069   //
 2070   address generate_unsafe_copy(address byte_copy_entry,
 2071                                address short_copy_entry,
 2072                                address int_copy_entry,
 2073                                address long_copy_entry) {
 2074     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2075 
 2076     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2077     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2078 
 2079     __ align(CodeEntryAlignment);
 2080     StubCodeMark mark(this, stub_id);
 2081     address start = __ pc();
 2082     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2083 
 2084     // bump this on entry, not on exit:
 2085     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2086 
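          // OR s, d and count together so one value carries the least-aligned low
          // bits of all three, then test progressively smaller masks to choose the
          // widest element size that is safe for every operand.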
 2087     __ orr(rscratch1, s, d);
 2088     __ orr(rscratch1, rscratch1, count);
 2089 
 2090     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2091     __ cbz(rscratch1, L_long_aligned);
 2092     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2093     __ cbz(rscratch1, L_int_aligned);
 2094     __ tbz(rscratch1, 0, L_short_aligned);
 2095     __ b(RuntimeAddress(byte_copy_entry));
 2096 
 2097     __ BIND(L_short_aligned);
 2098     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2099     __ b(RuntimeAddress(short_copy_entry));
 2100     __ BIND(L_int_aligned);
 2101     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2102     __ b(RuntimeAddress(int_copy_entry));
 2103     __ BIND(L_long_aligned);
 2104     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2105     __ b(RuntimeAddress(long_copy_entry));
 2106 
 2107     return start;
 2108   }
 2109 
 2110   //
 2111   //  Generate generic array copy stubs
 2112   //
 2113   //  Input:
 2114   //    c_rarg0    -  src oop
 2115   //    c_rarg1    -  src_pos (32-bits)
 2116   //    c_rarg2    -  dst oop
 2117   //    c_rarg3    -  dst_pos (32-bits)
 2118   //    c_rarg4    -  element count (32-bits)
 2119   //
 2120   //  Output:
 2121   //    r0 ==  0  -  success
 2122   //    r0 == -1^K - failure, where K is partial transfer count
 2123   //
 2124   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2125                                 address int_copy_entry, address oop_copy_entry,
 2126                                 address long_copy_entry, address checkcast_copy_entry) {
 2127     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2128 
 2129     Label L_failed, L_objArray;
 2130     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2131 
 2132     // Input registers
 2133     const Register src        = c_rarg0;  // source array oop
 2134     const Register src_pos    = c_rarg1;  // source position
 2135     const Register dst        = c_rarg2;  // destination array oop
 2136     const Register dst_pos    = c_rarg3;  // destination position
 2137     const Register length     = c_rarg4;
 2138 
 2139 
 2140     // Registers used as temps
 2141     const Register dst_klass  = c_rarg5;
 2142 
 2143     __ align(CodeEntryAlignment);
 2144 
 2145     StubCodeMark mark(this, stub_id);
 2146 
 2147     address start = __ pc();
 2148 
 2149     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2150 
 2151     // bump this on entry, not on exit:
 2152     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2153 
 2154     //-----------------------------------------------------------------------
 2155     // Assembler stub will be used for this call to arraycopy
 2156     // if the following conditions are met:
 2157     //
 2158     // (1) src and dst must not be null.
 2159     // (2) src_pos must not be negative.
 2160     // (3) dst_pos must not be negative.
 2161     // (4) length  must not be negative.
 2162     // (5) src klass and dst klass should be the same and not null.
 2163     // (6) src and dst should be arrays.
 2164     // (7) src_pos + length must not exceed length of src.
 2165     // (8) dst_pos + length must not exceed length of dst.
 2166     //
 2167 
 2168     //  if (src == nullptr) return -1;
 2169     __ cbz(src, L_failed);
 2170 
 2171     //  if (src_pos < 0) return -1;
 2172     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2173 
 2174     //  if (dst == nullptr) return -1;
 2175     __ cbz(dst, L_failed);
 2176 
 2177     //  if (dst_pos < 0) return -1;
 2178     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2179 
 2180     // registers used as temps
 2181     const Register scratch_length    = r16; // elements count to copy
 2182     const Register scratch_src_klass = r17; // array klass
 2183     const Register lh                = r15; // layout helper
 2184 
 2185     //  if (length < 0) return -1;
 2186     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2187     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2188 
 2189     __ load_klass(scratch_src_klass, src);
 2190 #ifdef ASSERT
 2191     //  assert(src->klass() != nullptr);
 2192     {
 2193       BLOCK_COMMENT("assert klasses not null {");
 2194       Label L1, L2;
 2195       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2196       __ bind(L1);
 2197       __ stop("broken null klass");
 2198       __ bind(L2);
 2199       __ load_klass(rscratch1, dst);
 2200       __ cbz(rscratch1, L1);     // this would be broken also
 2201       BLOCK_COMMENT("} assert klasses not null done");
 2202     }
 2203 #endif
 2204 
 2205     // Load layout helper (32-bits)
 2206     //
 2207     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2208     // 32        30    24            16              8     2                 0
 2209     //
 2210     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2211     //
 2212 
 2213     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2214 
 2215     // Handle objArrays completely differently...
 2216     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2217     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2218     __ movw(rscratch1, objArray_lh);
 2219     __ eorw(rscratch2, lh, rscratch1);
 2220     __ cbzw(rscratch2, L_objArray);
 2221 
 2222     //  if (src->klass() != dst->klass()) return -1;
 2223     __ load_klass(rscratch2, dst);
 2224     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2225     __ cbnz(rscratch2, L_failed);
 2226 
 2227     //  if (!src->is_Array()) return -1;
 2228     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2229 
 2230     // At this point, it is known to be a typeArray (array_tag 0x3).
 2231 #ifdef ASSERT
 2232     {
 2233       BLOCK_COMMENT("assert primitive array {");
 2234       Label L;
 2235       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2236       __ cmpw(lh, rscratch2);
 2237       __ br(Assembler::GE, L);
 2238       __ stop("must be a primitive array");
 2239       __ bind(L);
 2240       BLOCK_COMMENT("} assert primitive array done");
 2241     }
 2242 #endif
 2243 
 2244     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2245                            rscratch2, L_failed);
 2246 
 2247     // TypeArrayKlass
 2248     //
 2249     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2250     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2251     //
 2252 
 2253     const Register rscratch1_offset = rscratch1;    // array offset
 2254     const Register r15_elsize = lh; // element size
 2255 
 2256     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2257            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2258     __ add(src, src, rscratch1_offset);           // src array offset
 2259     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2260     BLOCK_COMMENT("choose copy loop based on element size");
 2261 
 2262     // next registers should be set before the jump to corresponding stub
 2263     const Register from     = c_rarg0;  // source array address
 2264     const Register to       = c_rarg1;  // destination array address
 2265     const Register count    = c_rarg2;  // elements count
 2266 
 2267     // 'from', 'to', 'count' registers should be set in such order
 2268     // since they are the same as 'src', 'src_pos', 'dst'.
 2269 
 2270     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2271 
 2272     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2273     // size in bytes).  We do a simple bitwise binary search.
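          // elsize bit 1 distinguishes {byte,short} from {int,long}; bit 0 selects
          // the wider of each pair: 00 -> byte, 01 -> short, 10 -> int, 11 -> long.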
 2274   __ BIND(L_copy_bytes);
 2275     __ tbnz(r15_elsize, 1, L_copy_ints);
 2276     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2277     __ lea(from, Address(src, src_pos));// src_addr
 2278     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2279     __ movw(count, scratch_length); // length
 2280     __ b(RuntimeAddress(byte_copy_entry));
 2281 
 2282   __ BIND(L_copy_shorts);
 2283     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2284     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2285     __ movw(count, scratch_length); // length
 2286     __ b(RuntimeAddress(short_copy_entry));
 2287 
 2288   __ BIND(L_copy_ints);
 2289     __ tbnz(r15_elsize, 0, L_copy_longs);
 2290     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2291     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2292     __ movw(count, scratch_length); // length
 2293     __ b(RuntimeAddress(int_copy_entry));
 2294 
 2295   __ BIND(L_copy_longs);
 2296 #ifdef ASSERT
 2297     {
 2298       BLOCK_COMMENT("assert long copy {");
 2299       Label L;
 2300       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2301       __ cmpw(r15_elsize, LogBytesPerLong);
 2302       __ br(Assembler::EQ, L);
 2303       __ stop("must be long copy, but elsize is wrong");
 2304       __ bind(L);
 2305       BLOCK_COMMENT("} assert long copy done");
 2306     }
 2307 #endif
 2308     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2309     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2310     __ movw(count, scratch_length); // length
 2311     __ b(RuntimeAddress(long_copy_entry));
 2312 
 2313     // ObjArrayKlass
 2314   __ BIND(L_objArray);
 2315     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2316 
 2317     Label L_plain_copy, L_checkcast_copy;
 2318     //  test array classes for subtyping
 2319     __ load_klass(r15, dst);
 2320     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2321     __ br(Assembler::NE, L_checkcast_copy);
 2322 
 2323     // Identically typed arrays can be copied without element-wise checks.
 2324     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2325                            rscratch2, L_failed);
 2326 
 2327     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2328     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2329     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2330     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2331     __ movw(count, scratch_length); // length
 2332   __ BIND(L_plain_copy);
 2333     __ b(RuntimeAddress(oop_copy_entry));
 2334 
 2335   __ BIND(L_checkcast_copy);
 2336     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2337     {
 2338       // Before looking at dst.length, make sure dst is also an objArray.
 2339       __ ldrw(rscratch1, Address(r15, lh_offset));
 2340       __ movw(rscratch2, objArray_lh);
 2341       __ eorw(rscratch1, rscratch1, rscratch2);
 2342       __ cbnzw(rscratch1, L_failed);
 2343 
 2344       // It is safe to examine both src.length and dst.length.
 2345       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2346                              r15, L_failed);
 2347 
 2348       __ load_klass(dst_klass, dst); // reload
 2349 
 2350       // Marshal the base address arguments now, freeing registers.
 2351       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2352       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2353       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2354       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2355       __ movw(count, length);           // length (reloaded)
 2356       Register sco_temp = c_rarg3;      // this register is free now
 2357       assert_different_registers(from, to, count, sco_temp,
 2358                                  dst_klass, scratch_src_klass);
 2359       // assert_clean_int(count, sco_temp);
 2360 
 2361       // Generate the type check.
 2362       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2363       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2364 
 2365       // Smashes rscratch1, rscratch2
 2366       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2367                           L_plain_copy);
 2368 
 2369       // Fetch destination element klass from the ObjArrayKlass header.
 2370       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2371       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2372       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2373 
 2374       // the checkcast_copy loop needs two extra arguments:
 2375       assert(c_rarg3 == sco_temp, "#3 already in place");
 2376       // Set up arguments for checkcast_copy_entry.
 2377       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2378       __ b(RuntimeAddress(checkcast_copy_entry));
 2379     }
 2380 
 2381   __ BIND(L_failed);
 2382     __ mov(r0, -1);
 2383     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2384     __ ret(lr);
 2385 
 2386     return start;
 2387   }
 2388 
 2389   //
 2390   // Generate stub for array fill. If "aligned" is true, the
 2391   // "to" address is assumed to be heapword aligned.
 2392   //
 2393   // Arguments for generated stub:
 2394   //   to:    c_rarg0
 2395   //   value: c_rarg1
 2396   //   count: c_rarg2 treated as signed
 2397   //
 2398   address generate_fill(StubId stub_id) {
 2399     BasicType t;
 2400     bool aligned;
 2401 
 2402     switch (stub_id) {
 2403     case StubId::stubgen_jbyte_fill_id:
 2404       t = T_BYTE;
 2405       aligned = false;
 2406       break;
 2407     case StubId::stubgen_jshort_fill_id:
 2408       t = T_SHORT;
 2409       aligned = false;
 2410       break;
 2411     case StubId::stubgen_jint_fill_id:
 2412       t = T_INT;
 2413       aligned = false;
 2414       break;
 2415     case StubId::stubgen_arrayof_jbyte_fill_id:
 2416       t = T_BYTE;
 2417       aligned = true;
 2418       break;
 2419     case StubId::stubgen_arrayof_jshort_fill_id:
 2420       t = T_SHORT;
 2421       aligned = true;
 2422       break;
 2423     case StubId::stubgen_arrayof_jint_fill_id:
 2424       t = T_INT;
 2425       aligned = true;
 2426       break;
 2427     default:
 2428       ShouldNotReachHere();
 2429     };
 2430 
 2431     __ align(CodeEntryAlignment);
 2432     StubCodeMark mark(this, stub_id);
 2433     address start = __ pc();
 2434 
 2435     BLOCK_COMMENT("Entry:");
 2436 
 2437     const Register to        = c_rarg0;  // destination array address
 2438     const Register value     = c_rarg1;  // value
 2439     const Register count     = c_rarg2;  // elements count
 2440 
 2441     const Register bz_base = r10;        // base for block_zero routine
 2442     const Register cnt_words = r11;      // temp register
 2443 
 2444     __ enter();
 2445 
 2446     Label L_fill_elements, L_exit1;
 2447 
 2448     int shift = -1;
 2449     switch (t) {
 2450       case T_BYTE:
 2451         shift = 0;
 2452         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2453         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2454         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2455         __ br(Assembler::LO, L_fill_elements);
 2456         break;
 2457       case T_SHORT:
 2458         shift = 1;
 2459         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2460         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2461         __ br(Assembler::LO, L_fill_elements);
 2462         break;
 2463       case T_INT:
 2464         shift = 2;
 2465         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2466         __ br(Assembler::LO, L_fill_elements);
 2467         break;
 2468       default: ShouldNotReachHere();
 2469     }
 2470 
 2471     // Align source address at 8 bytes address boundary.
 2472     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2473     if (!aligned) {
 2474       switch (t) {
 2475         case T_BYTE:
 2476           // One byte misalignment happens only for byte arrays.
 2477           __ tbz(to, 0, L_skip_align1);
 2478           __ strb(value, Address(__ post(to, 1)));
 2479           __ subw(count, count, 1);
 2480           __ bind(L_skip_align1);
 2481           // Fallthrough
 2482         case T_SHORT:
 2483           // Two bytes misalignment happens only for byte and short (char) arrays.
 2484           __ tbz(to, 1, L_skip_align2);
 2485           __ strh(value, Address(__ post(to, 2)));
 2486           __ subw(count, count, 2 >> shift);
 2487           __ bind(L_skip_align2);
 2488           // Fallthrough
 2489         case T_INT:
 2490           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2491           __ tbz(to, 2, L_skip_align4);
 2492           __ strw(value, Address(__ post(to, 4)));
 2493           __ subw(count, count, 4 >> shift);
 2494           __ bind(L_skip_align4);
 2495           break;
 2496         default: ShouldNotReachHere();
 2497       }
 2498     }
 2499 
 2500     //
 2501     //  Fill large chunks
 2502     //
 2503     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2504     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2505     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
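          // count now holds only the trailing elements not covered by whole 8-byte words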
 2506     if (UseBlockZeroing) {
 2507       Label non_block_zeroing, rest;
 2508       // If the fill value is zero we can use the fast zero_words().
 2509       __ cbnz(value, non_block_zeroing);
 2510       __ mov(bz_base, to);
 2511       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2512       address tpc = __ zero_words(bz_base, cnt_words);
 2513       if (tpc == nullptr) {
 2514         fatal("CodeCache is full at generate_fill");
 2515       }
 2516       __ b(rest);
 2517       __ bind(non_block_zeroing);
 2518       __ fill_words(to, cnt_words, value);
 2519       __ bind(rest);
 2520     } else {
 2521       __ fill_words(to, cnt_words, value);
 2522     }
 2523 
 2524     // Remaining count is less than 8 bytes. Fill it by a single store.
 2525     // Note that the total length is no less than 8 bytes.
 2526     if (t == T_BYTE || t == T_SHORT) {
 2527       Label L_exit1;
 2528       __ cbzw(count, L_exit1);
 2529       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2530       __ str(value, Address(to, -8));    // overwrite some elements
 2531       __ bind(L_exit1);
 2532       __ leave();
 2533       __ ret(lr);
 2534     }
 2535 
 2536     // Handle fills of less than 8 bytes.
 2537     Label L_fill_2, L_fill_4, L_exit2;
 2538     __ bind(L_fill_elements);
 2539     switch (t) {
 2540       case T_BYTE:
 2541         __ tbz(count, 0, L_fill_2);
 2542         __ strb(value, Address(__ post(to, 1)));
 2543         __ bind(L_fill_2);
 2544         __ tbz(count, 1, L_fill_4);
 2545         __ strh(value, Address(__ post(to, 2)));
 2546         __ bind(L_fill_4);
 2547         __ tbz(count, 2, L_exit2);
 2548         __ strw(value, Address(to));
 2549         break;
 2550       case T_SHORT:
 2551         __ tbz(count, 0, L_fill_4);
 2552         __ strh(value, Address(__ post(to, 2)));
 2553         __ bind(L_fill_4);
 2554         __ tbz(count, 1, L_exit2);
 2555         __ strw(value, Address(to));
 2556         break;
 2557       case T_INT:
 2558         __ cbzw(count, L_exit2);
 2559         __ strw(value, Address(to));
 2560         break;
 2561       default: ShouldNotReachHere();
 2562     }
 2563     __ bind(L_exit2);
 2564     __ leave();
 2565     __ ret(lr);
 2566     return start;
 2567   }
 2568 
 2569   address generate_unsafecopy_common_error_exit() {
 2570     address start_pc = __ pc();
 2571     __ leave();
 2572     __ mov(r0, 0);
 2573     __ ret(lr);
 2574     return start_pc;
 2575   }
 2576 
 2577   //
 2578   //  Generate 'unsafe' set memory stub
 2579   //  Though just as safe as the other stubs, it takes an unscaled
 2580   //  size_t (# bytes) argument instead of an element count.
 2581   //
 2582   //  This fill operation is atomicity preserving: as long as the
 2583   //  address supplied is sufficiently aligned, all writes of up to 64
 2584   //  bits in size are single-copy atomic.
 2585   //
 2586   //  Input:
 2587   //    c_rarg0   - destination array address
 2588   //    c_rarg1   - byte count (size_t)
 2589   //    c_rarg2   - byte value
 2590   //
 2591   address generate_unsafe_setmemory() {
 2592     __ align(CodeEntryAlignment);
 2593     StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
 2594     address start = __ pc();
 2595 
 2596     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 2597     Label tail;
 2598 
 2599     UnsafeMemoryAccessMark umam(this, true, false);
 2600 
 2601     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2602 
 2603     __ dup(v0, __ T16B, value);
 2604 
 2605     if (AvoidUnalignedAccesses) {
 2606       __ cmp(count, (u1)16);
 2607       __ br(__ LO, tail);
 2608 
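            // Store one (possibly unaligned) 16-byte block at dest, then advance
            // dest to the next 16-byte boundary; re-writing the overlapping bytes
            // with the same value is harmless for a fill.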
 2609       __ mov(rscratch1, 16);
 2610       __ andr(rscratch2, dest, 15);
 2611       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 2612       __ strq(v0, Address(dest));
 2613       __ sub(count, count, rscratch1);
 2614       __ add(dest, dest, rscratch1);
 2615     }
 2616 
 2617     __ subs(count, count, (u1)64);
 2618     __ br(__ LO, tail);
 2619     {
 2620       Label again;
 2621       __ bind(again);
 2622       __ stpq(v0, v0, Address(dest));
 2623       __ stpq(v0, v0, Address(dest, 32));
 2624 
 2625       __ subs(count, count, 64);
 2626       __ add(dest, dest, 64);
 2627       __ br(__ HS, again);
 2628     }
 2629 
 2630     __ bind(tail);
 2631     // The count of bytes is off by 64, but we don't need to correct
 2632     // it because we're only going to use the least-significant few
 2633     // count bits from here on.
 2634     // __ add(count, count, 64);
 2635 
 2636     {
 2637       Label dont;
 2638       __ tbz(count, exact_log2(32), dont);
 2639       __ stpq(v0, v0, __ post(dest, 32));
 2640       __ bind(dont);
 2641     }
 2642     {
 2643       Label dont;
 2644       __ tbz(count, exact_log2(16), dont);
 2645       __ strq(v0, __ post(dest, 16));
 2646       __ bind(dont);
 2647     }
 2648     {
 2649       Label dont;
 2650       __ tbz(count, exact_log2(8), dont);
 2651       __ strd(v0, __ post(dest, 8));
 2652       __ bind(dont);
 2653     }
 2654 
 2655     Label finished;
 2656     __ tst(count, 7);
 2657     __ br(__ EQ, finished);
 2658 
 2659     {
 2660       Label dont;
 2661       __ tbz(count, exact_log2(4), dont);
 2662       __ strs(v0, __ post(dest, 4));
 2663       __ bind(dont);
 2664     }
 2665     {
 2666       Label dont;
 2667       __ tbz(count, exact_log2(2), dont);
 2668       __ bfi(value, value, 8, 8);
 2669       __ strh(value, __ post(dest, 2));
 2670       __ bind(dont);
 2671     }
 2672     {
 2673       Label dont;
 2674       __ tbz(count, exact_log2(1), dont);
 2675       __ strb(value, Address(dest));
 2676       __ bind(dont);
 2677     }
 2678 
 2679     __ bind(finished);
 2680     __ leave();
 2681     __ ret(lr);
 2682 
 2683     return start;
 2684   }
 2685 
 2686   address generate_data_cache_writeback() {
 2687     const Register line        = c_rarg0;  // address of line to write back
 2688 
 2689     __ align(CodeEntryAlignment);
 2690 
 2691     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 2692     StubCodeMark mark(this, stub_id);
 2693 
 2694     address start = __ pc();
 2695     __ enter();
 2696     __ cache_wb(Address(line, 0));
 2697     __ leave();
 2698     __ ret(lr);
 2699 
 2700     return start;
 2701   }
 2702 
 2703   address generate_data_cache_writeback_sync() {
 2704     const Register is_pre     = c_rarg0;  // pre or post sync
 2705 
 2706     __ align(CodeEntryAlignment);
 2707 
 2708     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 2709     StubCodeMark mark(this, stub_id);
 2710 
 2711     // pre wbsync is a no-op
 2712     // post wbsync translates to a store-ordering memory barrier
 2713 
 2714     Label skip;
 2715     address start = __ pc();
 2716     __ enter();
 2717     __ cbnz(is_pre, skip);
 2718     __ cache_wbsync(false);
 2719     __ bind(skip);
 2720     __ leave();
 2721     __ ret(lr);
 2722 
 2723     return start;
 2724   }
 2725 
 2726   void generate_arraycopy_stubs() {
 2727     // Some copy stubs publish a normal entry and then a 2nd 'fallback'
 2728     // entry immediately following their stack push. This can be used
 2729     // as a post-push branch target for compatible stubs when they
 2730     // identify a special case that can be handled by the fallback
 2731     // stub, e.g. a disjoint copy stub may be used as a special-case
 2732     // fallback for its compatible conjoint copy stub.
 2733     //
 2734     // A no-push entry is always returned in the following local and
 2735     // then published by assigning to the appropriate entry field in
 2736     // class StubRoutines. The entry value is then passed to the
 2737     // generator for the compatible stub. That means the entry must be
 2738     // listed when saving to/restoring from the AOT cache, ensuring
 2739     // that the inter-stub jumps are noted at AOT-cache save and
 2740     // relocated at AOT-cache load.
 2741     address nopush_entry;
 2742 
 2743     // generate the common exit first so later stubs can rely on it if
 2744     // they want an UnsafeMemoryAccess exit non-local to the stub
 2745     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 2746     // register the stub as the default exit with class UnsafeMemoryAccess
 2747     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 2748 
 2749     // generate and publish aarch64-specific bulk copy routines first
 2750     // so we can call them from other copy stubs
 2751     StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2752     StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2753 
 2754     StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2755     StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2756 
 2757     StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2758     StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2759 
 2760     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2761 
 2762     //*** jbyte
 2763     // Always need aligned and unaligned versions
 2764     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2765     // disjoint nopush entry is needed by conjoint copy
 2766     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2767     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
 2768     // conjoint nopush entry is needed by generic/unsafe copy
 2769     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
 2770     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2771     // disjoint arrayof nopush entry is needed by conjoint copy
 2772     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2773     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
 2774 
 2775     //*** jshort
 2776     // Always need aligned and unaligned versions
 2777     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
 2778     // disjoint nopush entry is needed by conjoint copy
 2779     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
 2780     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
 2781     // conjoint nopush entry is used by generic/unsafe copy
 2782     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
 2783     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
 2784     // disjoint arrayof nopush entry is needed by conjoint copy
 2785     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
 2786     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
 2787 
 2788     //*** jint
 2789     // Aligned versions
 2790     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
 2791     // disjoint arrayof nopush entry is needed by conjoint copy
 2792     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
 2793     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
 2794     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2795     // jint_arraycopy_nopush always points to the unaligned version
 2796     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
 2797     // disjoint nopush entry is needed by conjoint copy
 2798     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
 2799     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
 2800     // conjoint nopush entry is needed by generic/unsafe copy
 2801     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
 2802 
 2803     //*** jlong
 2804     // It is always aligned
 2805     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
 2806     // disjoint arrayof nopush entry is needed by conjoint copy
 2807     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
 2808     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
 2809     // conjoint nopush entry is needed by generic/unsafe copy
 2810     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
 2811     // disjoint normal/nopush and conjoint normal entries are not
 2812     // generated since the arrayof versions are the same
 2813     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2814     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
 2815     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2816 
 2817     //*** oops
 2818     {
 2819       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2820         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
 2821       // disjoint arrayof nopush entry is needed by conjoint copy
 2822       StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
 2823       StubRoutines::_arrayof_oop_arraycopy
 2824         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
 2825       // conjoint arrayof nopush entry is needed by generic/unsafe copy
 2826       StubRoutines::_oop_arraycopy_nopush = nopush_entry;
 2827       // Aligned versions without pre-barriers
 2828       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2829         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
 2830       // disjoint arrayof+uninit nopush entry is needed by conjoint copy
 2831       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
 2832       // note that we don't need a returned nopush entry because the
 2833       // generic/unsafe copy does not cater for uninit arrays.
 2834       StubRoutines::_arrayof_oop_arraycopy_uninit
 2835         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
 2836     }
 2837 
 2838     // for oop copies reuse arrayof entries for non-arrayof cases
 2839     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2840     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
 2841     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2842     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2843     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
 2844     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2845 
 2846     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
 2847     // checkcast nopush entry is needed by generic copy
 2848     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
 2849     // note that we don't need a returned nopush entry because the
 2850     // generic copy does not cater for uninit arrays.
 2851     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 2852 
    // unsafe arraycopy may fall back to the conjoint stubs
 2854     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2855                                                               StubRoutines::_jshort_arraycopy_nopush,
 2856                                                               StubRoutines::_jint_arraycopy_nopush,
 2857                                                               StubRoutines::_jlong_arraycopy_nopush);
 2858 
    // generic arraycopy may fall back to the conjoint stubs
 2860     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2861                                                                StubRoutines::_jshort_arraycopy_nopush,
 2862                                                                StubRoutines::_jint_arraycopy_nopush,
 2863                                                                StubRoutines::_oop_arraycopy_nopush,
 2864                                                                StubRoutines::_jlong_arraycopy_nopush,
 2865                                                                StubRoutines::_checkcast_arraycopy_nopush);
 2866 
 2867     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 2868     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 2869     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 2870     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 2871     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 2872     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 2873   }
 2874 
 2875   void generate_math_stubs() { Unimplemented(); }
 2876 
 2877   // Arguments:
 2878   //
 2879   // Inputs:
 2880   //   c_rarg0   - source byte array address
 2881   //   c_rarg1   - destination byte array address
 2882   //   c_rarg2   - sessionKe (key) in little endian int array
 2883   //
  address generate_aescrypt_encryptBlock() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
 2886     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 2887     StubCodeMark mark(this, stub_id);
 2888 
 2889     const Register from        = c_rarg0;  // source array address
 2890     const Register to          = c_rarg1;  // destination array address
 2891     const Register key         = c_rarg2;  // key array address
 2892     const Register keylen      = rscratch1;
 2893 
 2894     address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame
 2896 
 2897     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
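    // key points at the first int of the expanded key, so this negative
    // offset reaches back to the array length field, giving the key length
    // in ints: 44, 52 or 60 for AES-128/192/256.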
 2898 
 2899     __ aesenc_loadkeys(key, keylen);
 2900     __ aesecb_encrypt(from, to, keylen);
 2901 
 2902     __ mov(r0, 0);
 2903 
 2904     __ leave();
 2905     __ ret(lr);
 2906 
 2907     return start;
 2908   }
 2909 
 2910   // Arguments:
 2911   //
 2912   // Inputs:
 2913   //   c_rarg0   - source byte array address
 2914   //   c_rarg1   - destination byte array address
 2915   //   c_rarg2   - sessionKd (key) in little endian int array
 2916   //
 2917   address generate_aescrypt_decryptBlock() {
 2918     assert(UseAES, "need AES cryptographic extension support");
 2919     __ align(CodeEntryAlignment);
 2920     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 2921     StubCodeMark mark(this, stub_id);
 2922     Label L_doLast;
 2923 
 2924     const Register from        = c_rarg0;  // source array address
 2925     const Register to          = c_rarg1;  // destination array address
 2926     const Register key         = c_rarg2;  // key array address
 2927     const Register keylen      = rscratch1;
 2928 
 2929     address start = __ pc();
 2930     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2931 
 2932     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2933 
 2934     __ aesecb_decrypt(from, to, key, keylen);
 2935 
 2936     __ mov(r0, 0);
 2937 
 2938     __ leave();
 2939     __ ret(lr);
 2940 
 2941     return start;
 2942   }
 2943 
 2944   // Arguments:
 2945   //
 2946   // Inputs:
 2947   //   c_rarg0   - source byte array address
 2948   //   c_rarg1   - destination byte array address
 2949   //   c_rarg2   - sessionKe (key) in little endian int array
 2950   //   c_rarg3   - r vector byte array address
 2951   //   c_rarg4   - input length
 2952   //
 2953   // Output:
 2954   //   x0        - input length
 2955   //
 2956   address generate_cipherBlockChaining_encryptAESCrypt() {
 2957     assert(UseAES, "need AES cryptographic extension support");
 2958     __ align(CodeEntryAlignment);
 2959     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 2960     StubCodeMark mark(this, stub_id);
 2961 
 2962     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2963 
 2964     const Register from        = c_rarg0;  // source array address
 2965     const Register to          = c_rarg1;  // destination array address
 2966     const Register key         = c_rarg2;  // key array address
 2967     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 2968                                            // and left with the results of the last encryption block
 2969     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2970     const Register keylen      = rscratch1;
 2971 
 2972     address start = __ pc();
 2973 
 2974       __ enter();
 2975 
 2976       __ movw(rscratch2, len_reg);
 2977 
 2978       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2979 
 2980       __ ld1(v0, __ T16B, rvec);
 2981 
 2982       __ cmpw(keylen, 52);
 2983       __ br(Assembler::CC, L_loadkeys_44);
 2984       __ br(Assembler::EQ, L_loadkeys_52);
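      // keylen is the expanded key length in ints: 44, 52 or 60 for
      // AES-128/192/256. The branches above skip the round keys that only
      // the longer keys need (v17/v18 for 256-bit, v19/v20 for 192/256-bit);
      // v21..v31 are needed by every key size. The flags set by the cmpw are
      // reused inside L_aes_loop (nothing below clobbers NZCV) to select the
      // matching number of aese rounds on each iteration.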
 2985 
 2986       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2987       __ rev32(v17, __ T16B, v17);
 2988       __ rev32(v18, __ T16B, v18);
 2989     __ BIND(L_loadkeys_52);
 2990       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2991       __ rev32(v19, __ T16B, v19);
 2992       __ rev32(v20, __ T16B, v20);
 2993     __ BIND(L_loadkeys_44);
 2994       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2995       __ rev32(v21, __ T16B, v21);
 2996       __ rev32(v22, __ T16B, v22);
 2997       __ rev32(v23, __ T16B, v23);
 2998       __ rev32(v24, __ T16B, v24);
 2999       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3000       __ rev32(v25, __ T16B, v25);
 3001       __ rev32(v26, __ T16B, v26);
 3002       __ rev32(v27, __ T16B, v27);
 3003       __ rev32(v28, __ T16B, v28);
 3004       __ ld1(v29, v30, v31, __ T16B, key);
 3005       __ rev32(v29, __ T16B, v29);
 3006       __ rev32(v30, __ T16B, v30);
 3007       __ rev32(v31, __ T16B, v31);
 3008 
 3009     __ BIND(L_aes_loop);
 3010       __ ld1(v1, __ T16B, __ post(from, 16));
 3011       __ eor(v0, __ T16B, v0, v1);
 3012 
 3013       __ br(Assembler::CC, L_rounds_44);
 3014       __ br(Assembler::EQ, L_rounds_52);
 3015 
 3016       __ aese(v0, v17); __ aesmc(v0, v0);
 3017       __ aese(v0, v18); __ aesmc(v0, v0);
 3018     __ BIND(L_rounds_52);
 3019       __ aese(v0, v19); __ aesmc(v0, v0);
 3020       __ aese(v0, v20); __ aesmc(v0, v0);
 3021     __ BIND(L_rounds_44);
 3022       __ aese(v0, v21); __ aesmc(v0, v0);
 3023       __ aese(v0, v22); __ aesmc(v0, v0);
 3024       __ aese(v0, v23); __ aesmc(v0, v0);
 3025       __ aese(v0, v24); __ aesmc(v0, v0);
 3026       __ aese(v0, v25); __ aesmc(v0, v0);
 3027       __ aese(v0, v26); __ aesmc(v0, v0);
 3028       __ aese(v0, v27); __ aesmc(v0, v0);
 3029       __ aese(v0, v28); __ aesmc(v0, v0);
 3030       __ aese(v0, v29); __ aesmc(v0, v0);
 3031       __ aese(v0, v30);
 3032       __ eor(v0, __ T16B, v0, v31);
 3033 
 3034       __ st1(v0, __ T16B, __ post(to, 16));
 3035 
 3036       __ subw(len_reg, len_reg, 16);
 3037       __ cbnzw(len_reg, L_aes_loop);
 3038 
 3039       __ st1(v0, __ T16B, rvec);
 3040 
 3041       __ mov(r0, rscratch2);
 3042 
 3043       __ leave();
 3044       __ ret(lr);
 3045 
    return start;
 3047   }
 3048 
 3049   // Arguments:
 3050   //
 3051   // Inputs:
 3052   //   c_rarg0   - source byte array address
 3053   //   c_rarg1   - destination byte array address
 3054   //   c_rarg2   - sessionKd (key) in little endian int array
 3055   //   c_rarg3   - r vector byte array address
 3056   //   c_rarg4   - input length
 3057   //
 3058   // Output:
 3059   //   r0        - input length
 3060   //
 3061   address generate_cipherBlockChaining_decryptAESCrypt() {
 3062     assert(UseAES, "need AES cryptographic extension support");
 3063     __ align(CodeEntryAlignment);
 3064     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3065     StubCodeMark mark(this, stub_id);
 3066 
 3067     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3068 
 3069     const Register from        = c_rarg0;  // source array address
 3070     const Register to          = c_rarg1;  // destination array address
 3071     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the last ciphertext block (the new chaining value)
 3074     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3075     const Register keylen      = rscratch1;
 3076 
 3077     address start = __ pc();
 3078 
 3079       __ enter();
 3080 
 3081       __ movw(rscratch2, len_reg);
 3082 
 3083       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3084 
 3085       __ ld1(v2, __ T16B, rvec);
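      // v2 carries the CBC chaining value: the IV from rvec on entry, then
      // the previous ciphertext block. Each iteration computes
      // P_i = InvCipher(C_i) ^ C_(i-1); v1 keeps the current ciphertext so
      // it can become the next chaining value.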
 3086 
 3087       __ ld1(v31, __ T16B, __ post(key, 16));
 3088       __ rev32(v31, __ T16B, v31);
 3089 
 3090       __ cmpw(keylen, 52);
 3091       __ br(Assembler::CC, L_loadkeys_44);
 3092       __ br(Assembler::EQ, L_loadkeys_52);
 3093 
 3094       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3095       __ rev32(v17, __ T16B, v17);
 3096       __ rev32(v18, __ T16B, v18);
 3097     __ BIND(L_loadkeys_52);
 3098       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3099       __ rev32(v19, __ T16B, v19);
 3100       __ rev32(v20, __ T16B, v20);
 3101     __ BIND(L_loadkeys_44);
 3102       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3103       __ rev32(v21, __ T16B, v21);
 3104       __ rev32(v22, __ T16B, v22);
 3105       __ rev32(v23, __ T16B, v23);
 3106       __ rev32(v24, __ T16B, v24);
 3107       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3108       __ rev32(v25, __ T16B, v25);
 3109       __ rev32(v26, __ T16B, v26);
 3110       __ rev32(v27, __ T16B, v27);
 3111       __ rev32(v28, __ T16B, v28);
 3112       __ ld1(v29, v30, __ T16B, key);
 3113       __ rev32(v29, __ T16B, v29);
 3114       __ rev32(v30, __ T16B, v30);
 3115 
 3116     __ BIND(L_aes_loop);
 3117       __ ld1(v0, __ T16B, __ post(from, 16));
 3118       __ orr(v1, __ T16B, v0, v0);
 3119 
 3120       __ br(Assembler::CC, L_rounds_44);
 3121       __ br(Assembler::EQ, L_rounds_52);
 3122 
 3123       __ aesd(v0, v17); __ aesimc(v0, v0);
 3124       __ aesd(v0, v18); __ aesimc(v0, v0);
 3125     __ BIND(L_rounds_52);
 3126       __ aesd(v0, v19); __ aesimc(v0, v0);
 3127       __ aesd(v0, v20); __ aesimc(v0, v0);
 3128     __ BIND(L_rounds_44);
 3129       __ aesd(v0, v21); __ aesimc(v0, v0);
 3130       __ aesd(v0, v22); __ aesimc(v0, v0);
 3131       __ aesd(v0, v23); __ aesimc(v0, v0);
 3132       __ aesd(v0, v24); __ aesimc(v0, v0);
 3133       __ aesd(v0, v25); __ aesimc(v0, v0);
 3134       __ aesd(v0, v26); __ aesimc(v0, v0);
 3135       __ aesd(v0, v27); __ aesimc(v0, v0);
 3136       __ aesd(v0, v28); __ aesimc(v0, v0);
 3137       __ aesd(v0, v29); __ aesimc(v0, v0);
 3138       __ aesd(v0, v30);
 3139       __ eor(v0, __ T16B, v0, v31);
 3140       __ eor(v0, __ T16B, v0, v2);
 3141 
 3142       __ st1(v0, __ T16B, __ post(to, 16));
 3143       __ orr(v2, __ T16B, v1, v1);
 3144 
 3145       __ subw(len_reg, len_reg, 16);
 3146       __ cbnzw(len_reg, L_aes_loop);
 3147 
 3148       __ st1(v2, __ T16B, rvec);
 3149 
 3150       __ mov(r0, rscratch2);
 3151 
 3152       __ leave();
 3153       __ ret(lr);
 3154 
 3155     return start;
 3156   }
 3157 
 3158   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3159   // Inputs: 128-bits. in is preserved.
 3160   // The least-significant 64-bit word is in the upper dword of each vector.
 3161   // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
 3162   // Output: result
 3163   void be_add_128_64(FloatRegister result, FloatRegister in,
 3164                      FloatRegister inc, FloatRegister tmp) {
 3165     assert_different_registers(result, tmp, inc);
 3166 
 3167     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3168                                            // input
    __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
 3170     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3171                                            // MSD == 0 (must be!) to LSD
 3172     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3173   }
 3174 
 3175   // CTR AES crypt.
 3176   // Arguments:
 3177   //
 3178   // Inputs:
 3179   //   c_rarg0   - source byte array address
 3180   //   c_rarg1   - destination byte array address
 3181   //   c_rarg2   - sessionKe (key) in little endian int array
 3182   //   c_rarg3   - counter vector byte array address
 3183   //   c_rarg4   - input length
 3184   //   c_rarg5   - saved encryptedCounter start
 3185   //   c_rarg6   - saved used length
 3186   //
 3187   // Output:
 3188   //   r0       - input length
 3189   //
 3190   address generate_counterMode_AESCrypt() {
 3191     const Register in = c_rarg0;
 3192     const Register out = c_rarg1;
 3193     const Register key = c_rarg2;
 3194     const Register counter = c_rarg3;
 3195     const Register saved_len = c_rarg4, len = r10;
 3196     const Register saved_encrypted_ctr = c_rarg5;
 3197     const Register used_ptr = c_rarg6, used = r12;
 3198 
 3199     const Register offset = r7;
 3200     const Register keylen = r11;
 3201 
 3202     const unsigned char block_size = 16;
 3203     const int bulk_width = 4;
 3204     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3205     // performance with larger data sizes, but it also means that the
 3206     // fast path isn't used until you have at least 8 blocks, and up
 3207     // to 127 bytes of data will be executed on the slow path. For
 3208     // that reason, and also so as not to blow away too much icache, 4
 3209     // blocks seems like a sensible compromise.
 3210 
 3211     // Algorithm:
 3212     //
 3213     //    if (len == 0) {
 3214     //        goto DONE;
 3215     //    }
 3216     //    int result = len;
 3217     //    do {
 3218     //        if (used >= blockSize) {
 3219     //            if (len >= bulk_width * blockSize) {
 3220     //                CTR_large_block();
 3221     //                if (len == 0)
 3222     //                    goto DONE;
 3223     //            }
 3224     //            for (;;) {
 3225     //                16ByteVector v0 = counter;
 3226     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3227     //                used = 0;
 3228     //                if (len < blockSize)
 3229     //                    break;    /* goto NEXT */
 3230     //                16ByteVector v1 = load16Bytes(in, offset);
 3231     //                v1 = v1 ^ encryptedCounter;
 3232     //                store16Bytes(out, offset);
 3233     //                used = blockSize;
 3234     //                offset += blockSize;
 3235     //                len -= blockSize;
 3236     //                if (len == 0)
 3237     //                    goto DONE;
 3238     //            }
 3239     //        }
 3240     //      NEXT:
 3241     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3242     //        len--;
 3243     //    } while (len != 0);
 3244     //  DONE:
 3245     //    return result;
 3246     //
 3247     // CTR_large_block()
 3248     //    Wide bulk encryption of whole blocks.
 3249 
 3250     __ align(CodeEntryAlignment);
 3251     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3252     StubCodeMark mark(this, stub_id);
 3253     const address start = __ pc();
 3254     __ enter();
 3255 
 3256     Label DONE, CTR_large_block, large_block_return;
 3257     __ ldrw(used, Address(used_ptr));
 3258     __ cbzw(saved_len, DONE);
 3259 
 3260     __ mov(len, saved_len);
 3261     __ mov(offset, 0);
 3262 
 3263     // Compute #rounds for AES based on the length of the key array
 3264     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3265 
 3266     __ aesenc_loadkeys(key, keylen);
 3267 
 3268     {
 3269       Label L_CTR_loop, NEXT;
 3270 
 3271       __ bind(L_CTR_loop);
 3272 
 3273       __ cmp(used, block_size);
 3274       __ br(__ LO, NEXT);
 3275 
 3276       // Maybe we have a lot of data
 3277       __ subsw(rscratch1, len, bulk_width * block_size);
 3278       __ br(__ HS, CTR_large_block);
 3279       __ BIND(large_block_return);
 3280       __ cbzw(len, DONE);
 3281 
 3282       // Setup the counter
 3283       __ movi(v4, __ T4S, 0);
 3284       __ movi(v5, __ T4S, 1);
 3285       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3286 
 3287       // 128-bit big-endian increment
 3288       __ ld1(v0, __ T16B, counter);
 3289       __ rev64(v16, __ T16B, v0);
 3290       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3291       __ rev64(v16, __ T16B, v16);
 3292       __ st1(v16, __ T16B, counter);
 3293       // Previous counter value is in v0
 3294       // v4 contains { 0, 1 }
 3295 
 3296       {
 3297         // We have fewer than bulk_width blocks of data left. Encrypt
 3298         // them one by one until there is less than a full block
 3299         // remaining, being careful to save both the encrypted counter
 3300         // and the counter.
 3301 
 3302         Label inner_loop;
 3303         __ bind(inner_loop);
 3304         // Counter to encrypt is in v0
 3305         __ aesecb_encrypt(noreg, noreg, keylen);
 3306         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3307 
 3308         // Do we have a remaining full block?
 3309 
 3310         __ mov(used, 0);
 3311         __ cmp(len, block_size);
 3312         __ br(__ LO, NEXT);
 3313 
 3314         // Yes, we have a full block
 3315         __ ldrq(v1, Address(in, offset));
 3316         __ eor(v1, __ T16B, v1, v0);
 3317         __ strq(v1, Address(out, offset));
 3318         __ mov(used, block_size);
 3319         __ add(offset, offset, block_size);
 3320 
 3321         __ subw(len, len, block_size);
 3322         __ cbzw(len, DONE);
 3323 
 3324         // Increment the counter, store it back
 3325         __ orr(v0, __ T16B, v16, v16);
 3326         __ rev64(v16, __ T16B, v16);
 3327         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3328         __ rev64(v16, __ T16B, v16);
 3329         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3330 
 3331         __ b(inner_loop);
 3332       }
 3333 
 3334       __ BIND(NEXT);
 3335 
 3336       // Encrypt a single byte, and loop.
 3337       // We expect this to be a rare event.
 3338       __ ldrb(rscratch1, Address(in, offset));
 3339       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3340       __ eor(rscratch1, rscratch1, rscratch2);
 3341       __ strb(rscratch1, Address(out, offset));
 3342       __ add(offset, offset, 1);
 3343       __ add(used, used, 1);
      __ subw(len, len, 1);
 3345       __ cbnzw(len, L_CTR_loop);
 3346     }
 3347 
 3348     __ bind(DONE);
 3349     __ strw(used, Address(used_ptr));
 3350     __ mov(r0, saved_len);
 3351 
 3352     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3353     __ ret(lr);
 3354 
 3355     // Bulk encryption
 3356 
 3357     __ BIND (CTR_large_block);
 3358     assert(bulk_width == 4 || bulk_width == 8, "must be");
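    // The bulk path processes bulk_width counter blocks per iteration.
    // v8..v15 are callee-saved SIMD registers, so they are spilled around
    // the loop; the argument registers the loop advances (in, out, offset,
    // len, ...) are pushed as well so the tail code sees their original
    // values again after the pop.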
 3359 
 3360     if (bulk_width == 8) {
 3361       __ sub(sp, sp, 4 * 16);
 3362       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3363     }
 3364     __ sub(sp, sp, 4 * 16);
 3365     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3366     RegSet saved_regs = (RegSet::of(in, out, offset)
 3367                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3368     __ push(saved_regs, sp);
 3369     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3370     __ add(in, in, offset);
 3371     __ add(out, out, offset);
 3372 
 3373     // Keys should already be loaded into the correct registers
 3374 
 3375     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3376     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3377 
 3378     // AES/CTR loop
 3379     {
 3380       Label L_CTR_loop;
 3381       __ BIND(L_CTR_loop);
 3382 
 3383       // Setup the counters
 3384       __ movi(v8, __ T4S, 0);
 3385       __ movi(v9, __ T4S, 1);
 3386       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3387 
 3388       for (int i = 0; i < bulk_width; i++) {
 3389         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3390         __ rev64(v0_ofs, __ T16B, v16);
 3391         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3392       }
 3393 
 3394       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3395 
 3396       // Encrypt the counters
 3397       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3398 
 3399       if (bulk_width == 8) {
 3400         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3401       }
 3402 
 3403       // XOR the encrypted counters with the inputs
 3404       for (int i = 0; i < bulk_width; i++) {
 3405         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3406         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3407         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3408       }
 3409 
 3410       // Write the encrypted data
 3411       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3412       if (bulk_width == 8) {
 3413         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3414       }
 3415 
 3416       __ subw(len, len, 16 * bulk_width);
 3417       __ cbnzw(len, L_CTR_loop);
 3418     }
 3419 
 3420     // Save the counter back where it goes
 3421     __ rev64(v16, __ T16B, v16);
 3422     __ st1(v16, __ T16B, counter);
 3423 
 3424     __ pop(saved_regs, sp);
 3425 
 3426     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3427     if (bulk_width == 8) {
 3428       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3429     }
 3430 
 3431     __ andr(rscratch1, len, -16 * bulk_width);
 3432     __ sub(len, len, rscratch1);
 3433     __ add(offset, offset, rscratch1);
 3434     __ mov(used, 16);
 3435     __ strw(used, Address(used_ptr));
 3436     __ b(large_block_return);
 3437 
 3438     return start;
 3439   }
 3440 
 3441   // Vector AES Galois Counter Mode implementation. Parameters:
 3442   //
 3443   // in = c_rarg0
 3444   // len = c_rarg1
 3445   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3446   // out = c_rarg3
 3447   // key = c_rarg4
 3448   // state = c_rarg5 - GHASH.state
 3449   // subkeyHtbl = c_rarg6 - powers of H
 3450   // counter = c_rarg7 - 16 bytes of CTR
 3451   // return - number of processed bytes
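  // Only whole groups of 8 AES blocks (128 bytes) are handled here: len is
  // masked down to a multiple of 16 * 8 and that value is returned as the
  // number of processed bytes, leaving any tail to the caller. The stub
  // first runs the AES/CTR pass over the input, then runs GHASH over the
  // ciphertext (ct: the input for decryption, the output for encryption)
  // with ghash_processBlocks_wide.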
 3452   address generate_galoisCounterMode_AESCrypt() {
 3453     Label ghash_polynomial; // local data generated after code
 3454 
    __ align(CodeEntryAlignment);
 3456     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3457     StubCodeMark mark(this, stub_id);
 3458     address start = __ pc();
 3459     __ enter();
 3460 
    const Register in = c_rarg0;
    const Register len = c_rarg1;
    const Register ct = c_rarg2;
    const Register out = c_rarg3;

    const Register key = c_rarg4;
    const Register state = c_rarg5;

    const Register subkeyHtbl = c_rarg6;

    const Register counter = c_rarg7; // read on entry and updated with the
                                      // incremented counter at the end
 3473 
 3474     const Register keylen = r10;
 3475     // Save state before entering routine
 3476     __ sub(sp, sp, 4 * 16);
 3477     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3478     __ sub(sp, sp, 4 * 16);
 3479     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3480 
 3481     // __ andr(len, len, -512);
 3482     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3483     __ str(len, __ pre(sp, -2 * wordSize));
 3484 
 3485     Label DONE;
 3486     __ cbz(len, DONE);
 3487 
 3488     // Compute #rounds for AES based on the length of the key array
 3489     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3490 
 3491     __ aesenc_loadkeys(key, keylen);
 3492     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3493     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3494 
 3495     // AES/CTR loop
 3496     {
 3497       Label L_CTR_loop;
 3498       __ BIND(L_CTR_loop);
 3499 
 3500       // Setup the counters
 3501       __ movi(v8, __ T4S, 0);
 3502       __ movi(v9, __ T4S, 1);
 3503       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3504 
 3505       assert(v0->encoding() < v8->encoding(), "");
 3506       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3507         FloatRegister f = as_FloatRegister(i);
 3508         __ rev32(f, __ T16B, v16);
 3509         __ addv(v16, __ T4S, v16, v8);
 3510       }
 3511 
 3512       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3513 
 3514       // Encrypt the counters
 3515       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3516 
 3517       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3518 
 3519       // XOR the encrypted counters with the inputs
 3520       for (int i = 0; i < 8; i++) {
 3521         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3522         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3523         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3524       }
 3525       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3526       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3527 
 3528       __ subw(len, len, 16 * 8);
 3529       __ cbnzw(len, L_CTR_loop);
 3530     }
 3531 
 3532     __ rev32(v16, __ T16B, v16);
 3533     __ st1(v16, __ T16B, counter);
 3534 
 3535     __ ldr(len, Address(sp));
 3536     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3537 
 3538     // GHASH/CTR loop
 3539     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3540                                 len, /*unrolls*/4);
 3541 
 3542 #ifdef ASSERT
    {
      Label L;
      __ cmp(len, (unsigned char)0);
      __ br(Assembler::EQ, L);
      __ stop("stubGenerator: abort");
      __ bind(L);
    }
#endif

    __ bind(DONE);
 3552     // Return the number of bytes processed
 3553     __ ldr(r0, __ post(sp, 2 * wordSize));
 3554 
 3555     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3556     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3557 
 3558     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3559     __ ret(lr);
 3560 
 3561     // bind label and generate polynomial data
 3562     __ align(wordSize * 2);
 3563     __ bind(ghash_polynomial);
 3564     __ emit_int64(0x87);  // The low-order bits of the field
 3565                           // polynomial (i.e. p = z^7+z^2+z+1)
 3566                           // repeated in the low and high parts of a
 3567                           // 128-bit vector
 3568     __ emit_int64(0x87);
 3569 
 3570     return start;
 3571   }
 3572 
 3573   class Cached64Bytes {
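    // Caches one 64-byte input block in eight general-purpose registers so
    // the MD5 round helpers can pull out 32-bit message words without extra
    // loads: gen_loads() fills the cache with four ldp pairs and
    // extract_u32(dest, i) recovers the i-th little-endian word via ubfx.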
 3574   private:
 3575     MacroAssembler *_masm;
 3576     Register _regs[8];
 3577 
 3578   public:
 3579     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 3580       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size());
 3581       auto it = rs.begin();
 3582       for (auto &r: _regs) {
 3583         r = *it;
 3584         ++it;
 3585       }
 3586     }
 3587 
 3588     void gen_loads(Register base) {
 3589       for (int i = 0; i < 8; i += 2) {
 3590         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3591       }
 3592     }
 3593 
 3594     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3595     void extract_u32(Register dest, int i) {
 3596       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3597     }
 3598   };
 3599 
 3600   // Utility routines for md5.
 3601   // Clobbers r10 and r11.
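  // Each helper performs one MD5 step,
  //   r1 = r2 + rotate_left(r1 + Func(r2, r3, r4) + x[k] + t, s)
  // where Func is the round function from RFC 1321:
  //   FF: F(x,y,z) = (x & y) | (~x & z), computed as ((y ^ z) & x) ^ z
  //   GG: G(x,y,z) = (x & z) | (y & ~z), computed as (x & z) + (y & ~z)
  //       (the two terms never share a set bit, so add equals or)
  //   HH: H(x,y,z) = x ^ y ^ z
  //   II: I(x,y,z) = y ^ (x | ~z)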
 3602   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3603               int k, int s, int t) {
 3604     Register rscratch3 = r10;
 3605     Register rscratch4 = r11;
 3606 
 3607     __ eorw(rscratch3, r3, r4);
 3608     __ movw(rscratch2, t);
 3609     __ andw(rscratch3, rscratch3, r2);
 3610     __ addw(rscratch4, r1, rscratch2);
 3611     reg_cache.extract_u32(rscratch1, k);
 3612     __ eorw(rscratch3, rscratch3, r4);
 3613     __ addw(rscratch4, rscratch4, rscratch1);
 3614     __ addw(rscratch3, rscratch3, rscratch4);
 3615     __ rorw(rscratch2, rscratch3, 32 - s);
 3616     __ addw(r1, rscratch2, r2);
 3617   }
 3618 
 3619   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3620               int k, int s, int t) {
 3621     Register rscratch3 = r10;
 3622     Register rscratch4 = r11;
 3623 
 3624     reg_cache.extract_u32(rscratch1, k);
 3625     __ movw(rscratch2, t);
 3626     __ addw(rscratch4, r1, rscratch2);
 3627     __ addw(rscratch4, rscratch4, rscratch1);
 3628     __ bicw(rscratch2, r3, r4);
 3629     __ andw(rscratch3, r2, r4);
 3630     __ addw(rscratch2, rscratch2, rscratch4);
 3631     __ addw(rscratch2, rscratch2, rscratch3);
 3632     __ rorw(rscratch2, rscratch2, 32 - s);
 3633     __ addw(r1, rscratch2, r2);
 3634   }
 3635 
 3636   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3637               int k, int s, int t) {
 3638     Register rscratch3 = r10;
 3639     Register rscratch4 = r11;
 3640 
 3641     __ eorw(rscratch3, r3, r4);
 3642     __ movw(rscratch2, t);
 3643     __ addw(rscratch4, r1, rscratch2);
 3644     reg_cache.extract_u32(rscratch1, k);
 3645     __ eorw(rscratch3, rscratch3, r2);
 3646     __ addw(rscratch4, rscratch4, rscratch1);
 3647     __ addw(rscratch3, rscratch3, rscratch4);
 3648     __ rorw(rscratch2, rscratch3, 32 - s);
 3649     __ addw(r1, rscratch2, r2);
 3650   }
 3651 
 3652   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3653               int k, int s, int t) {
 3654     Register rscratch3 = r10;
 3655     Register rscratch4 = r11;
 3656 
 3657     __ movw(rscratch3, t);
 3658     __ ornw(rscratch2, r2, r4);
 3659     __ addw(rscratch4, r1, rscratch3);
 3660     reg_cache.extract_u32(rscratch1, k);
 3661     __ eorw(rscratch3, rscratch2, r3);
 3662     __ addw(rscratch4, rscratch4, rscratch1);
 3663     __ addw(rscratch3, rscratch3, rscratch4);
 3664     __ rorw(rscratch2, rscratch3, 32 - s);
 3665     __ addw(r1, rscratch2, r2);
 3666   }
 3667 
 3668   // Arguments:
 3669   //
 3670   // Inputs:
 3671   //   c_rarg0   - byte[]  source+offset
 3672   //   c_rarg1   - int[]   SHA.state
 3673   //   c_rarg2   - int     offset
 3674   //   c_rarg3   - int     limit
 3675   //
 3676   address generate_md5_implCompress(StubId stub_id) {
 3677     bool multi_block;
 3678     switch (stub_id) {
 3679     case StubId::stubgen_md5_implCompress_id:
 3680       multi_block = false;
 3681       break;
 3682     case StubId::stubgen_md5_implCompressMB_id:
 3683       multi_block = true;
 3684       break;
 3685     default:
 3686       ShouldNotReachHere();
 3687     }
 3688     __ align(CodeEntryAlignment);
 3689 
 3690     StubCodeMark mark(this, stub_id);
 3691     address start = __ pc();
 3692 
 3693     Register buf       = c_rarg0;
 3694     Register state     = c_rarg1;
 3695     Register ofs       = c_rarg2;
 3696     Register limit     = c_rarg3;
 3697     Register a         = r4;
 3698     Register b         = r5;
 3699     Register c         = r6;
 3700     Register d         = r7;
 3701     Register rscratch3 = r10;
 3702     Register rscratch4 = r11;
 3703 
 3704     Register state_regs[2] = { r12, r13 };
 3705     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3706     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3707 
 3708     __ push(saved_regs, sp);
 3709 
 3710     __ ldp(state_regs[0], state_regs[1], Address(state));
 3711     __ ubfx(a, state_regs[0],  0, 32);
 3712     __ ubfx(b, state_regs[0], 32, 32);
 3713     __ ubfx(c, state_regs[1],  0, 32);
 3714     __ ubfx(d, state_regs[1], 32, 32);
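    // a and b now hold the low/high words of state_regs[0], c and d those
    // of state_regs[1]; after the rounds they are re-packed the same way
    // before being stored back.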
 3715 
 3716     Label md5_loop;
 3717     __ BIND(md5_loop);
 3718 
 3719     reg_cache.gen_loads(buf);
 3720 
 3721     // Round 1
 3722     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3723     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3724     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3725     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3726     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3727     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3728     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3729     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3730     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3731     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3732     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3733     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3734     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3735     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3736     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3737     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3738 
 3739     // Round 2
 3740     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3741     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3742     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3743     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3744     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3745     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3746     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3747     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3748     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3749     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3750     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3751     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3752     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3753     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3754     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3755     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3756 
 3757     // Round 3
 3758     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3759     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3760     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3761     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3762     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3763     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3764     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3765     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3766     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3767     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3768     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3769     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3770     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3771     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3772     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3773     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3774 
 3775     // Round 4
 3776     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3777     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3778     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3779     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3780     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3781     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3782     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3783     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3784     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3785     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3786     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3787     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3788     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3789     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3790     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3791     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3792 
 3793     __ addw(a, state_regs[0], a);
 3794     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3795     __ addw(b, rscratch2, b);
 3796     __ addw(c, state_regs[1], c);
 3797     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3798     __ addw(d, rscratch4, d);
 3799 
 3800     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3801     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3802 
 3803     if (multi_block) {
 3804       __ add(buf, buf, 64);
 3805       __ add(ofs, ofs, 64);
 3806       __ cmp(ofs, limit);
 3807       __ br(Assembler::LE, md5_loop);
 3808       __ mov(c_rarg0, ofs); // return ofs
 3809     }
 3810 
 3811     // write hash values back in the correct order
 3812     __ stp(state_regs[0], state_regs[1], Address(state));
 3813 
 3814     __ pop(saved_regs, sp);
 3815 
 3816     __ ret(lr);
 3817 
 3818     return start;
 3819   }
 3820 
 3821   // Arguments:
 3822   //
 3823   // Inputs:
 3824   //   c_rarg0   - byte[]  source+offset
 3825   //   c_rarg1   - int[]   SHA.state
 3826   //   c_rarg2   - int     offset
 3827   //   c_rarg3   - int     limit
 3828   //
 3829   address generate_sha1_implCompress(StubId stub_id) {
 3830     bool multi_block;
 3831     switch (stub_id) {
 3832     case StubId::stubgen_sha1_implCompress_id:
 3833       multi_block = false;
 3834       break;
 3835     case StubId::stubgen_sha1_implCompressMB_id:
 3836       multi_block = true;
 3837       break;
 3838     default:
 3839       ShouldNotReachHere();
 3840     }
 3841 
 3842     __ align(CodeEntryAlignment);
 3843 
 3844     StubCodeMark mark(this, stub_id);
 3845     address start = __ pc();
 3846 
 3847     Register buf   = c_rarg0;
 3848     Register state = c_rarg1;
 3849     Register ofs   = c_rarg2;
 3850     Register limit = c_rarg3;
 3851 
 3852     Label keys;
 3853     Label sha1_loop;
 3854 
 3855     // load the keys into v0..v3
 3856     __ adr(rscratch1, keys);
 3857     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3858     // load 5 words state into v6, v7
 3859     __ ldrq(v6, Address(state, 0));
 3860     __ ldrs(v7, Address(state, 16));
 3861 
 3862 
 3863     __ BIND(sha1_loop);
 3864     // load 64 bytes of data into v16..v19
 3865     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3866     __ rev32(v16, __ T16B, v16);
 3867     __ rev32(v17, __ T16B, v17);
 3868     __ rev32(v18, __ T16B, v18);
 3869     __ rev32(v19, __ T16B, v19);
 3870 
 3871     // do the sha1
 3872     __ addv(v4, __ T4S, v16, v0);
 3873     __ orr(v20, __ T16B, v6, v6);
 3874 
 3875     FloatRegister d0 = v16;
 3876     FloatRegister d1 = v17;
 3877     FloatRegister d2 = v18;
 3878     FloatRegister d3 = v19;
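    // 20 iterations x 4 rounds = 80 SHA-1 rounds. Iterations 0-4 use sha1c
    // (Ch), 5-9 and 15-19 use sha1p (Parity), 10-14 use sha1m (Maj). The
    // first 16 iterations also extend the message schedule with
    // sha1su0/sha1su1, rotating it through d0..d3 (v16..v19).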
 3879 
 3880     for (int round = 0; round < 20; round++) {
 3881       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3882       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3883       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3884       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3885       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3886 
 3887       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3888       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3889       __ sha1h(tmp2, __ T4S, v20);
 3890       if (round < 5)
 3891         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3892       else if (round < 10 || round >= 15)
 3893         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3894       else
 3895         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3896       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3897 
 3898       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3899     }
 3900 
 3901     __ addv(v7, __ T2S, v7, v21);
 3902     __ addv(v6, __ T4S, v6, v20);
 3903 
 3904     if (multi_block) {
 3905       __ add(ofs, ofs, 64);
 3906       __ cmp(ofs, limit);
 3907       __ br(Assembler::LE, sha1_loop);
 3908       __ mov(c_rarg0, ofs); // return ofs
 3909     }
 3910 
 3911     __ strq(v6, Address(state, 0));
 3912     __ strs(v7, Address(state, 16));
 3913 
 3914     __ ret(lr);
 3915 
 3916     __ bind(keys);
 3917     __ emit_int32(0x5a827999);
 3918     __ emit_int32(0x6ed9eba1);
 3919     __ emit_int32(0x8f1bbcdc);
 3920     __ emit_int32(0xca62c1d6);
 3921 
 3922     return start;
 3923   }
 3924 
 3925 
 3926   // Arguments:
 3927   //
 3928   // Inputs:
 3929   //   c_rarg0   - byte[]  source+offset
 3930   //   c_rarg1   - int[]   SHA.state
 3931   //   c_rarg2   - int     offset
 3932   //   c_rarg3   - int     limit
 3933   //
 3934   address generate_sha256_implCompress(StubId stub_id) {
 3935     bool multi_block;
 3936     switch (stub_id) {
 3937     case StubId::stubgen_sha256_implCompress_id:
 3938       multi_block = false;
 3939       break;
 3940     case StubId::stubgen_sha256_implCompressMB_id:
 3941       multi_block = true;
 3942       break;
 3943     default:
 3944       ShouldNotReachHere();
 3945     }
 3946 
 3947     static const uint32_t round_consts[64] = {
 3948       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3949       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3950       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3951       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3952       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3953       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3954       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3955       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3956       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3957       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3958       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3959       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3960       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3961       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3962       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3963       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3964     };
 3965 
 3966     __ align(CodeEntryAlignment);
 3967 
 3968     StubCodeMark mark(this, stub_id);
 3969     address start = __ pc();
 3970 
 3971     Register buf   = c_rarg0;
 3972     Register state = c_rarg1;
 3973     Register ofs   = c_rarg2;
 3974     Register limit = c_rarg3;
 3975 
 3976     Label sha1_loop;
 3977 
 3978     __ stpd(v8, v9, __ pre(sp, -32));
 3979     __ stpd(v10, v11, Address(sp, 16));
 3980 
 3981 // dga == v0
 3982 // dgb == v1
 3983 // dg0 == v2
 3984 // dg1 == v3
 3985 // dg2 == v4
 3986 // t0 == v6
 3987 // t1 == v7
 3988 
 3989     // load 16 keys to v16..v31
 3990     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3991     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3992     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3993     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3994     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3995 
 3996     // load 8 words (256 bits) state
 3997     __ ldpq(v0, v1, state);
 3998 
 3999     __ BIND(sha1_loop);
 4000     // load 64 bytes of data into v8..v11
 4001     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 4002     __ rev32(v8, __ T16B, v8);
 4003     __ rev32(v9, __ T16B, v9);
 4004     __ rev32(v10, __ T16B, v10);
 4005     __ rev32(v11, __ T16B, v11);
 4006 
 4007     __ addv(v6, __ T4S, v8, v16);
 4008     __ orr(v2, __ T16B, v0, v0);
 4009     __ orr(v3, __ T16B, v1, v1);
 4010 
 4011     FloatRegister d0 = v8;
 4012     FloatRegister d1 = v9;
 4013     FloatRegister d2 = v10;
 4014     FloatRegister d3 = v11;
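    // 16 iterations x 4 rounds = 64 SHA-256 rounds. Each iteration runs one
    // quad-round with sha256h/sha256h2, using a schedule + round-constant
    // sum prepared one iteration ahead (the constants sit in v16..v31); the
    // first 12 iterations also extend the message schedule to w16..w63 with
    // sha256su0/sha256su1.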
 4015 
 4016 
 4017     for (int round = 0; round < 16; round++) {
 4018       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 4019       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 4020       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 4021       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 4022 
 4023       if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
 4025       if (round < 15)
 4026         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 4027       __ sha256h(v2, __ T4S, v3, tmp2);
 4028       __ sha256h2(v3, __ T4S, v4, tmp2);
 4029       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 4030 
 4031       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4032     }
 4033 
 4034     __ addv(v0, __ T4S, v0, v2);
 4035     __ addv(v1, __ T4S, v1, v3);
 4036 
 4037     if (multi_block) {
 4038       __ add(ofs, ofs, 64);
 4039       __ cmp(ofs, limit);
 4040       __ br(Assembler::LE, sha1_loop);
 4041       __ mov(c_rarg0, ofs); // return ofs
 4042     }
 4043 
 4044     __ ldpd(v10, v11, Address(sp, 16));
 4045     __ ldpd(v8, v9, __ post(sp, 32));
 4046 
 4047     __ stpq(v0, v1, state);
 4048 
 4049     __ ret(lr);
 4050 
 4051     return start;
 4052   }
 4053 
 4054   // Double rounds for sha512.
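  // Each call performs two of the 80 SHA-512 rounds with sha512h/sha512h2.
  // For dr < 36 it also preloads the next pair of round constants into vrc1
  // (4 pairs are loaded before the loop; 4 + 36 = 40 pairs = 80 constants),
  // and for dr < 32 it extends the message schedule by two words with
  // sha512su0/sha512su1 (16 loaded + 32 * 2 generated = 80 words).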
 4055   void sha512_dround(int dr,
 4056                      FloatRegister vi0, FloatRegister vi1,
 4057                      FloatRegister vi2, FloatRegister vi3,
 4058                      FloatRegister vi4, FloatRegister vrc0,
 4059                      FloatRegister vrc1, FloatRegister vin0,
 4060                      FloatRegister vin1, FloatRegister vin2,
 4061                      FloatRegister vin3, FloatRegister vin4) {
 4062       if (dr < 36) {
 4063         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4064       }
 4065       __ addv(v5, __ T2D, vrc0, vin0);
 4066       __ ext(v6, __ T16B, vi2, vi3, 8);
 4067       __ ext(v5, __ T16B, v5, v5, 8);
 4068       __ ext(v7, __ T16B, vi1, vi2, 8);
 4069       __ addv(vi3, __ T2D, vi3, v5);
 4070       if (dr < 32) {
 4071         __ ext(v5, __ T16B, vin3, vin4, 8);
 4072         __ sha512su0(vin0, __ T2D, vin1);
 4073       }
 4074       __ sha512h(vi3, __ T2D, v6, v7);
 4075       if (dr < 32) {
 4076         __ sha512su1(vin0, __ T2D, vin2, v5);
 4077       }
 4078       __ addv(vi4, __ T2D, vi1, vi3);
 4079       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4080   }
 4081 
 4082   // Arguments:
 4083   //
 4084   // Inputs:
 4085   //   c_rarg0   - byte[]  source+offset
 4086   //   c_rarg1   - int[]   SHA.state
 4087   //   c_rarg2   - int     offset
 4088   //   c_rarg3   - int     limit
 4089   //
 4090   address generate_sha512_implCompress(StubId stub_id) {
 4091     bool multi_block;
 4092     switch (stub_id) {
 4093     case StubId::stubgen_sha512_implCompress_id:
 4094       multi_block = false;
 4095       break;
 4096     case StubId::stubgen_sha512_implCompressMB_id:
 4097       multi_block = true;
 4098       break;
 4099     default:
 4100       ShouldNotReachHere();
 4101     }
 4102 
 4103     static const uint64_t round_consts[80] = {
 4104       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 4105       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 4106       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 4107       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 4108       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 4109       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 4110       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 4111       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 4112       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 4113       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 4114       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 4115       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 4116       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 4117       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 4118       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 4119       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 4120       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 4121       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 4122       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 4123       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 4124       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 4125       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 4126       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 4127       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 4128       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 4129       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 4130       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 4131     };
 4132 
 4133     __ align(CodeEntryAlignment);
 4134 
 4135     StubCodeMark mark(this, stub_id);
 4136     address start = __ pc();
 4137 
 4138     Register buf   = c_rarg0;
 4139     Register state = c_rarg1;
 4140     Register ofs   = c_rarg2;
 4141     Register limit = c_rarg3;
 4142 
 4143     __ stpd(v8, v9, __ pre(sp, -64));
 4144     __ stpd(v10, v11, Address(sp, 16));
 4145     __ stpd(v12, v13, Address(sp, 32));
 4146     __ stpd(v14, v15, Address(sp, 48));
 4147 
 4148     Label sha512_loop;
 4149 
 4150     // load state
 4151     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4152 
 4153     // load first 4 round constants
 4154     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4155     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4156 
 4157     __ BIND(sha512_loop);
 4158     // load 128B of data into v12..v19
 4159     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4160     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4161     __ rev64(v12, __ T16B, v12);
 4162     __ rev64(v13, __ T16B, v13);
 4163     __ rev64(v14, __ T16B, v14);
 4164     __ rev64(v15, __ T16B, v15);
 4165     __ rev64(v16, __ T16B, v16);
 4166     __ rev64(v17, __ T16B, v17);
 4167     __ rev64(v18, __ T16B, v18);
 4168     __ rev64(v19, __ T16B, v19);
 4169 
 4170     __ mov(rscratch2, rscratch1);
 4171 
 4172     __ mov(v0, __ T16B, v8);
 4173     __ mov(v1, __ T16B, v9);
 4174     __ mov(v2, __ T16B, v10);
 4175     __ mov(v3, __ T16B, v11);
 4176 
 4177     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4178     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4179     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4180     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4181     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4182     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4183     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4184     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4185     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4186     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4187     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4188     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4189     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4190     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4191     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4192     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4193     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4194     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4195     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4196     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4197     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4198     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4199     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4200     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4201     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4202     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4203     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4204     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4205     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4206     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4207     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4208     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4209     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4210     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4211     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4212     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4213     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4214     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4215     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4216     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4217 
 4218     __ addv(v8, __ T2D, v8, v0);
 4219     __ addv(v9, __ T2D, v9, v1);
 4220     __ addv(v10, __ T2D, v10, v2);
 4221     __ addv(v11, __ T2D, v11, v3);
 4222 
 4223     if (multi_block) {
 4224       __ add(ofs, ofs, 128);
 4225       __ cmp(ofs, limit);
 4226       __ br(Assembler::LE, sha512_loop);
 4227       __ mov(c_rarg0, ofs); // return ofs
 4228     }
 4229 
 4230     __ st1(v8, v9, v10, v11, __ T2D, state);
 4231 
 4232     __ ldpd(v14, v15, Address(sp, 48));
 4233     __ ldpd(v12, v13, Address(sp, 32));
 4234     __ ldpd(v10, v11, Address(sp, 16));
 4235     __ ldpd(v8, v9, __ post(sp, 64));
 4236 
 4237     __ ret(lr);
 4238 
 4239     return start;
 4240   }
 4241 
 4242   // Execute one round of keccak of two computations in parallel.
 4243   // One of the states should be loaded into the lower halves of
 4244   // the vector registers v0-v24, the other should be loaded into
 4245   // the upper halves of those registers. The ld1r instruction loads
 4246   // the round constant into both halves of register v31.
 4247   // Intermediate results c0...c5 and d0...d5 are computed
 4248   // in registers v25...v30.
 4249   // All vector instructions that are used operate on both register
 4250   // halves in parallel.
 4251   // If only a single computation is needed, it suffices to load only the lower halves.
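        //
        // For reference, an illustrative scalar sketch of the Keccak-f round
        // that the vector code below implements (rho(i) and pi(i) stand for
        // the usual rotation offsets and lane permutation; they are not
        // spelled out here):
        //
        //   // theta
        //   for (i = 0; i < 5; i++)
        //     c[i] = a[i] ^ a[i+5] ^ a[i+10] ^ a[i+15] ^ a[i+20];
        //   for (i = 0; i < 5; i++)
        //     d[i] = c[(i+4)%5] ^ rol(c[(i+1)%5], 1);
        //   // rho and pi (folded into the xar instructions below)
        //   for (i = 0; i < 25; i++)
        //     b[pi(i)] = rol(a[i] ^ d[i%5], rho(i));
        //   // chi
        //   for (i = 0; i < 25; i += 5)
        //     for (j = 0; j < 5; j++)
        //       a[i+j] = b[i+j] ^ (~b[i+(j+1)%5] & b[i+(j+2)%5]);
        //   // iota
        //   a[0] ^= round_constants[round];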
 4252   void keccak_round(Register rscratch1) {
 4253   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
 4254   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
 4255   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4256   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4257   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4258   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4259   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4260   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4261   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4262   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4263 
 4264   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4265   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4266   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4267   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4268   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4269 
 4270   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4271   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
 4272   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4273   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4274   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4275   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4276   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4277   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4278   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4279   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4280   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4281   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4282   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4283   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4284   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4285   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4286   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4287   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4288   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4289   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4290   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4291   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4292   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4293   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4294   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4295 
 4296   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4297   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4298   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4299   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4300   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4301 
 4302   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4303 
 4304   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4305   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4306   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4307   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4308   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4309 
 4310   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4311   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4312   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4313   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4314   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4315 
 4316   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4317   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4318   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4319   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4320   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4321 
 4322   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4323   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4324   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4325   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4326   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4327 
 4328   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4329   }
 4330 
 4331   // Arguments:
 4332   //
 4333   // Inputs:
 4334   //   c_rarg0   - byte[]  source+offset
 4335   //   c_rarg1   - byte[]  SHA.state
 4336   //   c_rarg2   - int     block_size
 4337   //   c_rarg3   - int     offset
 4338   //   c_rarg4   - int     limit
 4339   //
 4340   address generate_sha3_implCompress(StubId stub_id) {
 4341     bool multi_block;
 4342     switch (stub_id) {
 4343     case StubId::stubgen_sha3_implCompress_id:
 4344       multi_block = false;
 4345       break;
 4346     case StubId::stubgen_sha3_implCompressMB_id:
 4347       multi_block = true;
 4348       break;
 4349     default:
 4350       ShouldNotReachHere();
 4351     }
 4352 
 4353     static const uint64_t round_consts[24] = {
 4354       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4355       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4356       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4357       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4358       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4359       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4360       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4361       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4362     };
 4363 
 4364     __ align(CodeEntryAlignment);
 4365 
 4366     StubCodeMark mark(this, stub_id);
 4367     address start = __ pc();
 4368 
 4369     Register buf           = c_rarg0;
 4370     Register state         = c_rarg1;
 4371     Register block_size    = c_rarg2;
 4372     Register ofs           = c_rarg3;
 4373     Register limit         = c_rarg4;
 4374 
 4375     Label sha3_loop, rounds24_loop;
 4376     Label sha3_512_or_sha3_384, shake128;
 4377 
 4378     __ stpd(v8, v9, __ pre(sp, -64));
 4379     __ stpd(v10, v11, Address(sp, 16));
 4380     __ stpd(v12, v13, Address(sp, 32));
 4381     __ stpd(v14, v15, Address(sp, 48));
 4382 
 4383     // load state
 4384     __ add(rscratch1, state, 32);
 4385     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4386     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4387     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4388     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4389     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4390     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4391     __ ld1(v24, __ T1D, rscratch1);
 4392 
 4393     __ BIND(sha3_loop);
 4394 
 4395     // 24 keccak rounds
 4396     __ movw(rscratch2, 24);
 4397 
 4398     // load round_constants base
 4399     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4400 
 4401     // load input
 4402     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4403     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4404     __ eor(v0, __ T8B, v0, v25);
 4405     __ eor(v1, __ T8B, v1, v26);
 4406     __ eor(v2, __ T8B, v2, v27);
 4407     __ eor(v3, __ T8B, v3, v28);
 4408     __ eor(v4, __ T8B, v4, v29);
 4409     __ eor(v5, __ T8B, v5, v30);
 4410     __ eor(v6, __ T8B, v6, v31);
 4411 
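          // The remaining words to absorb depend on the rate (block_size, in
          // bytes). The variants handled here and the bits of block_size used
          // to dispatch among them below are:
          //   72  -> SHA3-512              (bit7 == 0, bit5 == 0)
          //   104 -> SHA3-384              (bit7 == 0, bit5 == 1)
          //   136 -> SHA3-256 or SHAKE256  (bit7 == 1, bit4 == 0, bit5 == 0)
          //   144 -> SHA3-224              (bit7 == 1, bit4 == 1)
          //   168 -> SHAKE128              (bit7 == 1, bit5 == 1)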
 4412     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4413     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4414 
 4415     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4416     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4417     __ eor(v7, __ T8B, v7, v25);
 4418     __ eor(v8, __ T8B, v8, v26);
 4419     __ eor(v9, __ T8B, v9, v27);
 4420     __ eor(v10, __ T8B, v10, v28);
 4421     __ eor(v11, __ T8B, v11, v29);
 4422     __ eor(v12, __ T8B, v12, v30);
 4423     __ eor(v13, __ T8B, v13, v31);
 4424 
 4425     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4426     __ eor(v14, __ T8B, v14, v25);
 4427     __ eor(v15, __ T8B, v15, v26);
 4428     __ eor(v16, __ T8B, v16, v27);
 4429 
 4430     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4431     __ andw(c_rarg5, block_size, 48);
 4432     __ cbzw(c_rarg5, rounds24_loop);
 4433 
 4434     __ tbnz(block_size, 5, shake128);
 4435     // block_size == 144, bit5 == 0, SHA3-224
 4436     __ ldrd(v28, __ post(buf, 8));
 4437     __ eor(v17, __ T8B, v17, v28);
 4438     __ b(rounds24_loop);
 4439 
 4440     __ BIND(shake128);
 4441     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4442     __ eor(v17, __ T8B, v17, v28);
 4443     __ eor(v18, __ T8B, v18, v29);
 4444     __ eor(v19, __ T8B, v19, v30);
 4445     __ eor(v20, __ T8B, v20, v31);
 4446     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4447 
 4448     __ BIND(sha3_512_or_sha3_384);
 4449     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4450     __ eor(v7, __ T8B, v7, v25);
 4451     __ eor(v8, __ T8B, v8, v26);
 4452     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4453 
 4454     // SHA3-384
 4455     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4456     __ eor(v9,  __ T8B, v9,  v27);
 4457     __ eor(v10, __ T8B, v10, v28);
 4458     __ eor(v11, __ T8B, v11, v29);
 4459     __ eor(v12, __ T8B, v12, v30);
 4460 
 4461     __ BIND(rounds24_loop);
 4462     __ subw(rscratch2, rscratch2, 1);
 4463 
 4464     keccak_round(rscratch1);
 4465 
 4466     __ cbnzw(rscratch2, rounds24_loop);
 4467 
 4468     if (multi_block) {
 4469       __ add(ofs, ofs, block_size);
 4470       __ cmp(ofs, limit);
 4471       __ br(Assembler::LE, sha3_loop);
 4472       __ mov(c_rarg0, ofs); // return ofs
 4473     }
 4474 
 4475     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4476     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4477     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4478     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4479     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4480     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4481     __ st1(v24, __ T1D, state);
 4482 
 4483     // restore callee-saved registers
 4484     __ ldpd(v14, v15, Address(sp, 48));
 4485     __ ldpd(v12, v13, Address(sp, 32));
 4486     __ ldpd(v10, v11, Address(sp, 16));
 4487     __ ldpd(v8, v9, __ post(sp, 64));
 4488 
 4489     __ ret(lr);
 4490 
 4491     return start;
 4492   }
 4493 
 4494   // Inputs:
 4495   //   c_rarg0   - long[]  state0
 4496   //   c_rarg1   - long[]  state1
 4497   address generate_double_keccak() {
 4498     static const uint64_t round_consts[24] = {
 4499       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4500       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4501       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4502       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4503       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4504       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4505       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4506       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4507     };
 4508 
 4509     // Implements the double_keccak() method of the
 4510     // sun.security.provider.SHA3Parallel class
 4511     __ align(CodeEntryAlignment);
 4512     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4513     address start = __ pc();
 4514     __ enter();
 4515 
 4516     Register state0        = c_rarg0;
 4517     Register state1        = c_rarg1;
 4518 
 4519     Label rounds24_loop;
 4520 
 4521     // save callee-saved registers
 4522     __ stpd(v8, v9, __ pre(sp, -64));
 4523     __ stpd(v10, v11, Address(sp, 16));
 4524     __ stpd(v12, v13, Address(sp, 32));
 4525     __ stpd(v14, v15, Address(sp, 48));
 4526 
 4527     // load states
 4528     __ add(rscratch1, state0, 32);
 4529     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4530     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4531     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4532     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4533     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4534     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4535     __ ld1(v24, __ D, 0, rscratch1);
 4536     __ add(rscratch1, state1, 32);
 4537     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4538     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4539     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4540     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4541     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4542     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4543     __ ld1(v24, __ D, 1, rscratch1);
 4544 
 4545     // 24 keccak rounds
 4546     __ movw(rscratch2, 24);
 4547 
 4548     // load round_constants base
 4549     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4550 
 4551     __ BIND(rounds24_loop);
 4552     __ subw(rscratch2, rscratch2, 1);
 4553     keccak_round(rscratch1);
 4554     __ cbnzw(rscratch2, rounds24_loop);
 4555 
 4556     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4557     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4558     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4559     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4560     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4561     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4562     __ st1(v24, __ D, 0, state0);
 4563     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4564     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4565     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4566     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4567     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4568     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4569     __ st1(v24, __ D, 1, state1);
 4570 
 4571     // restore callee-saved vector registers
 4572     __ ldpd(v14, v15, Address(sp, 48));
 4573     __ ldpd(v12, v13, Address(sp, 32));
 4574     __ ldpd(v10, v11, Address(sp, 16));
 4575     __ ldpd(v8, v9, __ post(sp, 64));
 4576 
 4577     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4578     __ mov(r0, zr); // return 0
 4579     __ ret(lr);
 4580 
 4581     return start;
 4582   }
 4583 
 4584   // ChaCha20 block function.  This version parallelizes the 32-bit
 4585   // state elements on each of 16 vectors, producing 4 blocks of
 4586   // keystream at a time.
 4587   //
 4588   // state (int[16]) = c_rarg0
 4589   // keystream (byte[256]) = c_rarg1
 4590   // return - number of bytes of produced keystream (always 256)
 4591   //
 4592   // This implementation takes each 32-bit integer from the state
 4593   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4594   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4595   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4596   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4597   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4598   // operations represented by one QUARTERROUND function, we instead stack all
 4599   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4600   // and then do the same for the second set of 4 quarter rounds.  This removes
 4601   // some latency that would otherwise be incurred by waiting for an add to
 4602   // complete before performing an xor (which depends on the result of the
 4603   // add), etc. An adjustment happens between the first and second groups of 4
 4604   // quarter rounds, but this is done only in the inputs to the macro functions
 4605   // that generate the assembly instructions - these adjustments themselves are
 4606   // not part of the resulting assembly.
 4607   // The 4 registers v0-v3 are used during the quarter round operations as
 4608   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4609   // registers become the vectors involved in adding the start state back onto
 4610   // the post-QR working state.  After the adds are complete, each of the 16
 4611   // vectors write their first lane back to the keystream buffer, followed
 4612   // by the second lane from all vectors and so on.
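        //
        // For reference, one scalar quarter round from RFC 7539 (section 2.1),
        // which each group of stacked add/xor/rotate vector operations below
        // performs four times in parallel:
        //
        //   a += b; d ^= a; d <<<= 16;
        //   c += d; b ^= c; b <<<= 12;
        //   a += b; d ^= a; d <<<= 8;
        //   c += d; b ^= c; b <<<= 7;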
 4613   address generate_chacha20Block_blockpar() {
 4614     Label L_twoRounds, L_cc20_const;
 4615     __ align(CodeEntryAlignment);
 4616     StubId stub_id = StubId::stubgen_chacha20Block_id;
 4617     StubCodeMark mark(this, stub_id);
 4618     address start = __ pc();
 4619     __ enter();
 4620 
 4621     int i, j;
 4622     const Register state = c_rarg0;
 4623     const Register keystream = c_rarg1;
 4624     const Register loopCtr = r10;
 4625     const Register tmpAddr = r11;
 4626     const FloatRegister ctrAddOverlay = v28;
 4627     const FloatRegister lrot8Tbl = v29;
 4628 
 4629     // Organize SIMD registers in an array that facilitates
 4630     // putting repetitive opcodes into loop structures.  It is
 4631     // important that each grouping of 4 registers is consecutively
 4632     // numbered, as required by the multi-register instructions used
 4633     // below (e.g. ld4r, st4, etc.)
 4634     const FloatRegister workSt[16] = {
 4635          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4636         v20, v21, v22, v23, v24, v25, v26, v27
 4637     };
 4638 
 4639     // Pull in constant data.  The first 16 bytes are the add overlay
 4640     // which is applied to the vector holding the counter (state[12]).
 4641     // The second 16 bytes is the index register for the 8-bit left
 4642     // rotation tbl instruction.
 4643     __ adr(tmpAddr, L_cc20_const);
 4644     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4645 
 4646     // Load from memory and interlace across 16 SIMD registers,
 4647     // with each word from memory being broadcast to all lanes of
 4648     // each successive SIMD register.
 4649     //      Addr(0) -> all lanes of workSt[i]
 4650     //      Addr(4) -> all lanes of workSt[i + 1], etc.
 4651     __ mov(tmpAddr, state);
 4652     for (i = 0; i < 16; i += 4) {
 4653       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4654           __ post(tmpAddr, 16));
 4655     }
 4656     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4657 
 4658     // Before entering the loop, create 5 4-register arrays.  These
 4659     // will hold the 4 registers that represent the a/b/c/d fields
 4660     // in the quarter round operation.  For instance the "b" field
 4661     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4662     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4663     // since it is part of a diagonal organization.  The aSet and scratch
 4664     // register sets are defined at declaration time because they do not change
 4665     // organization at any point during the 20-round processing.
 4666     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4667     FloatRegister bSet[4];
 4668     FloatRegister cSet[4];
 4669     FloatRegister dSet[4];
 4670     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4671 
 4672     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4673     __ mov(loopCtr, 10);
 4674     __ BIND(L_twoRounds);
 4675 
 4676     // Set to columnar organization and do the following 4 quarter-rounds:
 4677     // QUARTERROUND(0, 4, 8, 12)
 4678     // QUARTERROUND(1, 5, 9, 13)
 4679     // QUARTERROUND(2, 6, 10, 14)
 4680     // QUARTERROUND(3, 7, 11, 15)
 4681     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4682     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4683     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4684 
 4685     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4686     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4687     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4688 
 4689     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4690     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4691     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4692 
 4693     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4694     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4695     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4696 
 4697     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4698     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4699     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4700 
 4701     // Set to diagonal organization and do the next 4 quarter-rounds:
 4702     // QUARTERROUND(0, 5, 10, 15)
 4703     // QUARTERROUND(1, 6, 11, 12)
 4704     // QUARTERROUND(2, 7, 8, 13)
 4705     // QUARTERROUND(3, 4, 9, 14)
 4706     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4707     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4708     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4709 
 4710     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4711     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4712     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4713 
 4714     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4715     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4716     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4717 
 4718     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4719     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4720     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4721 
 4722     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4723     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4724     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4725 
 4726     // Decrement and iterate
 4727     __ sub(loopCtr, loopCtr, 1);
 4728     __ cbnz(loopCtr, L_twoRounds);
 4729 
 4730     __ mov(tmpAddr, state);
 4731 
 4732     // Add the starting state back to the post-loop keystream
 4733     // state.  We read/interlace the state array from memory into
 4734     // 4 registers similar to what we did in the beginning.  Then
 4735     // add the counter overlay onto workSt[12] at the end.
 4736     for (i = 0; i < 16; i += 4) {
 4737       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4738       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4739       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4740       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4741       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4742     }
 4743     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4744 
 4745     // Write working state into the keystream buffer.  This is accomplished
 4746     // by taking the lane "i" from each of the four vectors and writing
 4747     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4748     // repeating with the next 4 vectors until all 16 vectors have been used.
 4749     // Then move to the next lane and repeat the process until all lanes have
 4750     // been written.
 4751     for (i = 0; i < 4; i++) {
 4752       for (j = 0; j < 16; j += 4) {
 4753         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4754             __ post(keystream, 16));
 4755       }
 4756     }
 4757 
 4758     __ mov(r0, 256);             // Return length of output keystream
 4759     __ leave();
 4760     __ ret(lr);
 4761 
 4762     // bind label and generate local constant data used by this stub
 4763     // The constant data is broken into two 128-bit segments to be loaded
 4764     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4765     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 4766     // The second 128 bits are a table constant used for 8-bit left rotations.
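          // n.b. the rotation table below selects, within every 32-bit lane,
          // the source bytes in the order 3,0,1,2 (then 7,4,5,6, and so on),
          // which when applied via tbl amounts to an 8-bit left rotation of
          // each lane.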
 4767     __ BIND(L_cc20_const);
 4768     __ emit_int64(0x0000000100000000UL);
 4769     __ emit_int64(0x0000000300000002UL);
 4770     __ emit_int64(0x0605040702010003UL);
 4771     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4772 
 4773     return start;
 4774   }
 4775 
 4776   // Helpers to schedule parallel operation bundles across vector
 4777   // register sequences of size 2, 4 or 8.
 4778 
 4779   // Implement various primitive computations across vector sequences
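        //
        // As used below, a VSeq<N> names a sequence of N SIMD registers, by
        // default consecutively numbered from a base register (for example,
        // in the Kyber stubs VSeq<8> vs1(0) covers v0..v7 and VSeq<2> vq(30)
        // covers v30/v31); vs_front() and vs_back() select the first and
        // last half of a sequence.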
 4780 
 4781   template<int N>
 4782   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4783                const VSeq<N>& v1, const VSeq<N>& v2) {
 4784     // output must not be constant
 4785     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4786     // output cannot overwrite pending inputs
 4787     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4788     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4789     for (int i = 0; i < N; i++) {
 4790       __ addv(v[i], T, v1[i], v2[i]);
 4791     }
 4792   }
 4793 
 4794   template<int N>
 4795   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4796                const VSeq<N>& v1, const VSeq<N>& v2) {
 4797     // output must not be constant
 4798     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4799     // output cannot overwrite pending inputs
 4800     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4801     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4802     for (int i = 0; i < N; i++) {
 4803       __ subv(v[i], T, v1[i], v2[i]);
 4804     }
 4805   }
 4806 
 4807   template<int N>
 4808   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4809                const VSeq<N>& v1, const VSeq<N>& v2) {
 4810     // output must not be constant
 4811     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4812     // output cannot overwrite pending inputs
 4813     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4814     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4815     for (int i = 0; i < N; i++) {
 4816       __ mulv(v[i], T, v1[i], v2[i]);
 4817     }
 4818   }
 4819 
 4820   template<int N>
 4821   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4822     // output must not be constant
 4823     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4824     // output cannot overwrite pending inputs
 4825     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4826     for (int i = 0; i < N; i++) {
 4827       __ negr(v[i], T, v1[i]);
 4828     }
 4829   }
 4830 
 4831   template<int N>
 4832   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4833                const VSeq<N>& v1, int shift) {
 4834     // output must not be constant
 4835     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4836     // output cannot overwrite pending inputs
 4837     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4838     for (int i = 0; i < N; i++) {
 4839       __ sshr(v[i], T, v1[i], shift);
 4840     }
 4841   }
 4842 
 4843   template<int N>
 4844   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4845     // output must not be constant
 4846     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4847     // output cannot overwrite pending inputs
 4848     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4849     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4850     for (int i = 0; i < N; i++) {
 4851       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4852     }
 4853   }
 4854 
 4855   template<int N>
 4856   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4857     // output must not be constant
 4858     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4859     // output cannot overwrite pending inputs
 4860     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4861     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4862     for (int i = 0; i < N; i++) {
 4863       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4864     }
 4865   }
 4866 
 4867   template<int N>
 4868   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4869     // output must not be constant
 4870     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4871     // output cannot overwrite pending inputs
 4872     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4873     for (int i = 0; i < N; i++) {
 4874       __ notr(v[i], __ T16B, v1[i]);
 4875     }
 4876   }
 4877 
 4878   template<int N>
 4879   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4880     // output must not be constant
 4881     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4882     // output cannot overwrite pending inputs
 4883     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4884     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4885     for (int i = 0; i < N; i++) {
 4886       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4887     }
 4888   }
 4889 
 4890   template<int N>
 4891   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4892     // output must not be constant
 4893     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4894     // output cannot overwrite pending inputs
 4895     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4896     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4897     for (int i = 0; i < N; i++) {
 4898       __ mlsv(v[i], T, v1[i], v2[i]);
 4899     }
 4900   }
 4901 
 4902   // load N/2 successive pairs of quadword values from memory in order
 4903   // into N successive vector registers of the sequence via the
 4904   // address supplied in base.
 4905   template<int N>
 4906   void vs_ldpq(const VSeq<N>& v, Register base) {
 4907     for (int i = 0; i < N; i += 2) {
 4908       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4909     }
 4910   }
 4911 
 4912   // load N/2 successive pairs of quadword values from memory in order
 4913   // into N vector registers of the sequence via the address supplied
 4914   // in base using post-increment addressing
 4915   template<int N>
 4916   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4917     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4918     for (int i = 0; i < N; i += 2) {
 4919       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4920     }
 4921   }
 4922 
 4923   // store N successive vector registers of the sequence into N/2
 4924   // successive pairs of quadword memory locations via the address
 4925   // supplied in base using post-increment addressing
 4926   template<int N>
 4927   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4928     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4929     for (int i = 0; i < N; i += 2) {
 4930       __ stpq(v[i], v[i+1], __ post(base, 32));
 4931     }
 4932   }
 4933 
 4934   // load N/2 pairs of quadword values from memory de-interleaved into
 4935   // N vector registers 2 at a time via the address supplied in base
 4936   // using post-increment addressing.
 4937   template<int N>
 4938   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4939     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4940     for (int i = 0; i < N; i += 2) {
 4941       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4942     }
 4943   }
 4944 
 4945   // store N vector registers interleaved into N/2 pairs of quadword
 4946   // memory locations via the address supplied in base using
 4947   // post-increment addressing.
 4948   template<int N>
 4949   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4950     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4951     for (int i = 0; i < N; i += 2) {
 4952       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4953     }
 4954   }
 4955 
 4956   // load N quadword values from memory de-interleaved into N vector
 4957   // registers 3 elements at a time via the address supplied in base.
 4958   template<int N>
 4959   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4960     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4961     for (int i = 0; i < N; i += 3) {
 4962       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4963     }
 4964   }
 4965 
 4966   // load N quadword values from memory de-interleaved into N vector
 4967   // registers 3 elements at a time via the address supplied in base
 4968   // using post-increment addressing.
 4969   template<int N>
 4970   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4971     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4972     for (int i = 0; i < N; i += 3) {
 4973       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4974     }
 4975   }
 4976 
 4977   // load N/2 pairs of quadword values from memory into N vector
 4978   // registers via the address supplied in base with each pair indexed
 4979   // using the start offset plus the corresponding entry in the
 4980   // offsets array
 4981   template<int N>
 4982   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4983     for (int i = 0; i < N/2; i++) {
 4984       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4985     }
 4986   }
 4987 
 4988   // store N vector registers into N/2 pairs of quadword memory
 4989   // locations via the address supplied in base with each pair indexed
 4990   // using the start offset plus the corresponding entry in the
 4991   // offsets array
 4992   template<int N>
 4993   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 4994     for (int i = 0; i < N/2; i++) {
 4995       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4996     }
 4997   }
 4998 
 4999   // load N single quadword values from memory into N vector registers
 5000   // via the address supplied in base with each value indexed using
 5001   // the start offset plus the corresponding entry in the offsets
 5002   // array
 5003   template<int N>
 5004   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5005                       int start, int (&offsets)[N]) {
 5006     for (int i = 0; i < N; i++) {
 5007       __ ldr(v[i], T, Address(base, start + offsets[i]));
 5008     }
 5009   }
 5010 
 5011   // store N vector registers into N single quadword memory locations
 5012   // via the address supplied in base with each value indexed using
 5013   // the start offset plus the corresponding entry in the offsets
 5014   // array
 5015   template<int N>
 5016   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5017                       int start, int (&offsets)[N]) {
 5018     for (int i = 0; i < N; i++) {
 5019       __ str(v[i], T, Address(base, start + offsets[i]));
 5020     }
 5021   }
 5022 
 5023   // load N/2 pairs of quadword values from memory de-interleaved into
 5024   // N vector registers 2 at a time via the address supplied in base
 5025   // with each pair indexed using the start offset plus the
 5026   // corresponding entry in the offsets array
 5027   template<int N>
 5028   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5029                       Register tmp, int start, int (&offsets)[N/2]) {
 5030     for (int i = 0; i < N/2; i++) {
 5031       __ add(tmp, base, start + offsets[i]);
 5032       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5033     }
 5034   }
 5035 
 5036   // store N vector registers 2 at a time interleaved into N/2 pairs
 5037   // of quadword memory locations via the address supplied in base
 5038   // with each pair indexed using the start offset plus the
 5039   // corresponding entry in the offsets array
 5040   template<int N>
 5041   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5042                       Register tmp, int start, int (&offsets)[N/2]) {
 5043     for (int i = 0; i < N/2; i++) {
 5044       __ add(tmp, base, start + offsets[i]);
 5045       __ st2(v[2*i], v[2*i+1], T, tmp);
 5046     }
 5047   }
 5048 
 5049   // Helper routines for various flavours of Montgomery multiply
 5050 
 5051   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 5052   // multiplications in parallel
 5053   //
 5054 
 5055   // See the montMul() method of the sun.security.provider.ML_DSA
 5056   // class.
 5057   //
 5058   // Computes 4x4S results or 4x8H results
 5059   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5060   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5061   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5062   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5063   // Outputs: va - 4x4S or 4x8H vector register sequences
 5064   // vb, vc, vtmp and vq must all be disjoint
 5065   // va must be disjoint from all other inputs/temps or must equal vc
 5066   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5067   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
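        //
        // For reference, the per-lane computation scheduled below is, as an
        // illustrative sketch (names follow the inline comments):
        //
        //   aHigh = hi(2 * b * c)        // sqdmulh
        //   aLow  = lo(b * c)            // mulv
        //   m     = lo(aLow * q_inv)     // mulv by the q_inv constant
        //   n     = hi(2 * m * q)        // sqdmulh by the q constant
        //   a     = (aHigh - n) / 2      // shsubv: Montgomery reduction
        //                                // of the product b * c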
 5068   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5069                    Assembler::SIMD_Arrangement T,
 5070                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5071     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5072     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5073     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5074     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5075 
 5076     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5077     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5078 
 5079     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5080 
 5081     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5082     assert(vs_disjoint(va, vb), "va and vb overlap");
 5083     assert(vs_disjoint(va, vq), "va and vq overlap");
 5084     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5085     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5086 
 5087     // schedule 4 streams of instructions across the vector sequences
 5088     for (int i = 0; i < 4; i++) {
 5089       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5090       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5091     }
 5092 
 5093     for (int i = 0; i < 4; i++) {
 5094       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5095     }
 5096 
 5097     for (int i = 0; i < 4; i++) {
 5098       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5099     }
 5100 
 5101     for (int i = 0; i < 4; i++) {
 5102       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5103     }
 5104   }
 5105 
 5106   // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
 5107   // multiplications in parallel
 5108   //
 5109 
 5110   // See the montMul() method of the sun.security.provider.ML_DSA
 5111   // class.
 5112   //
 5113   // Computes 2x4S results or 2x8H results
 5114   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5115   // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
 5116   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5117   // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
 5118   // Outputs: va - 2x4S or 2x8H vector register sequences
 5119   // vb, vc, vtmp and vq must all be disjoint
 5120   // va must be disjoint from all other inputs/temps or must equal vc
 5121   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5122   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5123   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5124                    Assembler::SIMD_Arrangement T,
 5125                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5126     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5127     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5128     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5129     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5130 
 5131     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5132     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5133 
 5134     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5135 
 5136     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5137     assert(vs_disjoint(va, vb), "va and vb overlap");
 5138     assert(vs_disjoint(va, vq), "va and vq overlap");
 5139     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5140     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5141 
 5142     // schedule 2 streams of instructions across the vector sequences
 5143     for (int i = 0; i < 2; i++) {
 5144       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5145       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5146     }
 5147 
 5148     for (int i = 0; i < 2; i++) {
 5149       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5150     }
 5151 
 5152     for (int i = 0; i < 2; i++) {
 5153       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5154     }
 5155 
 5156     for (int i = 0; i < 2; i++) {
 5157       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5158     }
 5159   }
 5160 
 5161   // Perform 16 16-bit Montgomery multiplications in parallel.
 5162   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5163                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5164     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5165     // It will assert that the register use is valid
 5166     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5167   }
 5168 
 5169   // Perform 32 16-bit Montgomery multiplications in parallel.
 5170   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5171                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5172     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5173     // It will assert that the register use is valid
 5174     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5175   }
 5176 
 5177   // Perform 64 16-bit Montgomery multiplications in parallel.
 5178   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5179                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5180     // Schedule two successive 4x8H multiplies via the montmul helper
 5181     // on the front and back halves of va, vb and vc. The helper will
 5182     // assert that the register use has no overlap conflicts on each
 5183     // individual call but we also need to ensure that the necessary
 5184     // disjoint/equality constraints are met across both calls.
 5185 
 5186     // vb, vc, vtmp and vq must be disjoint. va must either be
 5187     // disjoint from all other registers or equal vc
 5188 
 5189     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5190     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5191     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5192 
 5193     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5194     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5195 
 5196     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5197 
 5198     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5199     assert(vs_disjoint(va, vb), "va and vb overlap");
 5200     assert(vs_disjoint(va, vq), "va and vq overlap");
 5201     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5202 
 5203     // we multiply the front and back halves of each sequence 4 at a
 5204     // time because
 5205     //
 5206     // 1) we are currently only able to get 4-way instruction
 5207     // parallelism at best
 5208     //
 5209     // 2) we need registers for the constants in vq and temporary
 5210     // scratch registers to hold intermediate results so vtmp can only
 5211     // be a VSeq<4> which means we only have 4 scratch slots
 5212 
 5213     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5214     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5215   }
 5216 
 5217   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5218                                const VSeq<4>& vc,
 5219                                const VSeq<4>& vtmp,
 5220                                const VSeq<2>& vq) {
 5221     // compute a = montmul(a1, c)
 5222     kyber_montmul32(vc, va1, vc, vtmp, vq);
 5223     // output a1 = a0 - a
 5224     vs_subv(va1, __ T8H, va0, vc);
 5225     //    and a0 = a0 + a
 5226     vs_addv(va0, __ T8H, va0, vc);
 5227   }
 5228 
 5229   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5230                                const VSeq<4>& vb,
 5231                                const VSeq<4>& vtmp1,
 5232                                const VSeq<4>& vtmp2,
 5233                                const VSeq<2>& vq) {
 5234     // compute c = a0 - a1
 5235     vs_subv(vtmp1, __ T8H, va0, va1);
 5236     // output a0 = a0 + a1
 5237     vs_addv(va0, __ T8H, va0, va1);
 5238     // output a1 = b montmul c
 5239     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5240   }
 5241 
 5242   void load64shorts(const VSeq<8>& v, Register shorts) {
 5243     vs_ldpq_post(v, shorts);
 5244   }
 5245 
 5246   void load32shorts(const VSeq<4>& v, Register shorts) {
 5247     vs_ldpq_post(v, shorts);
 5248   }
 5249 
 5250   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5251     vs_stpq_post(v, tmpAddr);
 5252   }
 5253 
 5254   // Kyber NTT function.
 5255   // Implements
 5256   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5257   //
 5258   // coeffs (short[256]) = c_rarg0
 5259   // ntt_zetas (short[256]) = c_rarg1
 5260   address generate_kyberNtt() {
 5261 
 5262     __ align(CodeEntryAlignment);
 5263     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5264     StubCodeMark mark(this, stub_id);
 5265     address start = __ pc();
 5266     __ enter();
 5267 
 5268     const Register coeffs = c_rarg0;
 5269     const Register zetas = c_rarg1;
 5270 
 5271     const Register kyberConsts = r10;
 5272     const Register tmpAddr = r11;
 5273 
 5274     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5275     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5276     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5277 
 5278     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5279     // load the montmul constants
 5280     vs_ldpq(vq, kyberConsts);
 5281 
 5282     // Each level corresponds to an iteration of the outermost loop of the
 5283     // Java method seilerNTT(int[] coeffs). There are some differences
 5284     // from what is done in the seilerNTT() method, though:
 5285     // 1. The computation uses 16-bit signed values; we do not convert them
 5286     // to ints here.
 5287     // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
 5288     // this array for each level, which makes it easier to fill up the vector
 5289     // registers.
 5290     // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
 5291     // multiplications (so that there cannot be any overflow during the
 5292     // inverse NTT computation); here we use R = 2^16 so that we can use
 5293     // 16-bit arithmetic in the vector unit.
 5294     //
 5295     // On each level, we fill up the vector registers in such a way that the
 5296     // array elements that need to be multiplied by the zetas go into one
 5297     // set of vector registers while the corresponding ones that don't need to
 5298     // be multiplied, go into another set.
 5299     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5300     // registers interleaving the steps of 4 identical computations,
 5301     // each done on 8 16-bit values per register.
 5302 
 5303     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5304     // to the zetas occur in discrete blocks whose size is some multiple
 5305     // of 32.
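          //
          // Each level applies a Cooley-Tukey butterfly to pairs of
          // coefficients (a0, a1) lying a fixed distance apart; as an
          // illustrative scalar sketch:
          //
          //   t  = montmul(a1, zeta);
          //   a1 = a0 - t;
          //   a0 = a0 + t;
          //
          // Each montmul/sub/add group below performs 64 such butterflies.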
 5306 
 5307     // level 0
 5308     __ add(tmpAddr, coeffs, 256);
 5309     load64shorts(vs1, tmpAddr);
 5310     load64shorts(vs2, zetas);
 5311     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5312     __ add(tmpAddr, coeffs, 0);
 5313     load64shorts(vs1, tmpAddr);
 5314     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5315     vs_addv(vs1, __ T8H, vs1, vs2);
 5316     __ add(tmpAddr, coeffs, 0);
 5317     vs_stpq_post(vs1, tmpAddr);
 5318     __ add(tmpAddr, coeffs, 256);
 5319     vs_stpq_post(vs3, tmpAddr);
 5320     // restore montmul constants
 5321     vs_ldpq(vq, kyberConsts);
 5322     load64shorts(vs1, tmpAddr);
 5323     load64shorts(vs2, zetas);
 5324     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5325     __ add(tmpAddr, coeffs, 128);
 5326     load64shorts(vs1, tmpAddr);
 5327     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5328     vs_addv(vs1, __ T8H, vs1, vs2);
 5329     __ add(tmpAddr, coeffs, 128);
 5330     store64shorts(vs1, tmpAddr);
 5331     __ add(tmpAddr, coeffs, 384);
 5332     store64shorts(vs3, tmpAddr);
 5333 
 5334     // level 1
 5335     // restore montmul constants
 5336     vs_ldpq(vq, kyberConsts);
 5337     __ add(tmpAddr, coeffs, 128);
 5338     load64shorts(vs1, tmpAddr);
 5339     load64shorts(vs2, zetas);
 5340     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5341     __ add(tmpAddr, coeffs, 0);
 5342     load64shorts(vs1, tmpAddr);
 5343     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5344     vs_addv(vs1, __ T8H, vs1, vs2);
 5345     __ add(tmpAddr, coeffs, 0);
 5346     store64shorts(vs1, tmpAddr);
 5347     store64shorts(vs3, tmpAddr);
 5348     vs_ldpq(vq, kyberConsts);
 5349     __ add(tmpAddr, coeffs, 384);
 5350     load64shorts(vs1, tmpAddr);
 5351     load64shorts(vs2, zetas);
 5352     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5353     __ add(tmpAddr, coeffs, 256);
 5354     load64shorts(vs1, tmpAddr);
 5355     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5356     vs_addv(vs1, __ T8H, vs1, vs2);
 5357     __ add(tmpAddr, coeffs, 256);
 5358     store64shorts(vs1, tmpAddr);
 5359     store64shorts(vs3, tmpAddr);
 5360 
 5361     // level 2
 5362     vs_ldpq(vq, kyberConsts);
 5363     int offsets1[4] = { 0, 32, 128, 160 };
 5364     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5365     load64shorts(vs2, zetas);
 5366     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5367     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5369     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5370     vs_addv(vs1, __ T8H, vs1, vs2);
 5371     __ add(tmpAddr, coeffs, 0);
 5372     vs_stpq_post(vs_front(vs1), tmpAddr);
 5373     vs_stpq_post(vs_front(vs3), tmpAddr);
 5374     vs_stpq_post(vs_back(vs1), tmpAddr);
 5375     vs_stpq_post(vs_back(vs3), tmpAddr);
 5376     vs_ldpq(vq, kyberConsts);
 5377     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5378     load64shorts(vs2, zetas);
 5379     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5380     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5382     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5383     vs_addv(vs1, __ T8H, vs1, vs2);
 5384     __ add(tmpAddr, coeffs, 256);
 5385     vs_stpq_post(vs_front(vs1), tmpAddr);
 5386     vs_stpq_post(vs_front(vs3), tmpAddr);
 5387     vs_stpq_post(vs_back(vs1), tmpAddr);
 5388     vs_stpq_post(vs_back(vs3), tmpAddr);
 5389 
 5390     // level 3
 5391     vs_ldpq(vq, kyberConsts);
 5392     int offsets2[4] = { 0, 64, 128, 192 };
 5393     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5394     load64shorts(vs2, zetas);
 5395     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5396     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5397     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5398     vs_addv(vs1, __ T8H, vs1, vs2);
 5399     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5400     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5401 
 5402     vs_ldpq(vq, kyberConsts);
 5403     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5404     load64shorts(vs2, zetas);
 5405     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5406     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5407     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5408     vs_addv(vs1, __ T8H, vs1, vs2);
 5409     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5410     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5411 
 5412     // level 4
 5413     // At level 4 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
 5415 
 5416     vs_ldpq(vq, kyberConsts);
 5417     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5418     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5419     load64shorts(vs2, zetas);
 5420     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5421     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5422     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5423     vs_addv(vs1, __ T8H, vs1, vs2);
 5424     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5425     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5426 
 5427     vs_ldpq(vq, kyberConsts);
 5428     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5429     load64shorts(vs2, zetas);
 5430     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5431     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5432     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5433     vs_addv(vs1, __ T8H, vs1, vs2);
 5434     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5435     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5436 
 5437     // level 5
    // At level 5 related coefficients occur in discrete blocks of size 8, so
    // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5440 
 5441     vs_ldpq(vq, kyberConsts);
 5442     int offsets4[4] = { 0, 32, 64, 96 };
 5443     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5444     load32shorts(vs_front(vs2), zetas);
 5445     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5446     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5447     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5448     load32shorts(vs_front(vs2), zetas);
 5449     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5450     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5451     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5452     load32shorts(vs_front(vs2), zetas);
 5453     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5454     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5455 
 5456     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5457     load32shorts(vs_front(vs2), zetas);
 5458     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5459     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5460 
 5461     // level 6
    // At level 6 related coefficients occur in discrete blocks of size 4, so
    // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5464 
 5465     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5466     load32shorts(vs_front(vs2), zetas);
 5467     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5468     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5469     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5471     load32shorts(vs_front(vs2), zetas);
 5472     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5473     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5474 
 5475     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5476     load32shorts(vs_front(vs2), zetas);
 5477     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5478     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5479 
 5480     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5481     load32shorts(vs_front(vs2), zetas);
 5482     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5483     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5484 
 5485     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5486     __ mov(r0, zr); // return 0
 5487     __ ret(lr);
 5488 
 5489     return start;
 5490   }
 5491 
 5492   // Kyber Inverse NTT function
 5493   // Implements
 5494   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5495   //
 5496   // coeffs (short[256]) = c_rarg0
  // zetas (short[256]) = c_rarg1
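  // As with the forward transform, each level performs a batch of
  // butterflies, here in inverse (Gentleman-Sande) order. In rough
  // scalar terms, for a pair of coefficients at distance l (montmul as
  // sketched in generate_kyberNtt; this is a reference sketch, not the
  // exact scheduling used below):
  //
  //   short t = coeffs[j];
  //   coeffs[j] = (short)(t + coeffs[j + l]);
  //   coeffs[j + l] = montmul(zeta, (short)(t - coeffs[j + l]));
  //
  // The stub batches 32 such butterflies at a time and finishes by
  // multiplying every coefficient by toMont(2^-n mod q) (see the final
  // loop below).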
 5498   address generate_kyberInverseNtt() {
 5499 
 5500     __ align(CodeEntryAlignment);
 5501     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 5502     StubCodeMark mark(this, stub_id);
 5503     address start = __ pc();
 5504     __ enter();
 5505 
 5506     const Register coeffs = c_rarg0;
 5507     const Register zetas = c_rarg1;
 5508 
 5509     const Register kyberConsts = r10;
 5510     const Register tmpAddr = r11;
 5511     const Register tmpAddr2 = c_rarg2;
 5512 
 5513     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5514     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5515     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5516 
 5517     __ lea(kyberConsts,
 5518              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5519 
 5520     // level 0
    // At level 0 related coefficients occur in discrete blocks of size 4, so
    // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5523 
 5524     vs_ldpq(vq, kyberConsts);
 5525     int offsets4[4] = { 0, 32, 64, 96 };
 5526     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5527     load32shorts(vs_front(vs2), zetas);
 5528     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5529                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5530     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5531     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5532     load32shorts(vs_front(vs2), zetas);
 5533     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5534                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5535     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5536     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5537     load32shorts(vs_front(vs2), zetas);
 5538     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5539                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5540     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5541     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5542     load32shorts(vs_front(vs2), zetas);
 5543     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5544                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5545     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5546 
 5547     // level 1
    // At level 1 related coefficients occur in discrete blocks of size 8, so
    // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5550 
 5551     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5552     load32shorts(vs_front(vs2), zetas);
 5553     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5554                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5555     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5556     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5557     load32shorts(vs_front(vs2), zetas);
 5558     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5559                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5560     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5561 
 5562     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5563     load32shorts(vs_front(vs2), zetas);
 5564     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5565                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5566     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5567     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5568     load32shorts(vs_front(vs2), zetas);
 5569     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5570                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5571     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5572 
 5573     // level 2
 5574     // At level 2 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
 5576 
 5577     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5578     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5579     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5580     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5581     vs_subv(vs1, __ T8H, vs1, vs2);
 5582     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5583     load64shorts(vs2, zetas);
 5584     vs_ldpq(vq, kyberConsts);
 5585     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5586     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5587 
 5588     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5589     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5590     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5591     vs_subv(vs1, __ T8H, vs1, vs2);
 5592     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5593     load64shorts(vs2, zetas);
 5594     vs_ldpq(vq, kyberConsts);
 5595     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5596     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5597 
 5598     // Barrett reduction at indexes where overflow may happen
 5599 
 5600     // load q and the multiplier for the Barrett reduction
 5601     __ add(tmpAddr, kyberConsts, 16);
 5602     vs_ldpq(vq, tmpAddr);
 5603 
 5604     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5605     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5606     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
 5607     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5608     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5609     vs_sshr(vs2, __ T8H, vs2, 11);
 5610     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5611     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5612     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5613     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5614     vs_sshr(vs2, __ T8H, vs2, 11);
 5615     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5616     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5617 
 5618     // level 3
    // From level 3 upwards coefficients occur in discrete blocks whose size is
    // some multiple of 32, so they can be loaded using ldpq and suitable indexes.
 5621 
 5622     int offsets2[4] = { 0, 64, 128, 192 };
 5623     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5624     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5625     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5626     vs_subv(vs1, __ T8H, vs1, vs2);
 5627     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5628     load64shorts(vs2, zetas);
 5629     vs_ldpq(vq, kyberConsts);
 5630     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5631     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5632 
 5633     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5634     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5635     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5636     vs_subv(vs1, __ T8H, vs1, vs2);
 5637     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5638     load64shorts(vs2, zetas);
 5639     vs_ldpq(vq, kyberConsts);
 5640     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5641     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5642 
 5643     // level 4
 5644 
 5645     int offsets1[4] = { 0, 32, 128, 160 };
 5646     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5647     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5648     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5649     vs_subv(vs1, __ T8H, vs1, vs2);
 5650     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5651     load64shorts(vs2, zetas);
 5652     vs_ldpq(vq, kyberConsts);
 5653     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5654     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5655 
 5656     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5657     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5658     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5659     vs_subv(vs1, __ T8H, vs1, vs2);
 5660     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5661     load64shorts(vs2, zetas);
 5662     vs_ldpq(vq, kyberConsts);
 5663     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5664     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5665 
 5666     // level 5
 5667 
 5668     __ add(tmpAddr, coeffs, 0);
 5669     load64shorts(vs1, tmpAddr);
 5670     __ add(tmpAddr, coeffs, 128);
 5671     load64shorts(vs2, tmpAddr);
 5672     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5673     vs_subv(vs1, __ T8H, vs1, vs2);
 5674     __ add(tmpAddr, coeffs, 0);
 5675     store64shorts(vs3, tmpAddr);
 5676     load64shorts(vs2, zetas);
 5677     vs_ldpq(vq, kyberConsts);
 5678     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5679     __ add(tmpAddr, coeffs, 128);
 5680     store64shorts(vs2, tmpAddr);
 5681 
 5682     load64shorts(vs1, tmpAddr);
 5683     __ add(tmpAddr, coeffs, 384);
 5684     load64shorts(vs2, tmpAddr);
 5685     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5686     vs_subv(vs1, __ T8H, vs1, vs2);
 5687     __ add(tmpAddr, coeffs, 256);
 5688     store64shorts(vs3, tmpAddr);
 5689     load64shorts(vs2, zetas);
 5690     vs_ldpq(vq, kyberConsts);
 5691     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5692     __ add(tmpAddr, coeffs, 384);
 5693     store64shorts(vs2, tmpAddr);
 5694 
 5695     // Barrett reduction at indexes where overflow may happen
 5696 
 5697     // load q and the multiplier for the Barrett reduction
 5698     __ add(tmpAddr, kyberConsts, 16);
 5699     vs_ldpq(vq, tmpAddr);
 5700 
 5701     int offsets0[2] = { 0, 256 };
 5702     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5703     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5704     vs_sshr(vs2, __ T8H, vs2, 11);
 5705     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5706     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5707 
 5708     // level 6
 5709 
 5710     __ add(tmpAddr, coeffs, 0);
 5711     load64shorts(vs1, tmpAddr);
 5712     __ add(tmpAddr, coeffs, 256);
 5713     load64shorts(vs2, tmpAddr);
 5714     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5715     vs_subv(vs1, __ T8H, vs1, vs2);
 5716     __ add(tmpAddr, coeffs, 0);
 5717     store64shorts(vs3, tmpAddr);
 5718     load64shorts(vs2, zetas);
 5719     vs_ldpq(vq, kyberConsts);
 5720     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5721     __ add(tmpAddr, coeffs, 256);
 5722     store64shorts(vs2, tmpAddr);
 5723 
 5724     __ add(tmpAddr, coeffs, 128);
 5725     load64shorts(vs1, tmpAddr);
 5726     __ add(tmpAddr, coeffs, 384);
 5727     load64shorts(vs2, tmpAddr);
 5728     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5729     vs_subv(vs1, __ T8H, vs1, vs2);
 5730     __ add(tmpAddr, coeffs, 128);
 5731     store64shorts(vs3, tmpAddr);
 5732     load64shorts(vs2, zetas);
 5733     vs_ldpq(vq, kyberConsts);
 5734     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5735     __ add(tmpAddr, coeffs, 384);
 5736     store64shorts(vs2, tmpAddr);
 5737 
 5738     // multiply by 2^-n
 5739 
 5740     // load toMont(2^-n mod q)
 5741     __ add(tmpAddr, kyberConsts, 48);
 5742     __ ldr(v29, __ Q, tmpAddr);
 5743 
 5744     vs_ldpq(vq, kyberConsts);
 5745     __ add(tmpAddr, coeffs, 0);
 5746     load64shorts(vs1, tmpAddr);
 5747     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5748     __ add(tmpAddr, coeffs, 0);
 5749     store64shorts(vs2, tmpAddr);
 5750 
    // now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
 5752     load64shorts(vs1, tmpAddr);
 5753     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5754     __ add(tmpAddr, coeffs, 128);
 5755     store64shorts(vs2, tmpAddr);
 5756 
 5757     // now tmpAddr contains coeffs + 256
 5758     load64shorts(vs1, tmpAddr);
 5759     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5760     __ add(tmpAddr, coeffs, 256);
 5761     store64shorts(vs2, tmpAddr);
 5762 
 5763     // now tmpAddr contains coeffs + 384
 5764     load64shorts(vs1, tmpAddr);
 5765     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5766     __ add(tmpAddr, coeffs, 384);
 5767     store64shorts(vs2, tmpAddr);
 5768 
 5769     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5770     __ mov(r0, zr); // return 0
 5771     __ ret(lr);
 5772 
 5773     return start;
 5774   }
 5775 
 5776   // Kyber multiply polynomials in the NTT domain.
 5777   // Implements
 5778   // static int implKyberNttMult(
 5779   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5780   //
 5781   // result (short[256]) = c_rarg0
 5782   // ntta (short[256]) = c_rarg1
 5783   // nttb (short[256]) = c_rarg2
 5784   // zetas (short[128]) = c_rarg3
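  // In rough scalar terms, each pair of outputs computed by the loop
  // below corresponds to a product modulo (X^2 - zeta) of the pairs
  // (a0, a1) and (b0, b1), followed by a correction multiply with
  // montRSquareModQ (the R^2 mod q constant loaded into v27); montmul
  // is as sketched in generate_kyberNtt:
  //
  //   c0 = montmul(a0, b0) + montmul(montmul(a1, b1), zeta);
  //   c1 = montmul(a0, b1) + montmul(a1, b0);
  //   result0 = montmul(c0, montRSquareModQ);
  //   result1 = montmul(c1, montRSquareModQ);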
 5785   address generate_kyberNttMult() {
 5786 
 5787     __ align(CodeEntryAlignment);
 5788     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 5789     StubCodeMark mark(this, stub_id);
 5790     address start = __ pc();
 5791     __ enter();
 5792 
 5793     const Register result = c_rarg0;
 5794     const Register ntta = c_rarg1;
 5795     const Register nttb = c_rarg2;
 5796     const Register zetas = c_rarg3;
 5797 
 5798     const Register kyberConsts = r10;
 5799     const Register limit = r11;
 5800 
 5801     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 5802     VSeq<4> vs3(16), vs4(20);
 5803     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5804     VSeq<2> vz(28);          // pair of zetas
 5805     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5806 
 5807     __ lea(kyberConsts,
 5808              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5809 
 5810     Label kyberNttMult_loop;
 5811 
 5812     __ add(limit, result, 512);
 5813 
 5814     // load q and qinv
 5815     vs_ldpq(vq, kyberConsts);
 5816 
 5817     // load R^2 mod q (to convert back from Montgomery representation)
 5818     __ add(kyberConsts, kyberConsts, 64);
 5819     __ ldr(v27, __ Q, kyberConsts);
 5820 
 5821     __ BIND(kyberNttMult_loop);
 5822 
 5823     // load 16 zetas
 5824     vs_ldpq_post(vz, zetas);
 5825 
 5826     // load 2 sets of 32 coefficients from the two input arrays
 5827     // interleaved as shorts. i.e. pairs of shorts adjacent in memory
 5828     // are striped across pairs of vector registers
 5829     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5830     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5831     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5832     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 5833 
 5834     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5835     // i.e. montmul the first and second halves of vs1 in order and
 5836     // then with one sequence reversed storing the two results in vs3
 5837     //
 5838     // vs3[0] <- montmul(a0, b0)
 5839     // vs3[1] <- montmul(a1, b1)
 5840     // vs3[2] <- montmul(a0, b1)
 5841     // vs3[3] <- montmul(a1, b0)
 5842     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5843     kyber_montmul16(vs_back(vs3),
 5844                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5845 
 5846     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5847     // i.e. montmul the first and second halves of vs4 in order and
 5848     // then with one sequence reversed storing the two results in vs1
 5849     //
 5850     // vs1[0] <- montmul(a2, b2)
 5851     // vs1[1] <- montmul(a3, b3)
 5852     // vs1[2] <- montmul(a2, b3)
 5853     // vs1[3] <- montmul(a3, b2)
 5854     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5855     kyber_montmul16(vs_back(vs1),
 5856                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5857 
 5858     // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 5859     // We can schedule two montmuls at a time if we use a suitable vector
 5860     // sequence <vs3[1], vs1[1]>.
 5861     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5862     VSeq<2> vs5(vs3[1], delta);
 5863 
 5864     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5865     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5866     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5867 
 5868     // add results in pairs storing in vs3
 5869     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5870     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5871     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5872 
 5873     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5874     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5875     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 5876 
 5877     // vs1 <- montmul(vs3, montRSquareModQ)
 5878     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5879 
 5880     // store back the two pairs of result vectors de-interleaved as 8H elements
 5881     // i.e. storing each pairs of shorts striped across a register pair adjacent
 5882     // in memory
 5883     vs_st2_post(vs1, __ T8H, result);
 5884 
 5885     __ cmp(result, limit);
 5886     __ br(Assembler::NE, kyberNttMult_loop);
 5887 
 5888     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5889     __ mov(r0, zr); // return 0
 5890     __ ret(lr);
 5891 
 5892     return start;
 5893   }
 5894 
 5895   // Kyber add 2 polynomials.
 5896   // Implements
 5897   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5898   //
 5899   // result (short[256]) = c_rarg0
 5900   // a (short[256]) = c_rarg1
 5901   // b (short[256]) = c_rarg2
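  // In scalar terms the loop below computes, for each index i
  // (assuming, per the "// q" comment at the load of v31 below, that
  // the constant broadcast there is the modulus q):
  //
  //   result[i] = (short)(a[i] + b[i] + q);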
 5902   address generate_kyberAddPoly_2() {
 5903 
 5904     __ align(CodeEntryAlignment);
 5905     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 5906     StubCodeMark mark(this, stub_id);
 5907     address start = __ pc();
 5908     __ enter();
 5909 
 5910     const Register result = c_rarg0;
 5911     const Register a = c_rarg1;
 5912     const Register b = c_rarg2;
 5913 
 5914     const Register kyberConsts = r11;
 5915 
 5916     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5917     // So, we can load, add and store the data in 3 groups of 11,
 5918     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5919     // registers. A further constraint is that the mapping needs
 5920     // to skip callee saves. So, we allocate the register
 5921     // sequences using two 8 sequences, two 2 sequences and two
 5922     // single registers.
 5923     VSeq<8> vs1_1(0);
 5924     VSeq<2> vs1_2(16);
 5925     FloatRegister vs1_3 = v28;
 5926     VSeq<8> vs2_1(18);
 5927     VSeq<2> vs2_2(26);
 5928     FloatRegister vs2_3 = v29;
 5929 
 5930     // two constant vector sequences
 5931     VSeq<8> vc_1(31, 0);
 5932     VSeq<2> vc_2(31, 0);
 5933 
 5934     FloatRegister vc_3 = v31;
 5935     __ lea(kyberConsts,
 5936              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5937 
 5938     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5939     for (int i = 0; i < 3; i++) {
 5940       // load 80 or 88 values from a into vs1_1/2/3
 5941       vs_ldpq_post(vs1_1, a);
 5942       vs_ldpq_post(vs1_2, a);
 5943       if (i < 2) {
 5944         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5945       }
 5946       // load 80 or 88 values from b into vs2_1/2/3
 5947       vs_ldpq_post(vs2_1, b);
 5948       vs_ldpq_post(vs2_2, b);
 5949       if (i < 2) {
 5950         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5951       }
 5952       // sum 80 or 88 values across vs1 and vs2 into vs1
 5953       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5954       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5955       if (i < 2) {
 5956         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5957       }
 5958       // add constant to all 80 or 88 results
 5959       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5960       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5961       if (i < 2) {
 5962         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5963       }
 5964       // store 80 or 88 values
 5965       vs_stpq_post(vs1_1, result);
 5966       vs_stpq_post(vs1_2, result);
 5967       if (i < 2) {
 5968         __ str(vs1_3, __ Q, __ post(result, 16));
 5969       }
 5970     }
 5971 
 5972     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5973     __ mov(r0, zr); // return 0
 5974     __ ret(lr);
 5975 
 5976     return start;
 5977   }
 5978 
 5979   // Kyber add 3 polynomials.
 5980   // Implements
 5981   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 5982   //
 5983   // result (short[256]) = c_rarg0
 5984   // a (short[256]) = c_rarg1
 5985   // b (short[256]) = c_rarg2
 5986   // c (short[256]) = c_rarg3
 5987   address generate_kyberAddPoly_3() {
 5988 
 5989     __ align(CodeEntryAlignment);
 5990     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 5991     StubCodeMark mark(this, stub_id);
 5992     address start = __ pc();
 5993     __ enter();
 5994 
 5995     const Register result = c_rarg0;
 5996     const Register a = c_rarg1;
 5997     const Register b = c_rarg2;
 5998     const Register c = c_rarg3;
 5999 
 6000     const Register kyberConsts = r11;
 6001 
 6002     // As above we sum 256 sets of values in total i.e. 32 x 8H
 6003     // quadwords.  So, we can load, add and store the data in 3
 6004     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6005     // of 10 or 11 registers. A further constraint is that the
 6006     // mapping needs to skip callee saves. So, we allocate the
 6007     // register sequences using two 8 sequences, two 2 sequences
 6008     // and two single registers.
 6009     VSeq<8> vs1_1(0);
 6010     VSeq<2> vs1_2(16);
 6011     FloatRegister vs1_3 = v28;
 6012     VSeq<8> vs2_1(18);
 6013     VSeq<2> vs2_2(26);
 6014     FloatRegister vs2_3 = v29;
 6015 
 6016     // two constant vector sequences
 6017     VSeq<8> vc_1(31, 0);
 6018     VSeq<2> vc_2(31, 0);
 6019 
 6020     FloatRegister vc_3 = v31;
 6021 
 6022     __ lea(kyberConsts,
 6023              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6024 
 6025     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6026     for (int i = 0; i < 3; i++) {
 6027       // load 80 or 88 values from a into vs1_1/2/3
 6028       vs_ldpq_post(vs1_1, a);
 6029       vs_ldpq_post(vs1_2, a);
 6030       if (i < 2) {
 6031         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6032       }
 6033       // load 80 or 88 values from b into vs2_1/2/3
 6034       vs_ldpq_post(vs2_1, b);
 6035       vs_ldpq_post(vs2_2, b);
 6036       if (i < 2) {
 6037         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6038       }
 6039       // sum 80 or 88 values across vs1 and vs2 into vs1
 6040       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6041       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6042       if (i < 2) {
 6043         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6044       }
 6045       // load 80 or 88 values from c into vs2_1/2/3
 6046       vs_ldpq_post(vs2_1, c);
 6047       vs_ldpq_post(vs2_2, c);
 6048       if (i < 2) {
 6049         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6050       }
 6051       // sum 80 or 88 values across vs1 and vs2 into vs1
 6052       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6053       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6054       if (i < 2) {
 6055         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6056       }
 6057       // add constant to all 80 or 88 results
 6058       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6059       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6060       if (i < 2) {
 6061         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6062       }
 6063       // store 80 or 88 values
 6064       vs_stpq_post(vs1_1, result);
 6065       vs_stpq_post(vs1_2, result);
 6066       if (i < 2) {
 6067         __ str(vs1_3, __ Q, __ post(result, 16));
 6068       }
 6069     }
 6070 
 6071     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6072     __ mov(r0, zr); // return 0
 6073     __ ret(lr);
 6074 
 6075     return start;
 6076   }
 6077 
 6078   // Kyber parse XOF output to polynomial coefficient candidates
 6079   // or decodePoly(12, ...).
 6080   // Implements
 6081   // static int implKyber12To16(
 6082   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6083   //
 6084   // we assume that parsed and condensed are allocated such that for
 6085   // n = (parsedLength + 63) / 64
 6086   // n blocks of 96 bytes of input can be processed, i.e.
 6087   // index + n * 96 <= condensed.length and
 6088   // n * 64 <= parsed.length
 6089   //
 6090   // condensed (byte[]) = c_rarg0
 6091   // condensedIndex = c_rarg1
 6092   // parsed (short[]) = c_rarg2
 6093   // parsedLength = c_rarg3
 6094   address generate_kyber12To16() {
 6095     Label L_F00, L_loop;
 6096 
 6097     __ align(CodeEntryAlignment);
 6098     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6099     StubCodeMark mark(this, stub_id);
 6100     address start = __ pc();
 6101     __ enter();
 6102 
 6103     const Register condensed = c_rarg0;
 6104     const Register condensedOffs = c_rarg1;
 6105     const Register parsed = c_rarg2;
 6106     const Register parsedLength = c_rarg3;
 6107 
 6108     const Register tmpAddr = r11;
 6109 
 6110     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6111     // quadwords so we need a 6 vector sequence for the inputs.
 6112     // Parsing produces 64 shorts, employing two 8 vector
 6113     // sequences to store and combine the intermediate data.
 6114     VSeq<6> vin(24);
 6115     VSeq<8> va(0), vb(16);
 6116 
 6117     __ adr(tmpAddr, L_F00);
 6118     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6119     __ add(condensed, condensed, condensedOffs);
 6120 
 6121     __ BIND(L_loop);
 6122     // load 96 (6 x 16B) byte values
 6123     vs_ld3_post(vin, __ T16B, condensed);
 6124 
 6125     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6126     // holds 48 (16x3) contiguous bytes from memory striped
 6127     // horizontally across each of the 16 byte lanes. Equivalently,
 6128     // that is 16 pairs of 12-bit integers. Likewise the back half
 6129     // holds the next 48 bytes in the same arrangement.
 6130 
 6131     // Each vector in the front half can also be viewed as a vertical
 6132     // strip across the 16 pairs of 12 bit integers. Each byte in
 6133     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6134     // byte in vin[1] stores the high 4 bits of the first int and the
 6135     // low 4 bits of the second int. Each byte in vin[2] stores the
 6136     // high 8 bits of the second int. Likewise the vectors in second
 6137     // half.
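    // In scalar terms each group of 3 bytes (b0, b1, b2) therefore
    // unpacks into two 12-bit values as follows (a reference sketch
    // only, not generated code):
    //
    //   short s0 = (short)((b0 & 0xff) | ((b1 & 0x0f) << 8));
    //   short s1 = (short)(((b1 & 0xff) >> 4) | ((b2 & 0xff) << 4));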
 6138 
 6139     // Converting the data to 16-bit shorts requires first of all
 6140     // expanding each of the 6 x 16B vectors into 6 corresponding
 6141     // pairs of 8H vectors. Mask, shift and add operations on the
 6142     // resulting vector pairs can be used to combine 4 and 8 bit
 6143     // parts of related 8H vector elements.
 6144     //
 6145     // The middle vectors (vin[2] and vin[5]) are actually expanded
 6146     // twice, one copy manipulated to provide the lower 4 bits
 6147     // belonging to the first short in a pair and another copy
 6148     // manipulated to provide the higher 4 bits belonging to the
    // second short in a pair. This is why the vector sequences va
    // and vb used to hold the expanded 8H elements are of length 8.
 6151 
 6152     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6153     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6154     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6155     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6156     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6157     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6158     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6159     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6160 
 6161     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6162     // and vb[4:5]
 6163     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6164     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6165     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6166     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6167     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6168     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6169 
 6170     // shift lo byte of copy 1 of the middle stripe into the high byte
 6171     __ shl(va[2], __ T8H, va[2], 8);
 6172     __ shl(va[3], __ T8H, va[3], 8);
 6173     __ shl(vb[2], __ T8H, vb[2], 8);
 6174     __ shl(vb[3], __ T8H, vb[3], 8);
 6175 
 6176     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6177     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6178     // are in bit positions [4..11].
 6179     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6180     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6181     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6182     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6183 
 6184     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6185     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6186     // copy2
 6187     __ andr(va[2], __ T16B, va[2], v31);
 6188     __ andr(va[3], __ T16B, va[3], v31);
 6189     __ ushr(va[4], __ T8H, va[4], 4);
 6190     __ ushr(va[5], __ T8H, va[5], 4);
 6191     __ andr(vb[2], __ T16B, vb[2], v31);
 6192     __ andr(vb[3], __ T16B, vb[3], v31);
 6193     __ ushr(vb[4], __ T8H, vb[4], 4);
 6194     __ ushr(vb[5], __ T8H, vb[5], 4);
 6195 
 6196     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6197     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6198     // n.b. the ordering ensures: i) inputs are consumed before they
 6199     // are overwritten ii) the order of 16-bit results across successive
 6200     // pairs of vectors in va and then vb reflects the order of the
 6201     // corresponding 12-bit inputs
 6202     __ addv(va[0], __ T8H, va[0], va[2]);
 6203     __ addv(va[2], __ T8H, va[1], va[3]);
 6204     __ addv(va[1], __ T8H, va[4], va[6]);
 6205     __ addv(va[3], __ T8H, va[5], va[7]);
 6206     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6207     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6208     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6209     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6210 
 6211     // store 64 results interleaved as shorts
 6212     vs_st2_post(vs_front(va), __ T8H, parsed);
 6213     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6214 
 6215     __ sub(parsedLength, parsedLength, 64);
 6216     __ cmp(parsedLength, (u1)0);
 6217     __ br(Assembler::GT, L_loop);
 6218 
 6219     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6220     __ mov(r0, zr); // return 0
 6221     __ ret(lr);
 6222 
 6223     // bind label and generate constant data used by this stub
 6224     __ BIND(L_F00);
 6225     __ emit_int64(0x0f000f000f000f00);
 6226     __ emit_int64(0x0f000f000f000f00);
 6227 
 6228     return start;
 6229   }
 6230 
 6231   // Kyber Barrett reduce function.
 6232   // Implements
 6233   // static int implKyberBarrettReduce(short[] coeffs) {}
 6234   //
 6235   // coeffs (short[256]) = c_rarg0
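  // In rough scalar terms the reduction below is, per coefficient v
  // (with q = 3329 and kyberBarrettMultiplier taken from the constants
  // table, nominally about 2^26 / q):
  //
  //   short t = (short)((v * kyberBarrettMultiplier) >> 26);
  //   v = (short)(v - t * q);
  //
  // The vector code reaches the same >> 26 via sqdmulh (which returns
  // the high half of the doubled product, i.e. (2 * v * m) >> 16)
  // followed by a further sshr by 11.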
 6236   address generate_kyberBarrettReduce() {
 6237 
 6238     __ align(CodeEntryAlignment);
 6239     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6240     StubCodeMark mark(this, stub_id);
 6241     address start = __ pc();
 6242     __ enter();
 6243 
 6244     const Register coeffs = c_rarg0;
 6245 
 6246     const Register kyberConsts = r10;
 6247     const Register result = r11;
 6248 
 6249     // As above we process 256 sets of values in total i.e. 32 x
 6250     // 8H quadwords. So, we can load, add and store the data in 3
 6251     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6252     // of 10 or 11 registers. A further constraint is that the
 6253     // mapping needs to skip callee saves. So, we allocate the
 6254     // register sequences using two 8 sequences, two 2 sequences
 6255     // and two single registers.
 6256     VSeq<8> vs1_1(0);
 6257     VSeq<2> vs1_2(16);
 6258     FloatRegister vs1_3 = v28;
 6259     VSeq<8> vs2_1(18);
 6260     VSeq<2> vs2_2(26);
 6261     FloatRegister vs2_3 = v29;
 6262 
 6263     // we also need a pair of corresponding constant sequences
 6264 
 6265     VSeq<8> vc1_1(30, 0);
 6266     VSeq<2> vc1_2(30, 0);
 6267     FloatRegister vc1_3 = v30; // for kyber_q
 6268 
 6269     VSeq<8> vc2_1(31, 0);
 6270     VSeq<2> vc2_2(31, 0);
 6271     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6272 
 6273     __ add(result, coeffs, 0);
 6274     __ lea(kyberConsts,
 6275              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6276 
 6277     // load q and the multiplier for the Barrett reduction
 6278     __ add(kyberConsts, kyberConsts, 16);
 6279     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6280 
 6281     for (int i = 0; i < 3; i++) {
 6282       // load 80 or 88 coefficients
 6283       vs_ldpq_post(vs1_1, coeffs);
 6284       vs_ldpq_post(vs1_2, coeffs);
 6285       if (i < 2) {
 6286         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6287       }
 6288 
 6289       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6290       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6291       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6292       if (i < 2) {
 6293         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6294       }
 6295 
 6296       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6297       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6298       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6299       if (i < 2) {
 6300         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6301       }
 6302 
 6303       // vs1 <- vs1 - vs2 * kyber_q
 6304       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6305       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6306       if (i < 2) {
 6307         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6308       }
 6309 
 6310       vs_stpq_post(vs1_1, result);
 6311       vs_stpq_post(vs1_2, result);
 6312       if (i < 2) {
 6313         __ str(vs1_3, __ Q, __ post(result, 16));
 6314       }
 6315     }
 6316 
 6317     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6318     __ mov(r0, zr); // return 0
 6319     __ ret(lr);
 6320 
 6321     return start;
 6322   }
 6323 
 6324 
 6325   // Dilithium-specific montmul helper routines that generate parallel
 6326   // code for, respectively, a single 4x4s vector sequence montmul or
 6327   // two such multiplies in a row.
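  //
  // For reference, one 32-bit Montgomery multiplication modulo the
  // ML-DSA modulus q = 8380417 with R = 2^32 can be sketched in scalar
  // form as below; QINV = q^-1 mod 2^32 = 58728449 is the textbook
  // value and need not match the exact constants loaded by these stubs:
  //
  //   static int montmul(int a, int b) {          // a * b * 2^-32 mod q
  //     long c = (long)a * b;
  //     int m = (int)c * 58728449;                // c * q^-1 mod 2^32
  //     return (int)((c - (long)m * 8380417) >> 32);
  //   }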
 6328 
 6329   // Perform 16 32-bit Montgomery multiplications in parallel
 6330   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6331                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6332     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6333     // It will assert that the register use is valid
 6334     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6335   }
 6336 
 6337   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6338   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6339                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6340     // Schedule two successive 4x4S multiplies via the montmul helper
 6341     // on the front and back halves of va, vb and vc. The helper will
 6342     // assert that the register use has no overlap conflicts on each
 6343     // individual call but we also need to ensure that the necessary
 6344     // disjoint/equality constraints are met across both calls.
 6345 
 6346     // vb, vc, vtmp and vq must be disjoint. va must either be
 6347     // disjoint from all other registers or equal vc
 6348 
 6349     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6350     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6351     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6352 
 6353     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6354     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6355 
 6356     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6357 
 6358     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6359     assert(vs_disjoint(va, vb), "va and vb overlap");
 6360     assert(vs_disjoint(va, vq), "va and vq overlap");
 6361     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6362 
 6363     // We multiply the front and back halves of each sequence 4 at a
 6364     // time because
 6365     //
 6366     // 1) we are currently only able to get 4-way instruction
 6367     // parallelism at best
 6368     //
 6369     // 2) we need registers for the constants in vq and temporary
 6370     // scratch registers to hold intermediate results so vtmp can only
 6371     // be a VSeq<4> which means we only have 4 scratch slots.
 6372 
 6373     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6374     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6375   }
 6376 
 6377   // Perform combined montmul then add/sub on 4x4S vectors.
 6378   void dilithium_montmul16_sub_add(
 6379           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6380           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6381     // compute a = montmul(a1, c)
 6382     dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
 6384     vs_subv(va1, __ T4S, va0, vc);
 6385     //    and a0 = a0 + a
 6386     vs_addv(va0, __ T4S, va0, vc);
 6387   }
 6388 
  // Perform combined add/sub then montmul on 4x4S vectors.
 6390   void dilithium_sub_add_montmul16(
 6391           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6392           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6393     // compute c = a0 - a1
 6394     vs_subv(vtmp1, __ T4S, va0, va1);
 6395     // output a0 = a0 + a1
 6396     vs_addv(va0, __ T4S, va0, va1);
 6397     // output a1 = b montmul c
 6398     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6399   }
 6400 
 6401   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6402   // in the Java implementation come in sequences of at least 8, so we
 6403   // can use ldpq to collect the corresponding data into pairs of vector
 6404   // registers.
  // We collect the coefficients corresponding to the 'j+l' indexes into
  // vector registers v0-v7 and the zetas into vector registers v16-v23.
  // We then do the (Montgomery) multiplications by the zetas in parallel
  // into v16-v23, load the coeffs corresponding to the 'j' indexes into
  // v0-v7, do the additions into v24-v31 and the subtractions into
  // v0-v7, and finally save the results back to the coeffs array.
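  //
  // The scalar shape of each butterfly handled here is roughly (montmul
  // as sketched above; l halves at each successive level):
  //
  //   int t = montmul(zeta, coeffs[j + l]);
  //   coeffs[j + l] = coeffs[j] - t;
  //   coeffs[j] = coeffs[j] + t;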
 6411   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6412     const Register coeffs, const Register zetas) {
 6413     int c1 = 0;
 6414     int c2 = 512;
 6415     int startIncr;
 6416     // don't use callee save registers v8 - v15
 6417     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6418     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6419     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6420     int offsets[4] = { 0, 32, 64, 96 };
 6421 
 6422     for (int level = 0; level < 5; level++) {
 6423       int c1Start = c1;
 6424       int c2Start = c2;
 6425       if (level == 3) {
 6426         offsets[1] = 32;
 6427         offsets[2] = 128;
 6428         offsets[3] = 160;
 6429       } else if (level == 4) {
 6430         offsets[1] = 64;
 6431         offsets[2] = 128;
 6432         offsets[3] = 192;
 6433       }
 6434 
      // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
 6436       // time at 4 different offsets and multiply them in order by the
 6437       // next set of input values. So we employ indexed load and store
 6438       // pair instructions with arrangement 4S.
 6439       for (int i = 0; i < 4; i++) {
 6440         // reload q and qinv
 6441         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6442         // load 8x4S coefficients via second start pos == c2
 6443         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6444         // load next 8x4S inputs == b
 6445         vs_ldpq_post(vs2, zetas);
 6446         // compute a == c2 * b mod MONT_Q
 6447         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6448         // load 8x4s coefficients via first start pos == c1
 6449         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6450         // compute a1 =  c1 + a
 6451         vs_addv(vs3, __ T4S, vs1, vs2);
 6452         // compute a2 =  c1 - a
 6453         vs_subv(vs1, __ T4S, vs1, vs2);
 6454         // output a1 and a2
 6455         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6456         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6457 
 6458         int k = 4 * level + i;
 6459 
 6460         if (k > 7) {
 6461           startIncr = 256;
 6462         } else if (k == 5) {
 6463           startIncr = 384;
 6464         } else {
 6465           startIncr = 128;
 6466         }
 6467 
 6468         c1Start += startIncr;
 6469         c2Start += startIncr;
 6470       }
 6471 
 6472       c2 /= 2;
 6473     }
 6474   }
 6475 
 6476   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6477   // Implements the method
 6478   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
  // of the Java class sun.security.provider.ML_DSA
 6480   //
 6481   // coeffs (int[256]) = c_rarg0
 6482   // zetas (int[256]) = c_rarg1
 6483   address generate_dilithiumAlmostNtt() {
 6484 
 6485     __ align(CodeEntryAlignment);
 6486     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 6487     StubCodeMark mark(this, stub_id);
 6488     address start = __ pc();
 6489     __ enter();
 6490 
 6491     const Register coeffs = c_rarg0;
 6492     const Register zetas = c_rarg1;
 6493 
 6494     const Register tmpAddr = r9;
 6495     const Register dilithiumConsts = r10;
 6496     const Register result = r11;
 6497     // don't use callee save registers v8 - v15
 6498     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6499     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6500     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6501     int offsets[4] = { 0, 32, 64, 96};
 6502     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6503     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6504     __ add(result, coeffs, 0);
 6505     __ lea(dilithiumConsts,
 6506              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6507 
 6508     // Each level represents one iteration of the outer for loop of the Java version.
 6509 
 6510     // level 0-4
 6511     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6512 
 6513     // level 5
 6514 
 6515     // At level 5 the coefficients we need to combine with the zetas
 6516     // are grouped in memory in blocks of size 4. So, for both sets of
 6517     // coefficients we load 4 adjacent values at 8 different offsets
 6518     // using an indexed ldr with register variant Q and multiply them
 6519     // in sequence order by the next set of inputs. Likewise we store
    // the results using an indexed str with register variant Q.
 6521     for (int i = 0; i < 1024; i += 256) {
 6522       // reload constants q, qinv each iteration as they get clobbered later
 6523       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6524       // load 32 (8x4S) coefficients via first offsets = c1
 6525       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6526       // load next 32 (8x4S) inputs = b
 6527       vs_ldpq_post(vs2, zetas);
      // a = b montmul c1
 6529       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6530       // load 32 (8x4S) coefficients via second offsets = c2
 6531       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6532       // add/sub with result of multiply
      vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
      vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 6535       // write back new coefficients using same offsets
 6536       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6537       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6538     }
 6539 
 6540     // level 6
    // At level 6 the coefficients we need to combine with the zetas
    // are grouped in memory in pairs, the first of each pair being
    // add/sub inputs and the second montmul inputs. We can still
    // implement the montmul+sub+add using 4-way parallelism but only
    // if we combine the coefficients with the zetas 16 at a time. We
    // load 8 adjacent values at 4 different offsets using an ld2 load
    // with arrangement 2D. That interleaves the lower and upper halves
    // of each pair of quadwords into successive vector registers. We
    // then need to montmul the 4 odd elements of the coefficients
    // register sequence by the zetas in order and then add/sub the
    // result with the 4 even elements. We use an equivalent st2
    // operation to store the results back into memory de-interleaved.
 6554     for (int i = 0; i < 1024; i += 128) {
 6555       // reload constants q, qinv each iteration as they get clobbered later
 6556       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6557       // load interleaved 16 (4x2D) coefficients via offsets
 6558       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6559       // load next 16 (4x4S) inputs
 6560       vs_ldpq_post(vs_front(vs2), zetas);
 6561       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6562       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6563                                   vs_front(vs2), vtmp, vq);
 6564       // store interleaved 16 (4x2D) coefficients via offsets
 6565       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6566     }
 6567 
 6568     // level 7
    // At level 7 the coefficients we need to combine with the zetas
    // occur singly, with add/sub inputs alternating with montmul
    // inputs. Once again we can use 4-way parallelism to combine 16
    // zetas at a time. However, we have to load 8 adjacent values at
    // 4 different offsets using an ld2 load with arrangement 4S. That
    // interleaves the even words of each pair into one coefficients
    // vector register and the odd words of the pair into the next
    // register. We then need to montmul the 4 odd elements of the
    // coefficients register sequence by the zetas in order and then
    // add/sub the result with the 4 even elements. We use an
    // equivalent st2 operation to store the results back into memory
    // de-interleaved.
 6581 
 6582     for (int i = 0; i < 1024; i += 128) {
 6583       // reload constants q, qinv each iteration as they get clobbered later
 6584       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6585       // load interleaved 16 (4x4S) coefficients via offsets
 6586       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6587       // load next 16 (4x4S) inputs
 6588       vs_ldpq_post(vs_front(vs2), zetas);
 6589       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6590       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6591                                   vs_front(vs2), vtmp, vq);
 6592       // store interleaved 16 (4x4S) coefficients via offsets
 6593       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6594     }
 6595     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6596     __ mov(r0, zr); // return 0
 6597     __ ret(lr);
 6598 
 6599     return start;
 6600   }
 6601 
 6602   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6603   // in the Java implementation come in sequences of at least 8, so we
 6604   // can use ldpq to collect the corresponding data into pairs of vector
 6605   // registers.
 6606   // We collect the coefficients that correspond to the 'j's into vs1 and
 6607   // the coefficients that correspond to the 'j+l's into vs2, then
 6608   // do the additions into vs3 and the subtractions into vs1, then
 6609   // save the result of the additions, load the zetas into vs2,
 6610   // do the (Montgomery) multiplications by zeta in parallel into vs2,
 6611   // and finally save the results back to the coeffs array.
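        //
        // A rough scalar sketch of what one such level computes (Java-like
        // pseudocode; montMul and the zeta/index bookkeeping are illustrative,
        // not the exact names used in the Java code):
        //   int a         = coeffs[j];
        //   coeffs[j]     = a + coeffs[j + l];                    // additions (vs3)
        //   coeffs[j + l] = montMul(a - coeffs[j + l], zetas[k]); // subtractions (vs1), montmul into vs2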
 6612   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6613     const Register coeffs, const Register zetas) {
 6614     int c1 = 0;
 6615     int c2 = 32;
 6616     int startIncr;
 6617     int offsets[4];
 6618     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6619     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6620     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6621 
 6622     offsets[0] = 0;
 6623 
 6624     for (int level = 3; level < 8; level++) {
 6625       int c1Start = c1;
 6626       int c2Start = c2;
 6627       if (level == 3) {
 6628         offsets[1] = 64;
 6629         offsets[2] = 128;
 6630         offsets[3] = 192;
 6631       } else if (level == 4) {
 6632         offsets[1] = 32;
 6633         offsets[2] = 128;
 6634         offsets[3] = 160;
 6635       } else {
 6636         offsets[1] = 32;
 6637         offsets[2] = 64;
 6638         offsets[3] = 96;
 6639       }
 6640 
 6641       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6642       // time at 4 different offsets and multiply them in order by the
 6643       // next set of input values. So we employ indexed load and store
 6644       // pair instructions with arrangement 4S.
 6645       for (int i = 0; i < 4; i++) {
 6646         // load v1 32 (8x4S) coefficients relative to first start index
 6647         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6648         // load v2 32 (8x4S) coefficients relative to second start index
 6649         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 6650         // a0 = v1 + v2 -- n.b. clobbers vqs
 6651         vs_addv(vs3, __ T4S, vs1, vs2);
 6652         // a1 = v1 - v2
 6653         vs_subv(vs1, __ T4S, vs1, vs2);
 6654         // save a0 relative to first start index
 6655         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6656         // load constants q, qinv each iteration as they get clobbered above
 6657         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6658         // load b next 32 (8x4S) inputs
 6659         vs_ldpq_post(vs2, zetas);
 6660         // a = a1 montmul b
 6661         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6662         // save a relative to second start index
 6663         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6664 
 6665         int k = 4 * level + i;
 6666 
 6667         if (k < 24) {
 6668           startIncr = 256;
 6669         } else if (k == 25) {
 6670           startIncr = 384;
 6671         } else {
 6672           startIncr = 128;
 6673         }
 6674 
 6675         c1Start += startIncr;
 6676         c2Start += startIncr;
 6677       }
 6678 
 6679       c2 *= 2;
 6680     }
 6681   }
 6682 
 6683   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6684   // Implements the method
 6685   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6686   // the sun.security.provider.ML_DSA class.
 6687   //
 6688   // coeffs (int[256]) = c_rarg0
 6689   // zetas (int[256]) = c_rarg1
 6690   address generate_dilithiumAlmostInverseNtt() {
 6691 
 6692     __ align(CodeEntryAlignment);
 6693     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 6694     StubCodeMark mark(this, stub_id);
 6695     address start = __ pc();
 6696     __ enter();
 6697 
 6698     const Register coeffs = c_rarg0;
 6699     const Register zetas = c_rarg1;
 6700 
 6701     const Register tmpAddr = r9;
 6702     const Register dilithiumConsts = r10;
 6703     const Register result = r11;
 6704     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6705     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 6706     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6707     int offsets[4] = { 0, 32, 64, 96 };
 6708     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6709     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6710 
 6711     __ add(result, coeffs, 0);
 6712     __ lea(dilithiumConsts,
 6713              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6714 
 6715     // Each level represents one iteration of the outer for loop of the Java version
 6716 
 6717     // level 0
 6718     // At level 0 we need to interleave adjacent quartets of
 6719     // coefficients before we multiply and add/sub by the next 16
 6720     // zetas just as we did for level 7 in the multiply code. So we
 6721     // load and store the values using an ld2/st2 with arrangement 4S.
 6722     for (int i = 0; i < 1024; i += 128) {
 6723       // load constants q, qinv
 6724       // n.b. this can be moved out of the loop as they do not get
 6725       // clobbered by first two loops
 6726       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6727       // a0/a1 load interleaved 32 (8x4S) coefficients
 6728       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6729       // b load next 32 (8x4S) inputs
 6730       vs_ldpq_post(vs_front(vs2), zetas);
 6731       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6732       // n.b. second half of vs2 provides temporary register storage
 6733       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6734                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6735       // a0/a1 store interleaved 32 (8x4S) coefficients
 6736       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6737     }
 6738 
 6739     // level 1
 6740     // At level 1 we need to interleave pairs of adjacent pairs of
 6741     // coefficients before we multiply by the next 16 zetas just as we
 6742     // did for level 6 in the multiply code. So we load and store the
 6743     // values using an ld2/st2 with arrangement 2D.
 6744     for (int i = 0; i < 1024; i += 128) {
 6745       // a0/a1 load interleaved 32 (8x2D) coefficients
 6746       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6747       // b load next 16 (4x4S) inputs
 6748       vs_ldpq_post(vs_front(vs2), zetas);
 6749       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6750       // n.b. second half of vs2 provides temporary register storage
 6751       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6752                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6753       // a0/a1 store interleaved 32 (8x2D) coefficients
 6754       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6755     }
 6756 
 6757     // level 2
 6758     // At level 2 coefficients come in blocks of 4. So, we load 4
 6759     // adjacent coefficients at 8 distinct offsets for both the first
 6760     // and second coefficient sequences, using an ldr with register
 6761     // variant Q then combine them with next set of 32 zetas. Likewise
 6762     // we store the results using an str with register variant Q.
 6763     for (int i = 0; i < 1024; i += 256) {
 6764       // c0 load 32 (8x4S) coefficients via first offsets
 6765       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6766       // c1 load 32 (8x4S) coefficients via second offsets
 6767       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6768       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6769       vs_addv(vs3, __ T4S, vs1, vs2);
 6770       // c = c0 - c1
 6771       vs_subv(vs1, __ T4S, vs1, vs2);
 6772       // store a0 32 (8x4S) coefficients via first offsets
 6773       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6774       // b load 32 (8x4S) next inputs
 6775       vs_ldpq_post(vs2, zetas);
 6776       // reload constants q, qinv -- they were clobbered earlier
 6777       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6778       // compute a1 = b montmul c
 6779       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6780       // store a1 32 (8x4S) coefficients via second offsets
 6781       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6782     }
 6783 
 6784     // level 3-7
 6785     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6786 
 6787     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6788     __ mov(r0, zr); // return 0
 6789     __ ret(lr);
 6790 
 6791     return start;
 6792   }
 6793 
 6794   // Dilithium multiply polynomials in the NTT domain.
 6795   // Straightforward implementation of the method
 6796   // static int implDilithiumNttMult(
 6797   //              int[] result, int[] ntta, int[] nttb) {} of
 6798   // the sun.security.provider.ML_DSA class.
 6799   //
 6800   // result (int[256]) = c_rarg0
 6801   // poly1 (int[256]) = c_rarg1
 6802   // poly2 (int[256]) = c_rarg2
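        //
        // A worked note on the double montmul below (assuming, as the constant
        // layout suggests, that montmul(x, y) computes x * y * R^-1 mod q with
        // R the Montgomery radix and rSquare = R^2 mod q):
        //   montmul(a, b)                   = a * b * R^-1             (mod q)
        //   montmul(montmul(a, b), rSquare) = a * b * R^-1 * R^2 * R^-1
        //                                   = a * b                    (mod q)
        // i.e. the second montmul by rSquare cancels the Montgomery factor.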
 6803   address generate_dilithiumNttMult() {
 6804 
 6805     __ align(CodeEntryAlignment);
 6806     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 6807     StubCodeMark mark(this, stub_id);
 6808     address start = __ pc();
 6809     __ enter();
 6810 
 6811     Label L_loop;
 6812 
 6813     const Register result = c_rarg0;
 6814     const Register poly1 = c_rarg1;
 6815     const Register poly2 = c_rarg2;
 6816 
 6817     const Register dilithiumConsts = r10;
 6818     const Register len = r11;
 6819 
 6820     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6821     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6822     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6823     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6824 
 6825     __ lea(dilithiumConsts,
 6826              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6827 
 6828     // load constants q, qinv
 6829     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6830     // load constant rSquare into v29
 6831     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 6832 
 6833     __ mov(len, zr);
 6834     __ add(len, len, 1024);
 6835 
 6836     __ BIND(L_loop);
 6837 
 6838     // b load 32 (8x4S) next inputs from poly1
 6839     vs_ldpq_post(vs1, poly1);
 6840     // c load 32 (8x4S) next inputs from poly2
 6841     vs_ldpq_post(vs2, poly2);
 6842     // compute a = b montmul c
 6843     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6844     // compute a = rsquare montmul a
 6845     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6846     // save a 32 (8x4S) results
 6847     vs_stpq_post(vs2, result);
 6848 
 6849     __ sub(len, len, 128);
 6850     __ cmp(len, (u1)128);
 6851     __ br(Assembler::GE, L_loop);
 6852 
 6853     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6854     __ mov(r0, zr); // return 0
 6855     __ ret(lr);
 6856 
 6857     return start;
 6858   }
 6859 
 6860   // Dilithium Montgomery multiply an array by a constant.
 6861   // A straightforward implementation of the method
 6862   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
 6863   // of the sun.security.provider.ML_DSA class
 6864   //
 6865   // coeffs (int[256]) = c_rarg0
 6866   // constant (int) = c_rarg1
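        //
        // In effect (a scalar sketch; montMul is illustrative):
        //   for (int i = 0; i < 256; i++) {
        //     coeffs[i] = montMul(coeffs[i], constant);
        //   }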
 6867   address generate_dilithiumMontMulByConstant() {
 6868 
 6869     __ align(CodeEntryAlignment);
 6870     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 6871     StubCodeMark mark(this, stub_id);
 6872     address start = __ pc();
 6873     __ enter();
 6874 
 6875     Label L_loop;
 6876 
 6877     const Register coeffs = c_rarg0;
 6878     const Register constant = c_rarg1;
 6879 
 6880     const Register dilithiumConsts = r10;
 6881     const Register result = r11;
 6882     const Register len = r12;
 6883 
 6884     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6885     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6886     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6887     VSeq<8> vconst(29, 0);             // for montmul by constant
 6888 
 6889     // results track inputs
 6890     __ add(result, coeffs, 0);
 6891     __ lea(dilithiumConsts,
 6892              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6893 
 6894     // load constants q, qinv -- they do not get clobbered by first two loops
 6895     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6896     // copy caller supplied constant across vconst
 6897     __ dup(vconst[0], __ T4S, constant);
 6898     __ mov(len, zr);
 6899     __ add(len, len, 1024);
 6900 
 6901     __ BIND(L_loop);
 6902 
 6903     // load next 32 inputs
 6904     vs_ldpq_post(vs2, coeffs);
 6905     // mont mul by constant
 6906     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6907     // write next 32 results
 6908     vs_stpq_post(vs2, result);
 6909 
 6910     __ sub(len, len, 128);
 6911     __ cmp(len, (u1)128);
 6912     __ br(Assembler::GE, L_loop);
 6913 
 6914     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6915     __ mov(r0, zr); // return 0
 6916     __ ret(lr);
 6917 
 6918     return start;
 6919   }
 6920 
 6921   // Dilithium decompose poly.
 6922   // Implements the method
 6923   // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {}
 6924   // of the sun.security.provider.ML_DSA class
 6925   //
 6926   // input (int[256]) = c_rarg0
 6927   // lowPart (int[256]) = c_rarg1
 6928   // highPart (int[256]) = c_rarg2
 6929   // twoGamma2  (int) = c_rarg3
 6930   // multiplier (int) = c_rarg4
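        //
        // The loop below is a vectorized form of the ML-DSA decompose step
        // (FIPS 204): each coefficient r is reduced mod q and then split as
        //   r = r1 * twoGamma2 + r0   with  -gamma2 < r0 <= gamma2,
        // except that when r - r0 == q - 1 the pair is adjusted to r1 = 0 and
        // r0 = r0 - 1. r0 is written to lowPart and r1 to highPart; the inline
        // comments mirror the scalar Java statements being vectorized.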
 6931   address generate_dilithiumDecomposePoly() {
 6932 
 6933     __ align(CodeEntryAlignment);
 6934     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 6935     StubCodeMark mark(this, stub_id);
 6936     address start = __ pc();
 6937     Label L_loop;
 6938 
 6939     const Register input = c_rarg0;
 6940     const Register lowPart = c_rarg1;
 6941     const Register highPart = c_rarg2;
 6942     const Register twoGamma2 = c_rarg3;
 6943     const Register multiplier = c_rarg4;
 6944 
 6945     const Register len = r9;
 6946     const Register dilithiumConsts = r10;
 6947     const Register tmp = r11;
 6948 
 6949     // 6 independent sets of 4x4s values
 6950     VSeq<4> vs1(0), vs2(4), vs3(8);
 6951     VSeq<4> vs4(12), vs5(16), vtmp(20);
 6952 
 6953     // 7 constants for cross-multiplying
 6954     VSeq<4> one(25, 0);
 6955     VSeq<4> qminus1(26, 0);
 6956     VSeq<4> g2(27, 0);
 6957     VSeq<4> twog2(28, 0);
 6958     VSeq<4> mult(29, 0);
 6959     VSeq<4> q(30, 0);
 6960     VSeq<4> qadd(31, 0);
 6961 
 6962     __ enter();
 6963 
 6964     __ lea(dilithiumConsts,
 6965              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6966 
 6967     // save callee-saved registers
 6968     __ stpd(v8, v9, __ pre(sp, -64));
 6969     __ stpd(v10, v11, Address(sp, 16));
 6970     __ stpd(v12, v13, Address(sp, 32));
 6971     __ stpd(v14, v15, Address(sp, 48));
 6972 
 6973     // populate constant registers
 6974     __ mov(tmp, zr);
 6975     __ add(tmp, tmp, 1);
 6976     __ dup(one[0], __ T4S, tmp); // 1
 6977     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 6978     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 6979     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 6980     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 6981     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 6982     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 6983 
 6984     __ mov(len, zr);
 6985     __ add(len, len, 1024);
 6986 
 6987     __ BIND(L_loop);
 6988 
 6989     // load next 4x4S inputs interleaved: rplus --> vs1
 6990     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 6991 
 6992     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 6993     vs_addv(vtmp, __ T4S, vs1, qadd);
 6994     vs_sshr(vtmp, __ T4S, vtmp, 23);
 6995     vs_mulv(vtmp, __ T4S, vtmp, q);
 6996     vs_subv(vs1, __ T4S, vs1, vtmp);
 6997 
 6998     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 6999     vs_sshr(vtmp, __ T4S, vs1, 31);
 7000     vs_andr(vtmp, vtmp, q);
 7001     vs_addv(vs1, __ T4S, vs1, vtmp);
 7002 
 7003     // quotient --> vs2
 7004     // int quotient = (rplus * multiplier) >> 22;
 7005     vs_mulv(vtmp, __ T4S, vs1, mult);
 7006     vs_sshr(vs2, __ T4S, vtmp, 22);
 7007 
 7008     // r0 --> vs3
 7009     // int r0 = rplus - quotient * twoGamma2;
 7010     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7011     vs_subv(vs3, __ T4S, vs1, vtmp);
 7012 
 7013     // mask --> vs4
 7014     // int mask = (twoGamma2 - r0) >> 22;
 7015     vs_subv(vtmp, __ T4S, twog2, vs3);
 7016     vs_sshr(vs4, __ T4S, vtmp, 22);
 7017 
 7018     // r0 -= (mask & twoGamma2);
 7019     vs_andr(vtmp, vs4, twog2);
 7020     vs_subv(vs3, __ T4S, vs3, vtmp);
 7021 
 7022     //  quotient += (mask & 1);
 7023     vs_andr(vtmp, vs4, one);
 7024     vs_addv(vs2, __ T4S, vs2, vtmp);
 7025 
 7026     // mask = (twoGamma2 / 2 - r0) >> 31;
 7027     vs_subv(vtmp, __ T4S, g2, vs3);
 7028     vs_sshr(vs4, __ T4S, vtmp, 31);
 7029 
 7030     // r0 -= (mask & twoGamma2);
 7031     vs_andr(vtmp, vs4, twog2);
 7032     vs_subv(vs3, __ T4S, vs3, vtmp);
 7033 
 7034     // quotient += (mask & 1);
 7035     vs_andr(vtmp, vs4, one);
 7036     vs_addv(vs2, __ T4S, vs2, vtmp);
 7037 
 7038     // r1 --> vs5
 7039     // int r1 = rplus - r0 - (dilithium_q - 1);
 7040     vs_subv(vtmp, __ T4S, vs1, vs3);
 7041     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7042 
 7043     // r1 --> vs1 (overwriting rplus)
 7044     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7045     vs_negr(vtmp, __ T4S, vs5);
 7046     vs_orr(vtmp, vs5, vtmp);
 7047     vs_sshr(vs1, __ T4S, vtmp, 31);
 7048 
 7049     // r0 += ~r1;
 7050     vs_notr(vtmp, vs1);
 7051     vs_addv(vs3, __ T4S, vs3, vtmp);
 7052 
 7053     // r1 = r1 & quotient;
 7054     vs_andr(vs1, vs2, vs1);
 7055 
 7056     // store results interleaved
 7057     // lowPart[m] = r0;
 7058     // highPart[m] = r1;
 7059     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7060     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7061 
 7062     __ sub(len, len, 64);
 7063     __ cmp(len, (u1)64);
 7064     __ br(Assembler::GE, L_loop);
 7065 
 7066     // restore callee-saved vector registers
 7067     __ ldpd(v14, v15, Address(sp, 48));
 7068     __ ldpd(v12, v13, Address(sp, 32));
 7069     __ ldpd(v10, v11, Address(sp, 16));
 7070     __ ldpd(v8, v9, __ post(sp, 64));
 7071 
 7072     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7073     __ mov(r0, zr); // return 0
 7074     __ ret(lr);
 7075 
 7076     return start;
 7077   }
 7078 
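        // Keccak chi step on one 5-lane row, using general-purpose registers:
        // each lane becomes a_i ^= (~a_{i+1} & a_{i+2}) (indices mod 5), which
        // is what the bic/eor pairs below compute. This is the scalar
        // counterpart of the vector bcax instruction, hence the name.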
 7079   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7080              Register tmp0, Register tmp1, Register tmp2) {
 7081     __ bic(tmp0, a2, a1); // for a0
 7082     __ bic(tmp1, a3, a2); // for a1
 7083     __ bic(tmp2, a4, a3); // for a2
 7084     __ eor(a2, a2, tmp2);
 7085     __ bic(tmp2, a0, a4); // for a3
 7086     __ eor(a3, a3, tmp2);
 7087     __ bic(tmp2, a1, a0); // for a4
 7088     __ eor(a0, a0, tmp0);
 7089     __ eor(a1, a1, tmp1);
 7090     __ eor(a4, a4, tmp2);
 7091   }
 7092 
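        // One complete Keccak-f[1600] round over the 25 lanes held in a0..a24:
        // theta (the eor3/rax1 block computing the column parities and d values
        // and applying them lane by lane), rho and pi (the block of rol
        // instructions, which rotates and permutes the lanes), chi (the five
        // bcax5 calls) and iota (xor of the next round constant loaded,
        // post-incremented, from rc).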
 7093   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7094                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7095                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7096                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7097                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7098                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7099                         Register tmp0, Register tmp1, Register tmp2) {
 7100     __ eor3(tmp1, a4, a9, a14);
 7101     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7102     __ eor3(tmp2, a1, a6, a11);
 7103     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7104     __ rax1(tmp2, tmp0, tmp1); // d0
 7105     {
 7106 
 7107       Register tmp3, tmp4;
 7108       if (can_use_fp && can_use_r18) {
 7109         tmp3 = rfp;
 7110         tmp4 = r18_tls;
 7111       } else {
 7112         tmp3 = a4;
 7113         tmp4 = a9;
 7114         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7115       }
 7116 
 7117       __ eor3(tmp3, a0, a5, a10);
 7118       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7119       __ eor(a0, a0, tmp2);
 7120       __ eor(a5, a5, tmp2);
 7121       __ eor(a10, a10, tmp2);
 7122       __ eor(a15, a15, tmp2);
 7123       __ eor(a20, a20, tmp2); // d0(tmp2)
 7124       __ eor3(tmp3, a2, a7, a12);
 7125       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7126       __ rax1(tmp3, tmp4, tmp2); // d1
 7127       __ eor(a1, a1, tmp3);
 7128       __ eor(a6, a6, tmp3);
 7129       __ eor(a11, a11, tmp3);
 7130       __ eor(a16, a16, tmp3);
 7131       __ eor(a21, a21, tmp3); // d1(tmp3)
 7132       __ rax1(tmp3, tmp2, tmp0); // d3
 7133       __ eor3(tmp2, a3, a8, a13);
 7134       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7135       __ eor(a3, a3, tmp3);
 7136       __ eor(a8, a8, tmp3);
 7137       __ eor(a13, a13, tmp3);
 7138       __ eor(a18, a18, tmp3);
 7139       __ eor(a23, a23, tmp3);
 7140       __ rax1(tmp2, tmp1, tmp0); // d2
 7141       __ eor(a2, a2, tmp2);
 7142       __ eor(a7, a7, tmp2);
 7143       __ eor(a12, a12, tmp2);
 7144       __ rax1(tmp0, tmp0, tmp4); // d4
 7145       if (!can_use_fp || !can_use_r18) {
 7146         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7147       }
 7148       __ eor(a17, a17, tmp2);
 7149       __ eor(a22, a22, tmp2);
 7150       __ eor(a4, a4, tmp0);
 7151       __ eor(a9, a9, tmp0);
 7152       __ eor(a14, a14, tmp0);
 7153       __ eor(a19, a19, tmp0);
 7154       __ eor(a24, a24, tmp0);
 7155     }
 7156 
 7157     __ rol(tmp0, a10, 3);
 7158     __ rol(a10, a1, 1);
 7159     __ rol(a1, a6, 44);
 7160     __ rol(a6, a9, 20);
 7161     __ rol(a9, a22, 61);
 7162     __ rol(a22, a14, 39);
 7163     __ rol(a14, a20, 18);
 7164     __ rol(a20, a2, 62);
 7165     __ rol(a2, a12, 43);
 7166     __ rol(a12, a13, 25);
 7167     __ rol(a13, a19, 8);
 7168     __ rol(a19, a23, 56);
 7169     __ rol(a23, a15, 41);
 7170     __ rol(a15, a4, 27);
 7171     __ rol(a4, a24, 14);
 7172     __ rol(a24, a21, 2);
 7173     __ rol(a21, a8, 55);
 7174     __ rol(a8, a16, 45);
 7175     __ rol(a16, a5, 36);
 7176     __ rol(a5, a3, 28);
 7177     __ rol(a3, a18, 21);
 7178     __ rol(a18, a17, 15);
 7179     __ rol(a17, a11, 10);
 7180     __ rol(a11, a7, 6);
 7181     __ mov(a7, tmp0);
 7182 
 7183     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7184     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7185     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7186     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7187     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7188 
 7189     __ ldr(tmp1, __ post(rc, 8));
 7190     __ eor(a0, a0, tmp1);
 7191 
 7192   }
 7193 
 7194   // Arguments:
 7195   //
 7196   // Inputs:
 7197   //   c_rarg0   - byte[]  source+offset
 7198   //   c_rarg1   - byte[]  SHA.state
 7199   //   c_rarg2   - int     block_size
 7200   //   c_rarg3   - int     offset
 7201   //   c_rarg4   - int     limit
 7202   //
 7203   address generate_sha3_implCompress_gpr(StubId stub_id) {
 7204     bool multi_block;
 7205     switch (stub_id) {
 7206     case StubId::stubgen_sha3_implCompress_id:
 7207       multi_block = false;
 7208       break;
 7209     case StubId::stubgen_sha3_implCompressMB_id:
 7210       multi_block = true;
 7211       break;
 7212     default:
 7213       ShouldNotReachHere();
 7214     }
 7215 
 7216     static const uint64_t round_consts[24] = {
 7217       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 7218       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 7219       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 7220       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 7221       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 7222       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 7223       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 7224       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 7225     };
 7226 
 7227     __ align(CodeEntryAlignment);
 7228     StubCodeMark mark(this, stub_id);
 7229     address start = __ pc();
 7230 
 7231     Register buf           = c_rarg0;
 7232     Register state         = c_rarg1;
 7233     Register block_size    = c_rarg2;
 7234     Register ofs           = c_rarg3;
 7235     Register limit         = c_rarg4;
 7236 
 7237     // use r3..r17, r19..r28 to keep a0..a24.
 7238     // a0..a24 are respective locals from SHA3.java
 7239     Register a0 = r25,
 7240              a1 = r26,
 7241              a2 = r27,
 7242              a3 = r3,
 7243              a4 = r4,
 7244              a5 = r5,
 7245              a6 = r6,
 7246              a7 = r7,
 7247              a8 = rscratch1, // r8
 7248              a9 = rscratch2, // r9
 7249              a10 = r10,
 7250              a11 = r11,
 7251              a12 = r12,
 7252              a13 = r13,
 7253              a14 = r14,
 7254              a15 = r15,
 7255              a16 = r16,
 7256              a17 = r17,
 7257              a18 = r28,
 7258              a19 = r19,
 7259              a20 = r20,
 7260              a21 = r21,
 7261              a22 = r22,
 7262              a23 = r23,
 7263              a24 = r24;
 7264 
 7265     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7266 
 7267     Label sha3_loop, rounds24_preloop, loop_body;
 7268     Label sha3_512_or_sha3_384, shake128;
 7269 
 7270     bool can_use_r18 = false;
 7271 #ifndef R18_RESERVED
 7272     can_use_r18 = true;
 7273 #endif
 7274     bool can_use_fp = !PreserveFramePointer;
 7275 
 7276     __ enter();
 7277 
 7278     // save almost all yet unsaved gpr registers on stack
 7279     __ str(block_size, __ pre(sp, -128));
 7280     if (multi_block) {
 7281       __ stpw(ofs, limit, Address(sp, 8));
 7282     }
 7283     // 8 bytes at sp+16 will be used to keep buf
 7284     __ stp(r19, r20, Address(sp, 32));
 7285     __ stp(r21, r22, Address(sp, 48));
 7286     __ stp(r23, r24, Address(sp, 64));
 7287     __ stp(r25, r26, Address(sp, 80));
 7288     __ stp(r27, r28, Address(sp, 96));
 7289     if (can_use_r18 && can_use_fp) {
 7290       __ stp(r18_tls, state, Address(sp, 112));
 7291     } else {
 7292       __ str(state, Address(sp, 112));
 7293     }
 7294 
 7295     // begin sha3 calculations: loading a0..a24 from the state array
 7296     __ ldp(a0, a1, state);
 7297     __ ldp(a2, a3, Address(state, 16));
 7298     __ ldp(a4, a5, Address(state, 32));
 7299     __ ldp(a6, a7, Address(state, 48));
 7300     __ ldp(a8, a9, Address(state, 64));
 7301     __ ldp(a10, a11, Address(state, 80));
 7302     __ ldp(a12, a13, Address(state, 96));
 7303     __ ldp(a14, a15, Address(state, 112));
 7304     __ ldp(a16, a17, Address(state, 128));
 7305     __ ldp(a18, a19, Address(state, 144));
 7306     __ ldp(a20, a21, Address(state, 160));
 7307     __ ldp(a22, a23, Address(state, 176));
 7308     __ ldr(a24, Address(state, 192));
 7309 
 7310     __ BIND(sha3_loop);
 7311 
 7312     // load input
 7313     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7314     __ eor(a0, a0, tmp3);
 7315     __ eor(a1, a1, tmp2);
 7316     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7317     __ eor(a2, a2, tmp3);
 7318     __ eor(a3, a3, tmp2);
 7319     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7320     __ eor(a4, a4, tmp3);
 7321     __ eor(a5, a5, tmp2);
 7322     __ ldr(tmp3, __ post(buf, 8));
 7323     __ eor(a6, a6, tmp3);
 7324 
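          // Dispatch on the sponge rate (block_size, in bytes). For reference,
          // the rates used by the JDK SHA3/SHAKE implementations are:
          //   72 -> SHA3-512, 104 -> SHA3-384, 136 -> SHA3-256 / SHAKE256,
          //   144 -> SHA3-224, 168 -> SHAKE128.
          // The branches below distinguish them by testing individual bits of
          // block_size rather than comparing against each value.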
 7325     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7326     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7327 
 7328     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7329     __ eor(a7, a7, tmp3);
 7330     __ eor(a8, a8, tmp2);
 7331     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7332     __ eor(a9, a9, tmp3);
 7333     __ eor(a10, a10, tmp2);
 7334     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7335     __ eor(a11, a11, tmp3);
 7336     __ eor(a12, a12, tmp2);
 7337     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7338     __ eor(a13, a13, tmp3);
 7339     __ eor(a14, a14, tmp2);
 7340     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7341     __ eor(a15, a15, tmp3);
 7342     __ eor(a16, a16, tmp2);
 7343 
 7344     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7345     __ andw(tmp2, block_size, 48);
 7346     __ cbzw(tmp2, rounds24_preloop);
 7347     __ tbnz(block_size, 5, shake128);
 7348     // block_size == 144, bit5 == 0, SHA3-224
 7349     __ ldr(tmp3, __ post(buf, 8));
 7350     __ eor(a17, a17, tmp3);
 7351     __ b(rounds24_preloop);
 7352 
 7353     __ BIND(shake128);
 7354     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7355     __ eor(a17, a17, tmp3);
 7356     __ eor(a18, a18, tmp2);
 7357     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7358     __ eor(a19, a19, tmp3);
 7359     __ eor(a20, a20, tmp2);
 7360     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7361 
 7362     __ BIND(sha3_512_or_sha3_384);
 7363     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7364     __ eor(a7, a7, tmp3);
 7365     __ eor(a8, a8, tmp2);
 7366     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7367 
 7368     // SHA3-384
 7369     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7370     __ eor(a9, a9, tmp3);
 7371     __ eor(a10, a10, tmp2);
 7372     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7373     __ eor(a11, a11, tmp3);
 7374     __ eor(a12, a12, tmp2);
 7375 
 7376     __ BIND(rounds24_preloop);
 7377     __ fmovs(v0, 24.0); // float loop counter,
 7378     __ fmovs(v1, 1.0);  // exact representation
 7379 
 7380     __ str(buf, Address(sp, 16));
 7381     __ lea(tmp3, ExternalAddress((address) round_consts));
 7382 
 7383     __ BIND(loop_body);
 7384     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7385                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7386                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7387                      tmp0, tmp1, tmp2);
 7388     __ fsubs(v0, v0, v1);
 7389     __ fcmps(v0, 0.0);
 7390     __ br(__ NE, loop_body);
 7391 
 7392     if (multi_block) {
 7393       __ ldrw(block_size, sp); // block_size
 7394       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7395       __ addw(tmp2, tmp2, block_size);
 7396       __ cmpw(tmp2, tmp1);
 7397       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7398       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7399       __ br(Assembler::LE, sha3_loop);
 7400       __ movw(c_rarg0, tmp2); // return offset
 7401     }
 7402     if (can_use_fp && can_use_r18) {
 7403       __ ldp(r18_tls, state, Address(sp, 112));
 7404     } else {
 7405       __ ldr(state, Address(sp, 112));
 7406     }
 7407     // save calculated sha3 state
 7408     __ stp(a0, a1, Address(state));
 7409     __ stp(a2, a3, Address(state, 16));
 7410     __ stp(a4, a5, Address(state, 32));
 7411     __ stp(a6, a7, Address(state, 48));
 7412     __ stp(a8, a9, Address(state, 64));
 7413     __ stp(a10, a11, Address(state, 80));
 7414     __ stp(a12, a13, Address(state, 96));
 7415     __ stp(a14, a15, Address(state, 112));
 7416     __ stp(a16, a17, Address(state, 128));
 7417     __ stp(a18, a19, Address(state, 144));
 7418     __ stp(a20, a21, Address(state, 160));
 7419     __ stp(a22, a23, Address(state, 176));
 7420     __ str(a24, Address(state, 192));
 7421 
 7422     // restore required registers from stack
 7423     __ ldp(r19, r20, Address(sp, 32));
 7424     __ ldp(r21, r22, Address(sp, 48));
 7425     __ ldp(r23, r24, Address(sp, 64));
 7426     __ ldp(r25, r26, Address(sp, 80));
 7427     __ ldp(r27, r28, Address(sp, 96));
 7428     if (can_use_fp && can_use_r18) {
 7429       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 7430     } // else no need to recalculate rfp, since it wasn't changed
 7431 
 7432     __ leave();
 7433 
 7434     __ ret(lr);
 7435 
 7436     return start;
 7437   }
 7438 
 7439   /**
 7440    *  Arguments:
 7441    *
 7442    * Inputs:
 7443    *   c_rarg0   - int crc
 7444    *   c_rarg1   - byte* buf
 7445    *   c_rarg2   - int length
 7446    *
 7447    * Output:
 7448    *       r0   - int crc result
 7449    */
 7450   address generate_updateBytesCRC32() {
 7451     assert(UseCRC32Intrinsics, "what are we doing here?");
 7452 
 7453     __ align(CodeEntryAlignment);
 7454     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 7455     StubCodeMark mark(this, stub_id);
 7456 
 7457     address start = __ pc();
 7458 
 7459     const Register crc   = c_rarg0;  // crc
 7460     const Register buf   = c_rarg1;  // source java byte array address
 7461     const Register len   = c_rarg2;  // length
 7462     const Register table0 = c_rarg3; // crc_table address
 7463     const Register table1 = c_rarg4;
 7464     const Register table2 = c_rarg5;
 7465     const Register table3 = c_rarg6;
 7466     const Register tmp3 = c_rarg7;
 7467 
 7468     BLOCK_COMMENT("Entry:");
 7469     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7470 
 7471     __ kernel_crc32(crc, buf, len,
 7472               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7473 
 7474     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7475     __ ret(lr);
 7476 
 7477     return start;
 7478   }
 7479 
 7480   /**
 7481    *  Arguments:
 7482    *
 7483    * Inputs:
 7484    *   c_rarg0   - int crc
 7485    *   c_rarg1   - byte* buf
 7486    *   c_rarg2   - int length
 7487    *   c_rarg3   - int* table
 7488    *
 7489    * Output:
 7490    *       r0   - int crc result
 7491    */
 7492   address generate_updateBytesCRC32C() {
 7493     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7494 
 7495     __ align(CodeEntryAlignment);
 7496     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 7497     StubCodeMark mark(this, stub_id);
 7498 
 7499     address start = __ pc();
 7500 
 7501     const Register crc   = c_rarg0;  // crc
 7502     const Register buf   = c_rarg1;  // source java byte array address
 7503     const Register len   = c_rarg2;  // length
 7504     const Register table0 = c_rarg3; // crc_table address
 7505     const Register table1 = c_rarg4;
 7506     const Register table2 = c_rarg5;
 7507     const Register table3 = c_rarg6;
 7508     const Register tmp3 = c_rarg7;
 7509 
 7510     BLOCK_COMMENT("Entry:");
 7511     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7512 
 7513     __ kernel_crc32c(crc, buf, len,
 7514               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7515 
 7516     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7517     __ ret(lr);
 7518 
 7519     return start;
 7520   }
 7521 
 7522   /**
 7523    *  Arguments:
 7524    *
 7525    *  Inputs:
 7526    *   c_rarg0   - int   adler
 7527    *   c_rarg1   - byte* buff
 7528    *   c_rarg2   - int   len
 7529    *
 7530    * Output:
 7531    *   c_rarg0   - int adler result
 7532    */
 7533   address generate_updateBytesAdler32() {
 7534     __ align(CodeEntryAlignment);
 7535     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 7536     StubCodeMark mark(this, stub_id);
 7537     address start = __ pc();
 7538 
 7539     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7540 
 7541     // Aliases
 7542     Register adler  = c_rarg0;
 7543     Register s1     = c_rarg0;
 7544     Register s2     = c_rarg3;
 7545     Register buff   = c_rarg1;
 7546     Register len    = c_rarg2;
 7547     Register nmax  = r4;
 7548     Register base  = r5;
 7549     Register count = r6;
 7550     Register temp0 = rscratch1;
 7551     Register temp1 = rscratch2;
 7552     FloatRegister vbytes = v0;
 7553     FloatRegister vs1acc = v1;
 7554     FloatRegister vs2acc = v2;
 7555     FloatRegister vtable = v3;
 7556 
 7557     // Max number of bytes we can process before having to take the mod
 7558     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7559     uint64_t BASE = 0xfff1;
 7560     uint64_t NMAX = 0x15B0;
 7561 
 7562     __ mov(base, BASE);
 7563     __ mov(nmax, NMAX);
 7564 
 7565     // Load accumulation coefficients for the upper 16 bits
 7566     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7567     __ ld1(vtable, __ T16B, Address(temp0));
 7568 
 7569     // s1 is initialized to the lower 16 bits of adler
 7570     // s2 is initialized to the upper 16 bits of adler
 7571     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7572     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7573 
 7574     // The pipelined loop needs at least 16 elements for 1 iteration
 7575     // It does check this, but it is more effective to skip to the cleanup loop
 7576     __ cmp(len, (u1)16);
 7577     __ br(Assembler::HS, L_nmax);
 7578     __ cbz(len, L_combine);
 7579 
 7580     __ bind(L_simple_by1_loop);
 7581     __ ldrb(temp0, Address(__ post(buff, 1)));
 7582     __ add(s1, s1, temp0);
 7583     __ add(s2, s2, s1);
 7584     __ subs(len, len, 1);
 7585     __ br(Assembler::HI, L_simple_by1_loop);
 7586 
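          // The modular reductions below rely on the identity 2^16 == 15 (mod BASE),
          // so x mod BASE can be computed from (x >> 16) * 15 + (x & 0xffff), with
          // the multiply by 15 done as ((x >> 16) << 4) - (x >> 16). Applying this
          // once (or twice, for the larger sums accumulated in the main loops)
          // brings the value below 2 * BASE, after which a single conditional
          // subtraction of BASE completes the reduction. For small sums such as s1
          // here, the conditional subtraction alone suffices.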
 7587     // s1 = s1 % BASE
 7588     __ subs(temp0, s1, base);
 7589     __ csel(s1, temp0, s1, Assembler::HS);
 7590 
 7591     // s2 = s2 % BASE
 7592     __ lsr(temp0, s2, 16);
 7593     __ lsl(temp1, temp0, 4);
 7594     __ sub(temp1, temp1, temp0);
 7595     __ add(s2, temp1, s2, ext::uxth);
 7596 
 7597     __ subs(temp0, s2, base);
 7598     __ csel(s2, temp0, s2, Assembler::HS);
 7599 
 7600     __ b(L_combine);
 7601 
 7602     __ bind(L_nmax);
 7603     __ subs(len, len, nmax);
 7604     __ sub(count, nmax, 16);
 7605     __ br(Assembler::LO, L_by16);
 7606 
 7607     __ bind(L_nmax_loop);
 7608 
 7609     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7610                                       vbytes, vs1acc, vs2acc, vtable);
 7611 
 7612     __ subs(count, count, 16);
 7613     __ br(Assembler::HS, L_nmax_loop);
 7614 
 7615     // s1 = s1 % BASE
 7616     __ lsr(temp0, s1, 16);
 7617     __ lsl(temp1, temp0, 4);
 7618     __ sub(temp1, temp1, temp0);
 7619     __ add(temp1, temp1, s1, ext::uxth);
 7620 
 7621     __ lsr(temp0, temp1, 16);
 7622     __ lsl(s1, temp0, 4);
 7623     __ sub(s1, s1, temp0);
 7624     __ add(s1, s1, temp1, ext::uxth);
 7625 
 7626     __ subs(temp0, s1, base);
 7627     __ csel(s1, temp0, s1, Assembler::HS);
 7628 
 7629     // s2 = s2 % BASE
 7630     __ lsr(temp0, s2, 16);
 7631     __ lsl(temp1, temp0, 4);
 7632     __ sub(temp1, temp1, temp0);
 7633     __ add(temp1, temp1, s2, ext::uxth);
 7634 
 7635     __ lsr(temp0, temp1, 16);
 7636     __ lsl(s2, temp0, 4);
 7637     __ sub(s2, s2, temp0);
 7638     __ add(s2, s2, temp1, ext::uxth);
 7639 
 7640     __ subs(temp0, s2, base);
 7641     __ csel(s2, temp0, s2, Assembler::HS);
 7642 
 7643     __ subs(len, len, nmax);
 7644     __ sub(count, nmax, 16);
 7645     __ br(Assembler::HS, L_nmax_loop);
 7646 
 7647     __ bind(L_by16);
 7648     __ adds(len, len, count);
 7649     __ br(Assembler::LO, L_by1);
 7650 
 7651     __ bind(L_by16_loop);
 7652 
 7653     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7654                                       vbytes, vs1acc, vs2acc, vtable);
 7655 
 7656     __ subs(len, len, 16);
 7657     __ br(Assembler::HS, L_by16_loop);
 7658 
 7659     __ bind(L_by1);
 7660     __ adds(len, len, 15);
 7661     __ br(Assembler::LO, L_do_mod);
 7662 
 7663     __ bind(L_by1_loop);
 7664     __ ldrb(temp0, Address(__ post(buff, 1)));
 7665     __ add(s1, temp0, s1);
 7666     __ add(s2, s2, s1);
 7667     __ subs(len, len, 1);
 7668     __ br(Assembler::HS, L_by1_loop);
 7669 
 7670     __ bind(L_do_mod);
 7671     // s1 = s1 % BASE
 7672     __ lsr(temp0, s1, 16);
 7673     __ lsl(temp1, temp0, 4);
 7674     __ sub(temp1, temp1, temp0);
 7675     __ add(temp1, temp1, s1, ext::uxth);
 7676 
 7677     __ lsr(temp0, temp1, 16);
 7678     __ lsl(s1, temp0, 4);
 7679     __ sub(s1, s1, temp0);
 7680     __ add(s1, s1, temp1, ext::uxth);
 7681 
 7682     __ subs(temp0, s1, base);
 7683     __ csel(s1, temp0, s1, Assembler::HS);
 7684 
 7685     // s2 = s2 % BASE
 7686     __ lsr(temp0, s2, 16);
 7687     __ lsl(temp1, temp0, 4);
 7688     __ sub(temp1, temp1, temp0);
 7689     __ add(temp1, temp1, s2, ext::uxth);
 7690 
 7691     __ lsr(temp0, temp1, 16);
 7692     __ lsl(s2, temp0, 4);
 7693     __ sub(s2, s2, temp0);
 7694     __ add(s2, s2, temp1, ext::uxth);
 7695 
 7696     __ subs(temp0, s2, base);
 7697     __ csel(s2, temp0, s2, Assembler::HS);
 7698 
 7699     // Combine lower bits and higher bits
 7700     __ bind(L_combine);
 7701     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7702 
 7703     __ ret(lr);
 7704 
 7705     return start;
 7706   }
 7707 
 7708   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7709           Register temp0, Register temp1, FloatRegister vbytes,
 7710           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7711     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7712     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7713     // In non-vectorized code, we update s1 and s2 as:
 7714     //   s1 <- s1 + b1
 7715     //   s2 <- s2 + s1
 7716     //   s1 <- s1 + b2
 7717     //   s2 <- s2 + s1
 7718     //   ...
 7719     //   s1 <- s1 + b16
 7720     //   s2 <- s2 + s1
 7721     // Putting above assignments together, we have:
 7722     //   s1_new = s1 + b1 + b2 + ... + b16
 7723     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7724     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7725     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 7726     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7727 
 7728     // s2 = s2 + s1 * 16
 7729     __ add(s2, s2, s1, Assembler::LSL, 4);
 7730 
 7731     // vs1acc = b1 + b2 + b3 + ... + b16
 7732     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7733     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7734     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7735     __ uaddlv(vs1acc, __ T16B, vbytes);
 7736     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7737 
 7738     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7739     __ fmovd(temp0, vs1acc);
 7740     __ fmovd(temp1, vs2acc);
 7741     __ add(s1, s1, temp0);
 7742     __ add(s2, s2, temp1);
 7743   }
 7744 
 7745   /**
 7746    *  Arguments:
 7747    *
 7748    *  Input:
 7749    *    c_rarg0   - x address
 7750    *    c_rarg1   - x length
 7751    *    c_rarg2   - y address
 7752    *    c_rarg3   - y length
 7753    *    c_rarg4   - z address
 7754    */
 7755   address generate_multiplyToLen() {
 7756     __ align(CodeEntryAlignment);
 7757     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 7758     StubCodeMark mark(this, stub_id);
 7759 
 7760     address start = __ pc();
 7761     const Register x     = r0;
 7762     const Register xlen  = r1;
 7763     const Register y     = r2;
 7764     const Register ylen  = r3;
 7765     const Register z     = r4;
 7766 
 7767     const Register tmp0  = r5;
 7768     const Register tmp1  = r10;
 7769     const Register tmp2  = r11;
 7770     const Register tmp3  = r12;
 7771     const Register tmp4  = r13;
 7772     const Register tmp5  = r14;
 7773     const Register tmp6  = r15;
 7774     const Register tmp7  = r16;
 7775 
 7776     BLOCK_COMMENT("Entry:");
 7777     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7778     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7779     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7780     __ ret(lr);
 7781 
 7782     return start;
 7783   }
 7784 
 7785   address generate_squareToLen() {
 7786     // The squareToLen algorithm for sizes 1..127 described in the Java code
 7787     // works faster than multiply_to_len on some CPUs and slower on others,
 7788     // but multiply_to_len shows slightly better overall results.
 7789     __ align(CodeEntryAlignment);
 7790     StubId stub_id = StubId::stubgen_squareToLen_id;
 7791     StubCodeMark mark(this, stub_id);
 7792     address start = __ pc();
 7793 
 7794     const Register x     = r0;
 7795     const Register xlen  = r1;
 7796     const Register z     = r2;
 7797     const Register y     = r4; // == x
 7798     const Register ylen  = r5; // == xlen
 7799 
 7800     const Register tmp0  = r3;
 7801     const Register tmp1  = r10;
 7802     const Register tmp2  = r11;
 7803     const Register tmp3  = r12;
 7804     const Register tmp4  = r13;
 7805     const Register tmp5  = r14;
 7806     const Register tmp6  = r15;
 7807     const Register tmp7  = r16;
 7808 
 7809     RegSet spilled_regs = RegSet::of(y, ylen);
 7810     BLOCK_COMMENT("Entry:");
 7811     __ enter();
 7812     __ push(spilled_regs, sp);
 7813     __ mov(y, x);
 7814     __ mov(ylen, xlen);
 7815     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7816     __ pop(spilled_regs, sp);
 7817     __ leave();
 7818     __ ret(lr);
 7819     return start;
 7820   }
 7821 
 7822   address generate_mulAdd() {
 7823     __ align(CodeEntryAlignment);
 7824     StubId stub_id = StubId::stubgen_mulAdd_id;
 7825     StubCodeMark mark(this, stub_id);
 7826 
 7827     address start = __ pc();
 7828 
 7829     const Register out     = r0;
 7830     const Register in      = r1;
 7831     const Register offset  = r2;
 7832     const Register len     = r3;
 7833     const Register k       = r4;
 7834 
 7835     BLOCK_COMMENT("Entry:");
 7836     __ enter();
 7837     __ mul_add(out, in, offset, len, k);
 7838     __ leave();
 7839     __ ret(lr);
 7840 
 7841     return start;
 7842   }
 7843 
 7844   // Arguments:
 7845   //
 7846   // Input:
 7847   //   c_rarg0   - newArr address
 7848   //   c_rarg1   - oldArr address
 7849   //   c_rarg2   - newIdx
 7850   //   c_rarg3   - shiftCount
 7851   //   c_rarg4   - numIter
 7852   //
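        //
        // Scalar equivalent of what the stub computes (a sketch derived from the
        // code below; parameter names follow the inputs listed above):
        //   for (int i = 0; i < numIter; i++) {
        //     newArr[newIdx + i] = (oldArr[i] << (32 - shiftCount))
        //                        | (oldArr[i + 1] >>> shiftCount);
        //   }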
 7853   address generate_bigIntegerRightShift() {
 7854     __ align(CodeEntryAlignment);
 7855     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 7856     StubCodeMark mark(this, stub_id);
 7857     address start = __ pc();
 7858 
 7859     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7860 
 7861     Register newArr        = c_rarg0;
 7862     Register oldArr        = c_rarg1;
 7863     Register newIdx        = c_rarg2;
 7864     Register shiftCount    = c_rarg3;
 7865     Register numIter       = c_rarg4;
 7866     Register idx           = numIter;
 7867 
 7868     Register newArrCur     = rscratch1;
 7869     Register shiftRevCount = rscratch2;
 7870     Register oldArrCur     = r13;
 7871     Register oldArrNext    = r14;
 7872 
 7873     FloatRegister oldElem0        = v0;
 7874     FloatRegister oldElem1        = v1;
 7875     FloatRegister newElem         = v2;
 7876     FloatRegister shiftVCount     = v3;
 7877     FloatRegister shiftVRevCount  = v4;
 7878 
 7879     __ cbz(idx, Exit);
 7880 
 7881     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7882 
 7883     // left shift count
 7884     __ movw(shiftRevCount, 32);
 7885     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7886 
 7887     // numIter too small to allow a 4-word SIMD loop, fall back to scalar code
 7888     __ cmp(numIter, (u1)4);
 7889     __ br(Assembler::LT, ShiftThree);
 7890 
 7891     __ dup(shiftVCount,    __ T4S, shiftCount);
 7892     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7893     __ negr(shiftVCount,   __ T4S, shiftVCount);
 7894 
 7895     __ BIND(ShiftSIMDLoop);
 7896 
 7897     // Calculate the load addresses
 7898     __ sub(idx, idx, 4);
 7899     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7900     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7901     __ add(oldArrCur,  oldArrNext, 4);
 7902 
 7903     // Load 4 words and process
 7904     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7905     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7906     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7907     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7908     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7909     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7910 
 7911     __ cmp(idx, (u1)4);
 7912     __ br(Assembler::LT, ShiftTwoLoop);
 7913     __ b(ShiftSIMDLoop);
 7914 
 7915     __ BIND(ShiftTwoLoop);
 7916     __ cbz(idx, Exit);
 7917     __ cmp(idx, (u1)1);
 7918     __ br(Assembler::EQ, ShiftOne);
 7919 
 7920     // Calculate the load addresses
 7921     __ sub(idx, idx, 2);
 7922     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7923     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7924     __ add(oldArrCur,  oldArrNext, 4);
 7925 
 7926     // Load 2 words and process
 7927     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 7928     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 7929     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 7930     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 7931     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 7932     __ st1(newElem,   __ T2S, Address(newArrCur));
 7933     __ b(ShiftTwoLoop);
 7934 
 7935     __ BIND(ShiftThree);
 7936     __ tbz(idx, 1, ShiftOne);
 7937     __ tbz(idx, 0, ShiftTwo);
 7938     __ ldrw(r10,  Address(oldArr, 12));
 7939     __ ldrw(r11,  Address(oldArr, 8));
 7940     __ lsrvw(r10, r10, shiftCount);
 7941     __ lslvw(r11, r11, shiftRevCount);
 7942     __ orrw(r12,  r10, r11);
 7943     __ strw(r12,  Address(newArr, 8));
 7944 
 7945     __ BIND(ShiftTwo);
 7946     __ ldrw(r10,  Address(oldArr, 8));
 7947     __ ldrw(r11,  Address(oldArr, 4));
 7948     __ lsrvw(r10, r10, shiftCount);
 7949     __ lslvw(r11, r11, shiftRevCount);
 7950     __ orrw(r12,  r10, r11);
 7951     __ strw(r12,  Address(newArr, 4));
 7952 
 7953     __ BIND(ShiftOne);
 7954     __ ldrw(r10,  Address(oldArr, 4));
 7955     __ ldrw(r11,  Address(oldArr));
 7956     __ lsrvw(r10, r10, shiftCount);
 7957     __ lslvw(r11, r11, shiftRevCount);
 7958     __ orrw(r12,  r10, r11);
 7959     __ strw(r12,  Address(newArr));
 7960 
 7961     __ BIND(Exit);
 7962     __ ret(lr);
 7963 
 7964     return start;
 7965   }
 7966 
 7967   // Arguments:
 7968   //
 7969   // Input:
 7970   //   c_rarg0   - newArr address
 7971   //   c_rarg1   - oldArr address
 7972   //   c_rarg2   - newIdx
 7973   //   c_rarg3   - shiftCount
 7974   //   c_rarg4   - numIter
 7975   //
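        //
        // Scalar equivalent of what the stub computes (a sketch derived from the
        // code below; parameter names follow the inputs listed above):
        //   for (int i = 0; i < numIter; i++) {
        //     newArr[newIdx + i] = (oldArr[i] << shiftCount)
        //                        | (oldArr[i + 1] >>> (32 - shiftCount));
        //   }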
 7976   address generate_bigIntegerLeftShift() {
 7977     __ align(CodeEntryAlignment);
 7978     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 7979     StubCodeMark mark(this, stub_id);
 7980     address start = __ pc();
 7981 
 7982     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7983 
 7984     Register newArr        = c_rarg0;
 7985     Register oldArr        = c_rarg1;
 7986     Register newIdx        = c_rarg2;
 7987     Register shiftCount    = c_rarg3;
 7988     Register numIter       = c_rarg4;
 7989 
 7990     Register shiftRevCount = rscratch1;
 7991     Register oldArrNext    = rscratch2;
 7992 
 7993     FloatRegister oldElem0        = v0;
 7994     FloatRegister oldElem1        = v1;
 7995     FloatRegister newElem         = v2;
 7996     FloatRegister shiftVCount     = v3;
 7997     FloatRegister shiftVRevCount  = v4;
 7998 
 7999     __ cbz(numIter, Exit);
 8000 
 8001     __ add(oldArrNext, oldArr, 4);
 8002     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8003 
 8004     // right shift count
 8005     __ movw(shiftRevCount, 32);
 8006     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8007 
 8008     // numIter too small to allow a 4-word SIMD loop, fall back to scalar code
 8009     __ cmp(numIter, (u1)4);
 8010     __ br(Assembler::LT, ShiftThree);
 8011 
 8012     __ dup(shiftVCount,     __ T4S, shiftCount);
 8013     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8014     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8015 
 8016     __ BIND(ShiftSIMDLoop);
 8017 
 8018     // load 4 words and process
 8019     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8020     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8021     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8022     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8023     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8024     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8025     __ sub(numIter,   numIter, 4);
 8026 
 8027     __ cmp(numIter, (u1)4);
 8028     __ br(Assembler::LT, ShiftTwoLoop);
 8029     __ b(ShiftSIMDLoop);
 8030 
 8031     __ BIND(ShiftTwoLoop);
 8032     __ cbz(numIter, Exit);
 8033     __ cmp(numIter, (u1)1);
 8034     __ br(Assembler::EQ, ShiftOne);
 8035 
 8036     // load 2 words and process
 8037     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8038     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8039     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8040     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8041     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8042     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8043     __ sub(numIter,   numIter, 2);
 8044     __ b(ShiftTwoLoop);
 8045 
 8046     __ BIND(ShiftThree);
 8047     __ ldrw(r10,  __ post(oldArr, 4));
 8048     __ ldrw(r11,  __ post(oldArrNext, 4));
 8049     __ lslvw(r10, r10, shiftCount);
 8050     __ lsrvw(r11, r11, shiftRevCount);
 8051     __ orrw(r12,  r10, r11);
 8052     __ strw(r12,  __ post(newArr, 4));
 8053     __ tbz(numIter, 1, Exit);
 8054     __ tbz(numIter, 0, ShiftOne);
 8055 
 8056     __ BIND(ShiftTwo);
 8057     __ ldrw(r10,  __ post(oldArr, 4));
 8058     __ ldrw(r11,  __ post(oldArrNext, 4));
 8059     __ lslvw(r10, r10, shiftCount);
 8060     __ lsrvw(r11, r11, shiftRevCount);
 8061     __ orrw(r12,  r10, r11);
 8062     __ strw(r12,  __ post(newArr, 4));
 8063 
 8064     __ BIND(ShiftOne);
 8065     __ ldrw(r10,  Address(oldArr));
 8066     __ ldrw(r11,  Address(oldArrNext));
 8067     __ lslvw(r10, r10, shiftCount);
 8068     __ lsrvw(r11, r11, shiftRevCount);
 8069     __ orrw(r12,  r10, r11);
 8070     __ strw(r12,  Address(newArr));
 8071 
 8072     __ BIND(Exit);
 8073     __ ret(lr);
 8074 
 8075     return start;
 8076   }
 8077 
 8078   address generate_count_positives(address &count_positives_long) {
 8079     const u1 large_loop_size = 64;
 8080     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 8081     int dcache_line = VM_Version::dcache_line_size();
 8082 
 8083     Register ary1 = r1, len = r2, result = r0;
 8084 
 8085     __ align(CodeEntryAlignment);
 8086 
 8087     StubId stub_id = StubId::stubgen_count_positives_id;
 8088     StubCodeMark mark(this, stub_id);
 8089 
 8090     address entry = __ pc();
 8091 
 8092     __ enter();
 8093     // precondition: a copy of len is already in result
 8094     // __ mov(result, len);
 8095 
 8096   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8097         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8098 
 8099   __ cmp(len, (u1)15);
 8100   __ br(Assembler::GT, LEN_OVER_15);
 8101   // The only case in which execution falls into this code is when the pointer
 8102   // is near the end of a memory page and we have to avoid reading the next page
 8103   __ add(ary1, ary1, len);
 8104   __ subs(len, len, 8);
 8105   __ br(Assembler::GT, LEN_OVER_8);
 8106   __ ldr(rscratch2, Address(ary1, -8));
 8107   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8108   __ lsrv(rscratch2, rscratch2, rscratch1);
 8109   __ tst(rscratch2, UPPER_BIT_MASK);
 8110   __ csel(result, zr, result, Assembler::NE);
 8111   __ leave();
 8112   __ ret(lr);
 8113   __ bind(LEN_OVER_8);
 8114   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 8115   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load completes
 8116   __ tst(rscratch2, UPPER_BIT_MASK);
 8117   __ br(Assembler::NE, RET_NO_POP);
 8118   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8119   __ lsrv(rscratch1, rscratch1, rscratch2);
 8120   __ tst(rscratch1, UPPER_BIT_MASK);
 8121   __ bind(RET_NO_POP);
 8122   __ csel(result, zr, result, Assembler::NE);
 8123   __ leave();
 8124   __ ret(lr);
 8125 
 8126   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8127   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8128 
 8129   count_positives_long = __ pc(); // 2nd entry point
 8130 
 8131   __ enter();
 8132 
 8133   __ bind(LEN_OVER_15);
 8134     __ push(spilled_regs, sp);
 8135     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8136     __ cbz(rscratch2, ALIGNED);
 8137     __ ldp(tmp6, tmp1, Address(ary1));
 8138     __ mov(tmp5, 16);
 8139     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 8140     __ add(ary1, ary1, rscratch1);
 8141     __ orr(tmp6, tmp6, tmp1);
 8142     __ tst(tmp6, UPPER_BIT_MASK);
 8143     __ br(Assembler::NE, RET_ADJUST);
 8144     __ sub(len, len, rscratch1);
 8145 
 8146   __ bind(ALIGNED);
 8147     __ cmp(len, large_loop_size);
 8148     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop to handle the
    // situation where an initially aligned large array has negative values in
    // its starting bytes, in which case LARGE_LOOP would do 4 reads instead of
    // 1 (in the worst case), which is slower. Cases with negative bytes
    // further ahead won't be affected much. In fact, it'll be faster due to
    // early loads, fewer instructions and fewer branches in LARGE_LOOP.
 8155     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8156     __ sub(len, len, 16);
 8157     __ orr(tmp6, tmp6, tmp1);
 8158     __ tst(tmp6, UPPER_BIT_MASK);
 8159     __ br(Assembler::NE, RET_ADJUST_16);
 8160     __ cmp(len, large_loop_size);
 8161     __ br(Assembler::LT, CHECK_16);
 8162 
 8163     if (SoftwarePrefetchHintDistance >= 0
 8164         && SoftwarePrefetchHintDistance >= dcache_line) {
 8165       // initial prefetch
 8166       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8167     }
 8168   __ bind(LARGE_LOOP);
 8169     if (SoftwarePrefetchHintDistance >= 0) {
 8170       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8171     }
    // Issue the load instructions first, since that can save a few CPU/MEM
    // cycles. Also, instead of 4 triples of "orr(...); addr(...); cbnz(...);"
    // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) +
    // 1 cbnz(...), which saves 3 instructions per cycle and has fewer
    // branches; the downside is that this approach disables the early return,
    // so all 64 bytes are loaded and checked every time.
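    // Conceptually, the 64-byte check below is (illustrative only):
    //   ((w0 | w1 | ... | w7) & 0x8080808080808080) != 0
    // i.e. a sign bit survives the OR-reduction iff some byte in the 64-byte
    // block is negative.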
 8177     __ ldp(tmp2, tmp3, Address(ary1));
 8178     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8179     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8180     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8181     __ add(ary1, ary1, large_loop_size);
 8182     __ sub(len, len, large_loop_size);
 8183     __ orr(tmp2, tmp2, tmp3);
 8184     __ orr(tmp4, tmp4, tmp5);
 8185     __ orr(rscratch1, rscratch1, rscratch2);
 8186     __ orr(tmp6, tmp6, tmp1);
 8187     __ orr(tmp2, tmp2, tmp4);
 8188     __ orr(rscratch1, rscratch1, tmp6);
 8189     __ orr(tmp2, tmp2, rscratch1);
 8190     __ tst(tmp2, UPPER_BIT_MASK);
 8191     __ br(Assembler::NE, RET_ADJUST_LONG);
 8192     __ cmp(len, large_loop_size);
 8193     __ br(Assembler::GE, LARGE_LOOP);
 8194 
 8195   __ bind(CHECK_16); // small 16-byte load pre-loop
 8196     __ cmp(len, (u1)16);
 8197     __ br(Assembler::LT, POST_LOOP16);
 8198 
 8199   __ bind(LOOP16); // small 16-byte load loop
 8200     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8201     __ sub(len, len, 16);
 8202     __ orr(tmp2, tmp2, tmp3);
 8203     __ tst(tmp2, UPPER_BIT_MASK);
 8204     __ br(Assembler::NE, RET_ADJUST_16);
 8205     __ cmp(len, (u1)16);
 8206     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8207 
 8208   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8209     __ cmp(len, (u1)8);
 8210     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8211     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8212     __ tst(tmp3, UPPER_BIT_MASK);
 8213     __ br(Assembler::NE, RET_ADJUST);
 8214     __ sub(len, len, 8);
 8215 
 8216   __ bind(POST_LOOP16_LOAD_TAIL);
 8217     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8218     __ ldr(tmp1, Address(ary1));
 8219     __ mov(tmp2, 64);
 8220     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8221     __ lslv(tmp1, tmp1, tmp4);
 8222     __ tst(tmp1, UPPER_BIT_MASK);
 8223     __ br(Assembler::NE, RET_ADJUST);
 8224     // Fallthrough
 8225 
 8226   __ bind(RET_LEN);
 8227     __ pop(spilled_regs, sp);
 8228     __ leave();
 8229     __ ret(lr);
 8230 
    // The difference result - len is the count of bytes that are guaranteed
    // to be positive.
 8233 
 8234   __ bind(RET_ADJUST_LONG);
 8235     __ add(len, len, (u1)(large_loop_size - 16));
 8236   __ bind(RET_ADJUST_16);
 8237     __ add(len, len, 16);
 8238   __ bind(RET_ADJUST);
 8239     __ pop(spilled_regs, sp);
 8240     __ leave();
 8241     __ sub(result, result, len);
 8242     __ ret(lr);
 8243 
 8244     return entry;
 8245   }
 8246 
 8247   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8248         bool usePrefetch, Label &NOT_EQUAL) {
 8249     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8250         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8251         tmp7 = r12, tmp8 = r13;
 8252     Label LOOP;
 8253 
 8254     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8255     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8256     __ bind(LOOP);
 8257     if (usePrefetch) {
 8258       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8259       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8260     }
 8261     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8262     __ eor(tmp1, tmp1, tmp2);
 8263     __ eor(tmp3, tmp3, tmp4);
 8264     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8265     __ orr(tmp1, tmp1, tmp3);
 8266     __ cbnz(tmp1, NOT_EQUAL);
 8267     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8268     __ eor(tmp5, tmp5, tmp6);
 8269     __ eor(tmp7, tmp7, tmp8);
 8270     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8271     __ orr(tmp5, tmp5, tmp7);
 8272     __ cbnz(tmp5, NOT_EQUAL);
 8273     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8274     __ eor(tmp1, tmp1, tmp2);
 8275     __ eor(tmp3, tmp3, tmp4);
 8276     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8277     __ orr(tmp1, tmp1, tmp3);
 8278     __ cbnz(tmp1, NOT_EQUAL);
 8279     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8280     __ eor(tmp5, tmp5, tmp6);
 8281     __ sub(cnt1, cnt1, 8 * wordSize);
 8282     __ eor(tmp7, tmp7, tmp8);
 8283     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than cmp)
    // because subs allows an unlimited range of immediate operands.
 8286     __ subs(tmp6, cnt1, loopThreshold);
 8287     __ orr(tmp5, tmp5, tmp7);
 8288     __ cbnz(tmp5, NOT_EQUAL);
 8289     __ br(__ GE, LOOP);
 8290     // post-loop
 8291     __ eor(tmp1, tmp1, tmp2);
 8292     __ eor(tmp3, tmp3, tmp4);
 8293     __ orr(tmp1, tmp1, tmp3);
 8294     __ sub(cnt1, cnt1, 2 * wordSize);
 8295     __ cbnz(tmp1, NOT_EQUAL);
 8296   }
 8297 
 8298   void generate_large_array_equals_loop_simd(int loopThreshold,
 8299         bool usePrefetch, Label &NOT_EQUAL) {
 8300     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8301         tmp2 = rscratch2;
 8302     Label LOOP;
 8303 
 8304     __ bind(LOOP);
 8305     if (usePrefetch) {
 8306       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8307       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8308     }
 8309     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8310     __ sub(cnt1, cnt1, 8 * wordSize);
 8311     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 8312     __ subs(tmp1, cnt1, loopThreshold);
 8313     __ eor(v0, __ T16B, v0, v4);
 8314     __ eor(v1, __ T16B, v1, v5);
 8315     __ eor(v2, __ T16B, v2, v6);
 8316     __ eor(v3, __ T16B, v3, v7);
 8317     __ orr(v0, __ T16B, v0, v1);
 8318     __ orr(v1, __ T16B, v2, v3);
 8319     __ orr(v0, __ T16B, v0, v1);
 8320     __ umov(tmp1, v0, __ D, 0);
 8321     __ umov(tmp2, v0, __ D, 1);
 8322     __ orr(tmp1, tmp1, tmp2);
 8323     __ cbnz(tmp1, NOT_EQUAL);
 8324     __ br(__ GE, LOOP);
 8325   }
 8326 
 8327   // a1 = r1 - array1 address
 8328   // a2 = r2 - array2 address
 8329   // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - number of elements left to check, reduced by wordSize
 8331   // r3-r5 are reserved temporary registers
 8332   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
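  //
  // Informally (an illustrative description, under the register contract
  // above): the stub decides whether the two remaining byte ranges are
  // identical, in the spirit of memcmp(a1, a2, n) == 0, using 16..64-byte
  // wide loads, an optional 16-byte alignment fix-up, and an OR-of-XORs
  // reduction so that a single conditional branch detects any mismatching
  // word.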
 8333   address generate_large_array_equals() {
 8334     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8335         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8336         tmp7 = r12, tmp8 = r13;
 8337     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 8338         SMALL_LOOP, POST_LOOP;
 8339     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // use the prefetching loop only if at least 32 of the prefetched bytes will be used
 8341     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 8342     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 8343     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 8344     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 8345         tmp5, tmp6, tmp7, tmp8);
 8346 
 8347     __ align(CodeEntryAlignment);
 8348 
 8349     StubId stub_id = StubId::stubgen_large_array_equals_id;
 8350     StubCodeMark mark(this, stub_id);
 8351 
 8352     address entry = __ pc();
 8353     __ enter();
 8354     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 8355     // also advance pointers to use post-increment instead of pre-increment
 8356     __ add(a1, a1, wordSize);
 8357     __ add(a2, a2, wordSize);
 8358     if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD/non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
      // load, if needed, at least for the 1st address to make it 16-byte
      // aligned.
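      // E.g. if a1 is 8-byte but not 16-byte aligned (bit 3 set), one extra
      // 8-byte compare advances both pointers so that the wide loads in the
      // main loops start at a 16-byte boundary of a1.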
 8364       Label ALIGNED16;
 8365       __ tbz(a1, 3, ALIGNED16);
 8366       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8367       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8368       __ sub(cnt1, cnt1, wordSize);
 8369       __ eor(tmp1, tmp1, tmp2);
 8370       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 8371       __ bind(ALIGNED16);
 8372     }
 8373     if (UseSIMDForArrayEquals) {
 8374       if (SoftwarePrefetchHintDistance >= 0) {
 8375         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8376         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8377         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 8378             /* prfm = */ true, NOT_EQUAL);
 8379         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8380         __ br(__ LT, TAIL);
 8381       }
 8382       __ bind(NO_PREFETCH_LARGE_LOOP);
 8383       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 8384           /* prfm = */ false, NOT_EQUAL);
 8385     } else {
 8386       __ push(spilled_regs, sp);
 8387       if (SoftwarePrefetchHintDistance >= 0) {
 8388         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8389         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8390         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 8391             /* prfm = */ true, NOT_EQUAL);
 8392         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8393         __ br(__ LT, TAIL);
 8394       }
 8395       __ bind(NO_PREFETCH_LARGE_LOOP);
 8396       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 8397           /* prfm = */ false, NOT_EQUAL);
 8398     }
 8399     __ bind(TAIL);
 8400       __ cbz(cnt1, EQUAL);
 8401       __ subs(cnt1, cnt1, wordSize);
 8402       __ br(__ LE, POST_LOOP);
 8403     __ bind(SMALL_LOOP);
 8404       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8405       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8406       __ subs(cnt1, cnt1, wordSize);
 8407       __ eor(tmp1, tmp1, tmp2);
 8408       __ cbnz(tmp1, NOT_EQUAL);
 8409       __ br(__ GT, SMALL_LOOP);
 8410     __ bind(POST_LOOP);
 8411       __ ldr(tmp1, Address(a1, cnt1));
 8412       __ ldr(tmp2, Address(a2, cnt1));
 8413       __ eor(tmp1, tmp1, tmp2);
 8414       __ cbnz(tmp1, NOT_EQUAL);
 8415     __ bind(EQUAL);
 8416       __ mov(result, true);
 8417     __ bind(NOT_EQUAL);
 8418       if (!UseSIMDForArrayEquals) {
 8419         __ pop(spilled_regs, sp);
 8420       }
 8421     __ bind(NOT_EQUAL_NO_POP);
 8422     __ leave();
 8423     __ ret(lr);
 8424     return entry;
 8425   }
 8426 
 8427   // result = r0 - return value. Contains initial hashcode value on entry.
 8428   // ary = r1 - array address
 8429   // cnt = r2 - elements count
 8430   // Clobbers: v0-v13, rscratch1, rscratch2
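  //
  // The scalar computation being vectorized is the standard Java array hash
  // (an illustrative model only, not generated code):
  //
  //   int h = result;                      // initial hashcode passed in r0
  //   for (int i = 0; i < cnt; i++) {
  //     h = 31 * h + a[i];
  //   }
  //
  // Processing vf elements per step relies on the identity
  //   31^vf * h + 31^(vf-1)*a[i] + ... + 31^0*a[i+vf-1]
  // so each loop iteration multiplies the accumulators by a precomputed power
  // of 31 and adds the freshly loaded elements, and the epilogues fold the
  // four SIMD lanes using the <31^3, 31^2, 31^1, 31^0> weights kept in vpow.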
 8431   address generate_large_arrays_hashcode(BasicType eltype) {
 8432     const Register result = r0, ary = r1, cnt = r2;
 8433     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 8434     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 8435     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 8436     const FloatRegister vpowm = v13;
 8437 
 8438     ARRAYS_HASHCODE_REGISTERS;
 8439 
 8440     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 8441 
 8442     unsigned int vf; // vectorization factor
 8443     bool multiply_by_halves;
 8444     Assembler::SIMD_Arrangement load_arrangement;
 8445     switch (eltype) {
 8446     case T_BOOLEAN:
 8447     case T_BYTE:
 8448       load_arrangement = Assembler::T8B;
 8449       multiply_by_halves = true;
 8450       vf = 8;
 8451       break;
 8452     case T_CHAR:
 8453     case T_SHORT:
 8454       load_arrangement = Assembler::T8H;
 8455       multiply_by_halves = true;
 8456       vf = 8;
 8457       break;
 8458     case T_INT:
 8459       load_arrangement = Assembler::T4S;
 8460       multiply_by_halves = false;
 8461       vf = 4;
 8462       break;
 8463     default:
 8464       ShouldNotReachHere();
 8465     }
 8466 
 8467     // Unroll factor
 8468     const unsigned uf = 4;
 8469 
 8470     // Effective vectorization factor
 8471     const unsigned evf = vf * uf;
 8472 
 8473     __ align(CodeEntryAlignment);
 8474 
 8475     StubId stub_id;
 8476     switch (eltype) {
 8477     case T_BOOLEAN:
 8478       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 8479       break;
 8480     case T_BYTE:
 8481       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 8482       break;
 8483     case T_CHAR:
 8484       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 8485       break;
 8486     case T_SHORT:
 8487       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 8488       break;
 8489     case T_INT:
 8490       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 8491       break;
 8492     default:
 8493       stub_id = StubId::NO_STUBID;
 8494       ShouldNotReachHere();
 8495     };
 8496 
 8497     StubCodeMark mark(this, stub_id);
 8498 
 8499     address entry = __ pc();
 8500     __ enter();
 8501 
    // Put the 0th to 3rd powers of 31 together into a single SIMD register. The
    // register will be used in the SMALL and LARGE loops' epilogues. The
    // initialization is hoisted here; the register's value doesn't change
    // throughout either loop.
 8505     __ movw(rscratch1, intpow(31U, 3));
 8506     __ mov(vpow, Assembler::S, 0, rscratch1);
 8507     __ movw(rscratch1, intpow(31U, 2));
 8508     __ mov(vpow, Assembler::S, 1, rscratch1);
 8509     __ movw(rscratch1, intpow(31U, 1));
 8510     __ mov(vpow, Assembler::S, 2, rscratch1);
 8511     __ movw(rscratch1, intpow(31U, 0));
 8512     __ mov(vpow, Assembler::S, 3, rscratch1);
 8513 
 8514     __ mov(vmul0, Assembler::T16B, 0);
 8515     __ mov(vmul0, Assembler::S, 3, result);
 8516 
 8517     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8518     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8519 
 8520     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8521     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8522 
 8523     // SMALL LOOP
 8524     __ bind(SMALL_LOOP);
 8525 
 8526     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8527     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8528     __ subsw(rscratch2, rscratch2, vf);
 8529 
 8530     if (load_arrangement == Assembler::T8B) {
 8531       // Extend 8B to 8H to be able to use vector multiply
 8532       // instructions
 8533       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8534       if (is_signed_subword_type(eltype)) {
 8535         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8536       } else {
 8537         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8538       }
 8539     }
 8540 
 8541     switch (load_arrangement) {
 8542     case Assembler::T4S:
 8543       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8544       break;
 8545     case Assembler::T8B:
 8546     case Assembler::T8H:
 8547       assert(is_subword_type(eltype), "subword type expected");
 8548       if (is_signed_subword_type(eltype)) {
 8549         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8550       } else {
 8551         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8552       }
 8553       break;
 8554     default:
 8555       __ should_not_reach_here();
 8556     }
 8557 
 8558     // Process the upper half of a vector
 8559     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8560       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8561       if (is_signed_subword_type(eltype)) {
 8562         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8563       } else {
 8564         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8565       }
 8566     }
 8567 
 8568     __ br(Assembler::HI, SMALL_LOOP);
 8569 
    // SMALL LOOP'S EPILOGUE
 8571     __ lsr(rscratch2, cnt, exact_log2(evf));
 8572     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8573 
 8574     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8575     __ addv(vmul0, Assembler::T4S, vmul0);
 8576     __ umov(result, vmul0, Assembler::S, 0);
 8577 
 8578     // TAIL
 8579     __ bind(TAIL);
 8580 
    // The andr computes cnt % vf. The subtract, with the count shifted left by
    // 3 (4 on Cortex-A53), offsets past vf - 1 - (cnt % vf) load + madd pairs,
    // i.e. it executes only cnt % vf load + madd pairs.
 8583     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 8584     __ andr(rscratch2, cnt, vf - 1);
 8585     __ bind(TAIL_SHORTCUT);
 8586     __ adr(rscratch1, BR_BASE);
    // For Cortex-A53 the shift is 4 because 2 nops are generated per pair,
    // making each pair 4 instructions.
 8588     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 8589     __ movw(rscratch2, 0x1f);
 8590     __ br(rscratch1);
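    // Example (illustrative): with vf == 8 and cnt % vf == 3 on a core without
    // the A53 workaround, the branch target is BR_BASE - 3 * 8 bytes, so
    // execution enters 3 load + maddw pairs before BR_BASE and consumes
    // exactly the 3 remaining elements.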
 8591 
 8592     for (size_t i = 0; i < vf - 1; ++i) {
 8593       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8594                                    eltype);
 8595       __ maddw(result, result, rscratch2, rscratch1);
 8596       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 8597       // Generate 2nd nop to have 4 instructions per iteration.
 8598       if (VM_Version::supports_a53mac()) {
 8599         __ nop();
 8600       }
 8601     }
 8602     __ bind(BR_BASE);
 8603 
 8604     __ leave();
 8605     __ ret(lr);
 8606 
 8607     // LARGE LOOP
 8608     __ bind(LARGE_LOOP_PREHEADER);
 8609 
 8610     __ lsr(rscratch2, cnt, exact_log2(evf));
 8611 
 8612     if (multiply_by_halves) {
 8613       // 31^4 - multiplier between lower and upper parts of a register
 8614       __ movw(rscratch1, intpow(31U, vf / 2));
 8615       __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 8617       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8618       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8619     } else {
 8620       // 31^16
 8621       __ movw(rscratch1, intpow(31U, evf));
 8622       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8623     }
 8624 
 8625     __ mov(vmul3, Assembler::T16B, 0);
 8626     __ mov(vmul2, Assembler::T16B, 0);
 8627     __ mov(vmul1, Assembler::T16B, 0);
 8628 
 8629     __ bind(LARGE_LOOP);
 8630 
 8631     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8632     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8633     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8634     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8635 
 8636     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8637            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8638 
 8639     if (load_arrangement == Assembler::T8B) {
 8640       // Extend 8B to 8H to be able to use vector multiply
 8641       // instructions
 8642       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8643       if (is_signed_subword_type(eltype)) {
 8644         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8645         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8646         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8647         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8648       } else {
 8649         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8650         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8651         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8652         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8653       }
 8654     }
 8655 
 8656     switch (load_arrangement) {
 8657     case Assembler::T4S:
 8658       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8659       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8660       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8661       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8662       break;
 8663     case Assembler::T8B:
 8664     case Assembler::T8H:
 8665       assert(is_subword_type(eltype), "subword type expected");
 8666       if (is_signed_subword_type(eltype)) {
 8667         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8668         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8669         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8670         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8671       } else {
 8672         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8673         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8674         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8675         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8676       }
 8677       break;
 8678     default:
 8679       __ should_not_reach_here();
 8680     }
 8681 
 8682     // Process the upper half of a vector
 8683     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8684       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8685       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8686       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8687       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8688       if (is_signed_subword_type(eltype)) {
 8689         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8690         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8691         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8692         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8693       } else {
 8694         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8695         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8696         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8697         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8698       }
 8699     }
 8700 
 8701     __ subsw(rscratch2, rscratch2, 1);
 8702     __ br(Assembler::HI, LARGE_LOOP);
 8703 
 8704     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8705     __ addv(vmul3, Assembler::T4S, vmul3);
 8706     __ umov(result, vmul3, Assembler::S, 0);
 8707 
 8708     __ mov(rscratch2, intpow(31U, vf));
 8709 
 8710     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8711     __ addv(vmul2, Assembler::T4S, vmul2);
 8712     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8713     __ maddw(result, result, rscratch2, rscratch1);
 8714 
 8715     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8716     __ addv(vmul1, Assembler::T4S, vmul1);
 8717     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8718     __ maddw(result, result, rscratch2, rscratch1);
 8719 
 8720     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8721     __ addv(vmul0, Assembler::T4S, vmul0);
 8722     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8723     __ maddw(result, result, rscratch2, rscratch1);
 8724 
 8725     __ andr(rscratch2, cnt, vf - 1);
 8726     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8727 
 8728     __ leave();
 8729     __ ret(lr);
 8730 
 8731     return entry;
 8732   }
 8733 
 8734   address generate_dsin_dcos(bool isCos) {
 8735     __ align(CodeEntryAlignment);
 8736     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 8737     StubCodeMark mark(this, stub_id);
 8738     address start = __ pc();
 8739     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8740         (address)StubRoutines::aarch64::_two_over_pi,
 8741         (address)StubRoutines::aarch64::_pio2,
 8742         (address)StubRoutines::aarch64::_dsin_coef,
 8743         (address)StubRoutines::aarch64::_dcos_coef);
 8744     return start;
 8745   }
 8746 
  // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
 8748   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8749       Label &DIFF2) {
 8750     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8751     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8752 
 8753     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8754     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8755     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
    // vtmp3 now holds the low 8 Latin1 chars widened to UTF-16; the high 8 in
    // vtmp are widened by zip2 below, giving 32 bytes of UTF-16 data in total.
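    // (zip1/zip2 against the zero register widen Latin1 bytes to UTF-16
    // chars, e.g. the byte 0x61 ('a') becomes the little-endian halfword
    // 0x0061, so a 64-bit lane of the widened data can be xor-compared
    // directly with 8 bytes, i.e. 4 chars, loaded from the UTF-16 string.)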
 8757 
 8758     __ fmovd(tmpL, vtmp3);
 8759     __ eor(rscratch2, tmp3, tmpL);
 8760     __ cbnz(rscratch2, DIFF2);
 8761 
 8762     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8763     __ umov(tmpL, vtmp3, __ D, 1);
 8764     __ eor(rscratch2, tmpU, tmpL);
 8765     __ cbnz(rscratch2, DIFF1);
 8766 
 8767     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8768     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8769     __ fmovd(tmpL, vtmp);
 8770     __ eor(rscratch2, tmp3, tmpL);
 8771     __ cbnz(rscratch2, DIFF2);
 8772 
 8773     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8774     __ umov(tmpL, vtmp, __ D, 1);
 8775     __ eor(rscratch2, tmpU, tmpL);
 8776     __ cbnz(rscratch2, DIFF1);
 8777   }
 8778 
 8779   // r0  = result
 8780   // r1  = str1
 8781   // r2  = cnt1
 8782   // r3  = str2
 8783   // r4  = cnt2
 8784   // r10 = tmp1
 8785   // r11 = tmp2
 8786   address generate_compare_long_string_different_encoding(bool isLU) {
 8787     __ align(CodeEntryAlignment);
 8788     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 8789     StubCodeMark mark(this, stub_id);
 8790     address entry = __ pc();
 8791     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8792         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8793         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8794     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8795         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8796     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8797     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8798 
 8799     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8800 
 8801     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
 8804     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8805     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8806     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8807     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. The last 4 are a special case.
 8809     __ eor(rscratch2, tmp1, tmp2);
 8810     __ mov(rscratch1, tmp2);
 8811     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8812     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8813              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8814     __ push(spilled_regs, sp);
 8815     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8816     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8817 
 8818     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8819 
 8820     if (SoftwarePrefetchHintDistance >= 0) {
 8821       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8822       __ br(__ LT, NO_PREFETCH);
 8823       __ bind(LARGE_LOOP_PREFETCH);
 8824         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8825         __ mov(tmp4, 2);
 8826         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8827         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8828           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8829           __ subs(tmp4, tmp4, 1);
 8830           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8831           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8832           __ mov(tmp4, 2);
 8833         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8834           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8835           __ subs(tmp4, tmp4, 1);
 8836           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8837           __ sub(cnt2, cnt2, 64);
 8838           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8839           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8840     }
 8841     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8842     __ bind(NO_PREFETCH);
 8843     __ subs(cnt2, cnt2, 16);
 8844     __ br(__ LT, TAIL);
 8845     __ align(OptoLoopAlignment);
 8846     __ bind(SMALL_LOOP); // smaller loop
 8847       __ subs(cnt2, cnt2, 16);
 8848       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8849       __ br(__ GE, SMALL_LOOP);
 8850       __ cmn(cnt2, (u1)16);
 8851       __ br(__ EQ, LOAD_LAST);
 8852     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8853       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8854       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8855       __ ldr(tmp3, Address(cnt1, -8));
 8856       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8857       __ b(LOAD_LAST);
 8858     __ bind(DIFF2);
 8859       __ mov(tmpU, tmp3);
 8860     __ bind(DIFF1);
 8861       __ pop(spilled_regs, sp);
 8862       __ b(CALCULATE_DIFFERENCE);
 8863     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
      // compare_string_16_x_LU, so there is no need to load them again.
 8866       __ mov(tmpU, tmp3);
 8867       __ pop(spilled_regs, sp);
 8868 
      // tmp2 now points to the last 4 Latin1 characters
 8870       __ ldrs(vtmp, Address(tmp2));
 8871       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8872       __ fmovd(tmpL, vtmp);
 8873 
 8874       __ eor(rscratch2, tmpU, tmpL);
 8875       __ cbz(rscratch2, DONE);
 8876 
 8877     // Find the first different characters in the longwords and
 8878     // compute their difference.
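    // (Informal sketch of the bit trick: the operands were loaded
    // little-endian, so rev makes the lowest-addressed byte most significant
    // and clz then yields, up to rounding within a byte, the bit offset of the
    // first difference; andr(..., -16) rounds down to the start of the
    // containing 16-bit char, and lsrv brings that char of each operand into
    // bits 0..15 before the subtract.)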
 8879     __ bind(CALCULATE_DIFFERENCE);
 8880       __ rev(rscratch2, rscratch2);
 8881       __ clz(rscratch2, rscratch2);
 8882       __ andr(rscratch2, rscratch2, -16);
 8883       __ lsrv(tmp1, tmp1, rscratch2);
 8884       __ uxthw(tmp1, tmp1);
 8885       __ lsrv(rscratch1, rscratch1, rscratch2);
 8886       __ uxthw(rscratch1, rscratch1);
 8887       __ subw(result, tmp1, rscratch1);
 8888     __ bind(DONE);
 8889       __ ret(lr);
 8890     return entry;
 8891   }
 8892 
 8893   // r0 = input (float16)
 8894   // v0 = result (float)
 8895   // v1 = temporary float register
 8896   address generate_float16ToFloat() {
 8897     __ align(CodeEntryAlignment);
 8898     StubId stub_id = StubId::stubgen_hf2f_id;
 8899     StubCodeMark mark(this, stub_id);
 8900     address entry = __ pc();
 8901     BLOCK_COMMENT("Entry:");
 8902     __ flt16_to_flt(v0, r0, v1);
 8903     __ ret(lr);
 8904     return entry;
 8905   }
 8906 
 8907   // v0 = input (float)
 8908   // r0 = result (float16)
 8909   // v1 = temporary float register
 8910   address generate_floatToFloat16() {
 8911     __ align(CodeEntryAlignment);
 8912     StubId stub_id = StubId::stubgen_f2hf_id;
 8913     StubCodeMark mark(this, stub_id);
 8914     address entry = __ pc();
 8915     BLOCK_COMMENT("Entry:");
 8916     __ flt_to_flt16(r0, v0, v1);
 8917     __ ret(lr);
 8918     return entry;
 8919   }
 8920 
 8921   address generate_method_entry_barrier() {
 8922     __ align(CodeEntryAlignment);
 8923     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 8924     StubCodeMark mark(this, stub_id);
 8925 
 8926     Label deoptimize_label;
 8927 
 8928     address start = __ pc();
 8929 
 8930     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 8931 
 8932     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 8933       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 8934       // We can get here despite the nmethod being good, if we have not
 8935       // yet applied our cross modification fence (or data fence).
 8936       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 8937       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 8938       __ ldrw(rscratch2, rscratch2);
 8939       __ strw(rscratch2, thread_epoch_addr);
 8940       __ isb();
 8941       __ membar(__ LoadLoad);
 8942     }
 8943 
 8944     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 8945 
 8946     __ enter();
 8947     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 8948 
 8949     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 8950 
 8951     __ push_call_clobbered_registers();
 8952 
 8953     __ mov(c_rarg0, rscratch2);
 8954     __ call_VM_leaf
 8955          (CAST_FROM_FN_PTR
 8956           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 8957 
 8958     __ reset_last_Java_frame(true);
 8959 
 8960     __ mov(rscratch1, r0);
 8961 
 8962     __ pop_call_clobbered_registers();
 8963 
 8964     __ cbnz(rscratch1, deoptimize_label);
 8965 
 8966     __ leave();
 8967     __ ret(lr);
 8968 
 8969     __ BIND(deoptimize_label);
 8970 
 8971     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 8972     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 8973 
 8974     __ mov(sp, rscratch1);
 8975     __ br(rscratch2);
 8976 
 8977     return start;
 8978   }
 8979 
 8980   // r0  = result
 8981   // r1  = str1
 8982   // r2  = cnt1
 8983   // r3  = str2
 8984   // r4  = cnt2
 8985   // r10 = tmp1
 8986   // r11 = tmp2
 8987   address generate_compare_long_string_same_encoding(bool isLL) {
 8988     __ align(CodeEntryAlignment);
 8989     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 8990     StubCodeMark mark(this, stub_id);
 8991     address entry = __ pc();
 8992     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8993         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 8994 
 8995     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 8996 
    // exit from the large loop when fewer than 64 bytes are left to read or
    // we're about to prefetch memory beyond the array boundary
 8999     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 9000 
    // The caller pre-loads 8 bytes before jumping to the stub, so the first comparison can be done directly.
 9002     __ eor(rscratch2, tmp1, tmp2);
 9003     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9004 
 9005     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9006     // update pointers, because of previous read
 9007     __ add(str1, str1, wordSize);
 9008     __ add(str2, str2, wordSize);
 9009     if (SoftwarePrefetchHintDistance >= 0) {
 9010       __ align(OptoLoopAlignment);
 9011       __ bind(LARGE_LOOP_PREFETCH);
 9012         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9013         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9014 
 9015         for (int i = 0; i < 4; i++) {
 9016           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9017           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9018           __ cmp(tmp1, tmp2);
 9019           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9020           __ br(Assembler::NE, DIFF);
 9021         }
 9022         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9023         __ add(str1, str1, 64);
 9024         __ add(str2, str2, 64);
 9025         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9026         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9027         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9028     }
 9029 
 9030     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9031     __ br(Assembler::LE, LESS16);
 9032     __ align(OptoLoopAlignment);
 9033     __ bind(LOOP_COMPARE16);
 9034       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9035       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9036       __ cmp(tmp1, tmp2);
 9037       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9038       __ br(Assembler::NE, DIFF);
 9039       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9040       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9041       __ br(Assembler::LT, LESS16);
 9042 
 9043       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9044       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9045       __ cmp(tmp1, tmp2);
 9046       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9047       __ br(Assembler::NE, DIFF);
 9048       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9049       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9050       __ br(Assembler::GE, LOOP_COMPARE16);
 9051       __ cbz(cnt2, LENGTH_DIFF);
 9052 
 9053     __ bind(LESS16);
      // compare in 8-byte chunks
 9055       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9056       __ br(Assembler::LE, LESS8);
 9057       __ ldr(tmp1, Address(__ post(str1, 8)));
 9058       __ ldr(tmp2, Address(__ post(str2, 8)));
 9059       __ eor(rscratch2, tmp1, tmp2);
 9060       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9061       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9062 
 9063     __ bind(LESS8); // directly load last 8 bytes
 9064       if (!isLL) {
 9065         __ add(cnt2, cnt2, cnt2);
 9066       }
 9067       __ ldr(tmp1, Address(str1, cnt2));
 9068       __ ldr(tmp2, Address(str2, cnt2));
 9069       __ eor(rscratch2, tmp1, tmp2);
 9070       __ cbz(rscratch2, LENGTH_DIFF);
 9071       __ b(CAL_DIFFERENCE);
 9072 
 9073     __ bind(DIFF);
 9074       __ cmp(tmp1, tmp2);
 9075       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9076       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9077       // reuse rscratch2 register for the result of eor instruction
 9078       __ eor(rscratch2, tmp1, tmp2);
 9079 
 9080     __ bind(CAL_DIFFERENCE);
 9081       __ rev(rscratch2, rscratch2);
 9082       __ clz(rscratch2, rscratch2);
 9083       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9084       __ lsrv(tmp1, tmp1, rscratch2);
 9085       __ lsrv(tmp2, tmp2, rscratch2);
 9086       if (isLL) {
 9087         __ uxtbw(tmp1, tmp1);
 9088         __ uxtbw(tmp2, tmp2);
 9089       } else {
 9090         __ uxthw(tmp1, tmp1);
 9091         __ uxthw(tmp2, tmp2);
 9092       }
 9093       __ subw(result, tmp1, tmp2);
 9094 
 9095     __ bind(LENGTH_DIFF);
 9096       __ ret(lr);
 9097     return entry;
 9098   }
 9099 
 9100   enum string_compare_mode {
 9101     LL,
 9102     LU,
 9103     UL,
 9104     UU,
 9105   };
 9106 
 9107   // The following registers are declared in aarch64.ad
 9108   // r0  = result
 9109   // r1  = str1
 9110   // r2  = cnt1
 9111   // r3  = str2
 9112   // r4  = cnt2
 9113   // r10 = tmp1
 9114   // r11 = tmp2
 9115   // z0  = ztmp1
 9116   // z1  = ztmp2
 9117   // p0  = pgtmp1
 9118   // p1  = pgtmp2
 9119   address generate_compare_long_string_sve(string_compare_mode mode) {
 9120     StubId stub_id;
 9121     switch (mode) {
 9122       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9123       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9124       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9125       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9126       default: ShouldNotReachHere();
 9127     }
 9128 
 9129     __ align(CodeEntryAlignment);
 9130     address entry = __ pc();
 9131     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9132              tmp1 = r10, tmp2 = r11;
 9133 
 9134     Label LOOP, DONE, MISMATCH;
 9135     Register vec_len = tmp1;
 9136     Register idx = tmp2;
 9137     // The minimum of the string lengths has been stored in cnt2.
 9138     Register cnt = cnt2;
 9139     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9140     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9141 
 9142 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9143     switch (mode) {                                                            \
 9144       case LL:                                                                 \
 9145         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9146         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9147         break;                                                                 \
 9148       case LU:                                                                 \
 9149         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9150         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9151         break;                                                                 \
 9152       case UL:                                                                 \
 9153         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9154         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9155         break;                                                                 \
 9156       case UU:                                                                 \
 9157         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9158         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9159         break;                                                                 \
 9160       default:                                                                 \
 9161         ShouldNotReachHere();                                                  \
 9162     }
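    // Note (informal): in the mixed-encoding cases sve_ld1b with an H element
    // size zero-extends each Latin1 byte into a 16-bit lane, so both strings
    // are compared at UTF-16 width under a single predicate.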
 9163 
 9164     StubCodeMark mark(this, stub_id);
 9165 
 9166     __ mov(idx, 0);
 9167     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9168 
 9169     if (mode == LL) {
 9170       __ sve_cntb(vec_len);
 9171     } else {
 9172       __ sve_cnth(vec_len);
 9173     }
 9174 
 9175     __ sub(rscratch1, cnt, vec_len);
 9176 
 9177     __ bind(LOOP);
 9178 
 9179       // main loop
 9180       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9181       __ add(idx, idx, vec_len);
 9182       // Compare strings.
 9183       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9184       __ br(__ NE, MISMATCH);
 9185       __ cmp(idx, rscratch1);
 9186       __ br(__ LT, LOOP);
 9187 
 9188     // post loop, last iteration
 9189     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9190 
 9191     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9192     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9193     __ br(__ EQ, DONE);
 9194 
 9195     __ bind(MISMATCH);
 9196 
    // Crop the predicate to locate the first mismatch.
 9198     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9199     // Extract the first different characters of each string.
 9200     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9201     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9202 
 9203     // Compute the difference of the first different characters.
 9204     __ sub(result, rscratch1, rscratch2);
 9205 
 9206     __ bind(DONE);
 9207     __ ret(lr);
 9208 #undef LOAD_PAIR
 9209     return entry;
 9210   }
 9211 
 9212   void generate_compare_long_strings() {
 9213     if (UseSVE == 0) {
 9214       StubRoutines::aarch64::_compare_long_string_LL
 9215           = generate_compare_long_string_same_encoding(true);
 9216       StubRoutines::aarch64::_compare_long_string_UU
 9217           = generate_compare_long_string_same_encoding(false);
 9218       StubRoutines::aarch64::_compare_long_string_LU
 9219           = generate_compare_long_string_different_encoding(true);
 9220       StubRoutines::aarch64::_compare_long_string_UL
 9221           = generate_compare_long_string_different_encoding(false);
 9222     } else {
 9223       StubRoutines::aarch64::_compare_long_string_LL
 9224           = generate_compare_long_string_sve(LL);
 9225       StubRoutines::aarch64::_compare_long_string_UU
 9226           = generate_compare_long_string_sve(UU);
 9227       StubRoutines::aarch64::_compare_long_string_LU
 9228           = generate_compare_long_string_sve(LU);
 9229       StubRoutines::aarch64::_compare_long_string_UL
 9230           = generate_compare_long_string_sve(UL);
 9231     }
 9232   }
 9233 
 9234   // R0 = result
 9235   // R1 = str2
 9236   // R2 = cnt1
 9237   // R3 = str1
 9238   // R4 = cnt2
 9239   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 9240   //
  // This generic linear code uses a few additional ideas which make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8) in order to skip the initial loading (helps on systems with
  // 1 ld pipeline)
  // 2) we can use a "fast" algorithm for finding the first character of the
  // pattern, with fewer branches (1 branch per loaded register instead of a
  // branch per symbol), which is where constants like 0x0101...01,
  // 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from (see the
  // sketch below)
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be used to search for every occurrence of the 1st character, saving a
  // few loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
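  //
  // For reference, the "fast" first-character search in (2) is the classic
  // SWAR zero-byte test (an informal sketch for the Latin1 case): after
  // xoring the loaded word x with the pattern's first character replicated
  // into every byte, a matching byte becomes 0x00, and
  //
  //   (x - 0x0101010101010101) & ~x & 0x8080808080808080
  //
  // is non-zero iff some byte matched, with the lowest set bit marking the
  // first candidate; the code below computes the same expression with the
  // 0x7f7f...7f constant folded into the orr/bics pair, and any false
  // positives in higher bytes are simply rejected by the per-candidate
  // comparison loops.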
 9255   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 9256     StubId stub_id;
 9257     if (str1_isL) {
 9258       if (str2_isL) {
 9259         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
 9260       } else {
 9261         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
 9262       }
 9263     } else {
 9264       if (str2_isL) {
 9265         ShouldNotReachHere();
 9266       } else {
 9267         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
 9268       }
 9269     }
 9270     __ align(CodeEntryAlignment);
 9271     StubCodeMark mark(this, stub_id);
 9272     address entry = __ pc();
 9273 
 9274     int str1_chr_size = str1_isL ? 1 : 2;
 9275     int str2_chr_size = str2_isL ? 1 : 2;
 9276     int str1_chr_shift = str1_isL ? 0 : 1;
 9277     int str2_chr_shift = str2_isL ? 0 : 1;
 9278     bool isL = str1_isL && str2_isL;
    // parameters
 9280     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 9281     // temporary registers
 9282     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 9283     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 9284     // redefinitions
 9285     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 9286 
 9287     __ push(spilled_regs, sp);
 9288     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 9289         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 9290         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 9291         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 9292         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 9293         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 9294     // Read whole register from str1. It is safe, because length >=8 here
 9295     __ ldr(ch1, Address(str1));
 9296     // Read whole register from str2. It is safe, because length >=8 here
 9297     __ ldr(ch2, Address(str2));
 9298     __ sub(cnt2, cnt2, cnt1);
 9299     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 9300     if (str1_isL != str2_isL) {
 9301       __ eor(v0, __ T16B, v0, v0);
 9302     }
 9303     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 9304     __ mul(first, first, tmp1);
    // check if we have less than one register's worth of characters to check
 9306     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 9307     if (str1_isL != str2_isL) {
 9308       __ fmovd(v1, ch1);
 9309     }
 9310     __ br(__ LE, L_SMALL);
 9311     __ eor(ch2, first, ch2);
 9312     if (str1_isL != str2_isL) {
 9313       __ zip1(v1, __ T16B, v1, v0);
 9314     }
 9315     __ sub(tmp2, ch2, tmp1);
 9316     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9317     __ bics(tmp2, tmp2, ch2);
 9318     if (str1_isL != str2_isL) {
 9319       __ fmovd(ch1, v1);
 9320     }
 9321     __ br(__ NE, L_HAS_ZERO);
 9322     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9323     __ add(result, result, wordSize/str2_chr_size);
 9324     __ add(str2, str2, wordSize);
 9325     __ br(__ LT, L_POST_LOOP);
 9326     __ BIND(L_LOOP);
 9327       __ ldr(ch2, Address(str2));
 9328       __ eor(ch2, first, ch2);
 9329       __ sub(tmp2, ch2, tmp1);
 9330       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9331       __ bics(tmp2, tmp2, ch2);
 9332       __ br(__ NE, L_HAS_ZERO);
 9333     __ BIND(L_LOOP_PROCEED);
 9334       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9335       __ add(str2, str2, wordSize);
 9336       __ add(result, result, wordSize/str2_chr_size);
 9337       __ br(__ GE, L_LOOP);
 9338     __ BIND(L_POST_LOOP);
 9339       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 9340       __ br(__ LE, NOMATCH);
 9341       __ ldr(ch2, Address(str2));
 9342       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9343       __ eor(ch2, first, ch2);
 9344       __ sub(tmp2, ch2, tmp1);
 9345       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9346       __ mov(tmp4, -1); // all bits set
 9347       __ b(L_SMALL_PROCEED);
 9348     __ align(OptoLoopAlignment);
 9349     __ BIND(L_SMALL);
 9350       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9351       __ eor(ch2, first, ch2);
 9352       if (str1_isL != str2_isL) {
 9353         __ zip1(v1, __ T16B, v1, v0);
 9354       }
 9355       __ sub(tmp2, ch2, tmp1);
 9356       __ mov(tmp4, -1); // all bits set
 9357       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9358       if (str1_isL != str2_isL) {
 9359         __ fmovd(ch1, v1); // move converted 4 symbols
 9360       }
 9361     __ BIND(L_SMALL_PROCEED);
 9362       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 9363       __ bic(tmp2, tmp2, ch2);
 9364       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9365       __ rbit(tmp2, tmp2);
 9366       __ br(__ EQ, NOMATCH);
 9367     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 9369       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9370       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9371       if (str2_isL) { // LL
 9372         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9373         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9374         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9375         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9376         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9377       } else {
 9378         __ mov(ch2, 0xE); // all bits in byte set except last one
 9379         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9380         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9381         __ lslv(tmp2, tmp2, tmp4);
 9382         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9383         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9384         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9385         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9386       }
 9387       __ cmp(ch1, ch2);
 9388       __ mov(tmp4, wordSize/str2_chr_size);
 9389       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9390     __ BIND(L_SMALL_CMP_LOOP);
 9391       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9392                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9393       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9394                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9395       __ add(tmp4, tmp4, 1);
 9396       __ cmp(tmp4, cnt1);
 9397       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9398       __ cmp(first, ch2);
 9399       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9400     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9401       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9402       __ clz(tmp4, tmp2);
 9403       __ add(result, result, 1); // advance index
 9404       __ add(str2, str2, str2_chr_size); // advance pointer
 9405       __ b(L_SMALL_HAS_ZERO_LOOP);
 9406     __ align(OptoLoopAlignment);
 9407     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9408       __ cmp(first, ch2);
 9409       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9410       __ b(DONE);
 9411     __ align(OptoLoopAlignment);
 9412     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9413       if (str2_isL) { // LL
 9414         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9415         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9416         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9417         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9418         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9419       } else {
 9420         __ mov(ch2, 0xE); // all bits in byte set except last one
 9421         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9422         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9423         __ lslv(tmp2, tmp2, tmp4);
 9424         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9425         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9426         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9427         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9428       }
 9429       __ cmp(ch1, ch2);
 9430       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9431       __ b(DONE);
 9432     __ align(OptoLoopAlignment);
 9433     __ BIND(L_HAS_ZERO);
 9434       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now compress the two counters (cnt2 and cnt1) into one register.
      // This is fine because both counters are 32-bit and are not changed in
      // this loop; they are restored on exit, so cnt1 can be re-used here.
 9439       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
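      // (Layout note: cnt2 now holds (cnt1 << 32) | cnt2. cnt1 is recovered
      // below with a 32-bit right shift and cnt2 with a 32-bit move.)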
 9440       __ sub(result, result, 1);
 9441     __ BIND(L_HAS_ZERO_LOOP);
 9442       __ mov(cnt1, wordSize/str2_chr_size);
 9443       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9444       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9445       if (str2_isL) {
 9446         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9447         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9448         __ lslv(tmp2, tmp2, tmp4);
 9449         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9450         __ add(tmp4, tmp4, 1);
 9451         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9452         __ lsl(tmp2, tmp2, 1);
 9453         __ mov(tmp4, wordSize/str2_chr_size);
 9454       } else {
 9455         __ mov(ch2, 0xE);
 9456         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9457         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9458         __ lslv(tmp2, tmp2, tmp4);
 9459         __ add(tmp4, tmp4, 1);
 9460         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9461         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9462         __ lsl(tmp2, tmp2, 1);
 9463         __ mov(tmp4, wordSize/str2_chr_size);
 9464         __ sub(str2, str2, str2_chr_size);
 9465       }
 9466       __ cmp(ch1, ch2);
 9467       __ mov(tmp4, wordSize/str2_chr_size);
 9468       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9469     __ BIND(L_CMP_LOOP);
 9470       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9471                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9472       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9473                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9474       __ add(tmp4, tmp4, 1);
 9475       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9476       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9477       __ cmp(cnt1, ch2);
 9478       __ br(__ EQ, L_CMP_LOOP);
 9479     __ BIND(L_CMP_LOOP_NOMATCH);
      // we did not match here
 9481       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9482       __ clz(tmp4, tmp2);
 9483       __ add(str2, str2, str2_chr_size); // advance pointer
 9484       __ b(L_HAS_ZERO_LOOP);
 9485     __ align(OptoLoopAlignment);
 9486     __ BIND(L_CMP_LOOP_LAST_CMP);
 9487       __ cmp(cnt1, ch2);
 9488       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9489       __ b(DONE);
 9490     __ align(OptoLoopAlignment);
 9491     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9492       if (str2_isL) {
 9493         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9494         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9495         __ lslv(tmp2, tmp2, tmp4);
 9496         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9497         __ add(tmp4, tmp4, 1);
 9498         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9499         __ lsl(tmp2, tmp2, 1);
 9500       } else {
 9501         __ mov(ch2, 0xE);
 9502         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9503         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9504         __ lslv(tmp2, tmp2, tmp4);
 9505         __ add(tmp4, tmp4, 1);
 9506         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9507         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9508         __ lsl(tmp2, tmp2, 1);
 9509         __ sub(str2, str2, str2_chr_size);
 9510       }
 9511       __ cmp(ch1, ch2);
 9512       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9513       __ b(DONE);
 9514     __ align(OptoLoopAlignment);
 9515     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. A byte octet was
      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the higher bits were not changed.
      // L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can simply clear the lower bits of result here:
      // the 2 lower bits for UU/UL and 3 bits for LL.
      // 2) Restore cnt1 and cnt2 from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
      // (UU/UL) is the index of the last analyzed substring within the
      // current octet, so str2 currently points at that octet's start
      // address; it must be moved on to the next octet.
 9526       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9527       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9528       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9529       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9530       __ movw(cnt2, cnt2);
 9531       __ b(L_LOOP_PROCEED);
 9532     __ align(OptoLoopAlignment);
 9533     __ BIND(NOMATCH);
 9534       __ mov(result, -1);
 9535     __ BIND(DONE);
 9536       __ pop(spilled_regs, sp);
 9537       __ ret(lr);
 9538     return entry;
 9539   }
 9540 
 9541   void generate_string_indexof_stubs() {
 9542     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9543     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9544     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9545   }
 9546 
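  // Helper for generate_large_byte_array_inflate below. "Inflating" a
  // Latin-1 byte to a UTF-16 char is just zero-extension: zip1/zip2
  // interleave the source bytes with the zero register v0, 16 bytes at a
  // time. As a scalar reference only (not generated code), assuming a plain
  // byte source and char destination:
  //
  //   void inflate(const uint8_t* src, uint16_t* dst, size_t len) {
  //     for (size_t i = 0; i < len; i++) {
  //       dst[i] = src[i];   // zero-extend each byte to a 16-bit char
  //     }
  //   }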
 9547   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9548       FloatRegister src1, FloatRegister src2) {
 9549     Register dst = r1;
 9550     __ zip1(v1, __ T16B, src1, v0);
 9551     __ zip2(v2, __ T16B, src1, v0);
 9552     if (generatePrfm) {
 9553       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9554     }
 9555     __ zip1(v3, __ T16B, src2, v0);
 9556     __ zip2(v4, __ T16B, src2, v0);
 9557     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9558   }
 9559 
  // r0 = src
  // r1 = dst
  // r2 = len
  // r3 = len >> 3 (number of 8-byte octets)
  // v0 = 0
  // v1 = 8 bytes of src, already loaded
  // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 9567   address generate_large_byte_array_inflate() {
 9568     __ align(CodeEntryAlignment);
 9569     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
 9570     StubCodeMark mark(this, stub_id);
 9571     address entry = __ pc();
 9572     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9573     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9574     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9575 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases, and so that a single store instruction can be used.
 9578     __ ldrd(v2, __ post(src, 8));
 9579     __ sub(octetCounter, octetCounter, 2);
 9580     __ zip1(v1, __ T16B, v1, v0);
 9581     __ zip1(v2, __ T16B, v2, v0);
 9582     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9583     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9584     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9585     __ br(__ LE, LOOP_START);
 9586     __ b(LOOP_PRFM_START);
 9587     __ bind(LOOP_PRFM);
 9588       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9589     __ bind(LOOP_PRFM_START);
 9590       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9591       __ sub(octetCounter, octetCounter, 8);
 9592       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9593       inflate_and_store_2_fp_registers(true, v3, v4);
 9594       inflate_and_store_2_fp_registers(true, v5, v6);
 9595       __ br(__ GT, LOOP_PRFM);
 9596       __ cmp(octetCounter, (u1)8);
 9597       __ br(__ LT, DONE);
 9598     __ bind(LOOP);
 9599       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9600       __ bind(LOOP_START);
 9601       __ sub(octetCounter, octetCounter, 8);
 9602       __ cmp(octetCounter, (u1)8);
 9603       inflate_and_store_2_fp_registers(false, v3, v4);
 9604       inflate_and_store_2_fp_registers(false, v5, v6);
 9605       __ br(__ GE, LOOP);
 9606     __ bind(DONE);
 9607       __ ret(lr);
 9608     return entry;
 9609   }
 9610 
 9611   /**
 9612    *  Arguments:
 9613    *
 9614    *  Input:
 9615    *  c_rarg0   - current state address
 9616    *  c_rarg1   - H key address
 9617    *  c_rarg2   - data address
 9618    *  c_rarg3   - number of blocks
 9619    *
 9620    *  Output:
 9621    *  Updated state at c_rarg0
 9622    */
 9623   address generate_ghash_processBlocks() {
 9624     // Bafflingly, GCM uses little-endian for the byte order, but
 9625     // big-endian for the bit order.  For example, the polynomial 1 is
 9626     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9627     //
 9628     // So, we must either reverse the bytes in each word and do
 9629     // everything big-endian or reverse the bits in each byte and do
 9630     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9631     // the bits in each byte (we have an instruction, RBIT, to do
 9632     // that) and keep the data in little-endian bit order through the
 9633     // calculation, bit-reversing the inputs and outputs.
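    //
    // As an illustration only (not generated code): the per-byte bit
    // reversal done by RBIT is equivalent to the following scalar helper,
    // assuming a plain uint8_t input:
    //
    //   uint8_t rbit8(uint8_t b) {
    //     uint8_t r = 0;
    //     for (int i = 0; i < 8; i++) {
    //       r = (uint8_t)((r << 1) | ((b >> i) & 1));
    //     }
    //     return r;
    //   }
    //
    // Bit-reversing every byte of the inputs (and of the final result) lets
    // the multiply/reduce below work entirely in little-endian bit order.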
 9634 
 9635     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
 9636     StubCodeMark mark(this, stub_id);
 9637     Label polynomial; // local data generated at end of stub
 9638     __ align(CodeEntryAlignment);
 9639     address start = __ pc();
 9640 
 9641     Register state   = c_rarg0;
 9642     Register subkeyH = c_rarg1;
 9643     Register data    = c_rarg2;
 9644     Register blocks  = c_rarg3;
 9645 
 9646     FloatRegister vzr = v30;
 9647     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9648 
 9649     __ adr(rscratch1, polynomial);
 9650     __ ldrq(v24, rscratch1);    // The field polynomial
 9651 
 9652     __ ldrq(v0, Address(state));
 9653     __ ldrq(v1, Address(subkeyH));
 9654 
 9655     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9656     __ rbit(v0, __ T16B, v0);
 9657     __ rev64(v1, __ T16B, v1);
 9658     __ rbit(v1, __ T16B, v1);
 9659 
 9660     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 9661     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 9662 
 9663     {
 9664       Label L_ghash_loop;
 9665       __ bind(L_ghash_loop);
 9666 
 9667       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9668                                                  // reversing each byte
 9669       __ rbit(v2, __ T16B, v2);
 9670       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9671 
 9672       // Multiply state in v2 by subkey in v1
 9673       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9674                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9675                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9676       // Reduce v7:v5 by the field polynomial
 9677       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9678 
 9679       __ sub(blocks, blocks, 1);
 9680       __ cbnz(blocks, L_ghash_loop);
 9681     }
 9682 
 9683     // The bit-reversed result is at this point in v0
 9684     __ rev64(v0, __ T16B, v0);
 9685     __ rbit(v0, __ T16B, v0);
 9686 
 9687     __ st1(v0, __ T16B, state);
 9688     __ ret(lr);
 9689 
 9690     // bind label and generate local polynomial data
 9691     __ align(wordSize * 2);
 9692     __ bind(polynomial);
 9693     __ emit_int64(0x87);  // The low-order bits of the field
 9694                           // polynomial (i.e. p = z^7+z^2+z+1)
 9695                           // repeated in the low and high parts of a
 9696                           // 128-bit vector
 9697     __ emit_int64(0x87);
 9698 
 9699     return start;
 9700   }
 9701 
 9702   address generate_ghash_processBlocks_wide() {
 9703     address small = generate_ghash_processBlocks();
 9704 
 9705     StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
 9706     StubCodeMark mark(this, stub_id);
 9707     Label polynomial;           // local data generated after stub
 9708     __ align(CodeEntryAlignment);
 9709     address start = __ pc();
 9710 
 9711     Register state   = c_rarg0;
 9712     Register subkeyH = c_rarg1;
 9713     Register data    = c_rarg2;
 9714     Register blocks  = c_rarg3;
 9715 
 9716     const int unroll = 4;
 9717 
 9718     __ cmp(blocks, (unsigned char)(unroll * 2));
 9719     __ br(__ LT, small);
 9720 
 9721     if (unroll > 1) {
      // Save state before entering routine
 9723       __ sub(sp, sp, 4 * 16);
 9724       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9725       __ sub(sp, sp, 4 * 16);
 9726       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9727     }
 9728 
 9729     __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
 9730 
 9731     if (unroll > 1) {
 9732       // And restore state
 9733       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9734       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9735     }
 9736 
 9737     __ cmp(blocks, (unsigned char)0);
 9738     __ br(__ GT, small);
 9739 
 9740     __ ret(lr);
 9741 
 9742     // bind label and generate polynomial data
 9743     __ align(wordSize * 2);
 9744     __ bind(polynomial);
 9745     __ emit_int64(0x87);  // The low-order bits of the field
 9746                           // polynomial (i.e. p = z^7+z^2+z+1)
 9747                           // repeated in the low and high parts of a
 9748                           // 128-bit vector
 9749     __ emit_int64(0x87);
 9750 
 9751     return start;
 9752 
 9753   }
 9754 
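  // Helper for generate_base64_encodeBlock below: encode one SIMD round.
  // Each group of 3 input bytes expands to 4 output characters. As a scalar
  // reference only (not generated code):
  //
  //   // b0, b1, b2 are the three input bytes; codec[] is the 64-entry table
  //   uint32_t bits = (b0 << 16) | (b1 << 8) | b2;      // 24 bits
  //   out[0] = codec[(bits >> 18) & 0x3f];
  //   out[1] = codec[(bits >> 12) & 0x3f];
  //   out[2] = codec[(bits >>  6) & 0x3f];
  //   out[3] = codec[ bits        & 0x3f];
  //
  // The SIMD version below computes the four 6-bit indices with shifts and
  // ORs on ld3-deinterleaved lanes, then maps them through the codec table
  // with tbl.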
 9755   void generate_base64_encode_simdround(Register src, Register dst,
 9756         FloatRegister codec, u8 size) {
 9757 
 9758     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9759     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9760     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 9761 
 9762     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9763 
 9764     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 9765 
 9766     __ ushr(ind0, arrangement, in0,  2);
 9767 
 9768     __ ushr(ind1, arrangement, in1,  2);
 9769     __ shl(in0,   arrangement, in0,  6);
 9770     __ orr(ind1,  arrangement, ind1, in0);
 9771     __ ushr(ind1, arrangement, ind1, 2);
 9772 
 9773     __ ushr(ind2, arrangement, in2,  4);
 9774     __ shl(in1,   arrangement, in1,  4);
 9775     __ orr(ind2,  arrangement, in1,  ind2);
 9776     __ ushr(ind2, arrangement, ind2, 2);
 9777 
 9778     __ shl(ind3,  arrangement, in2,  2);
 9779     __ ushr(ind3, arrangement, ind3, 2);
 9780 
 9781     __ tbl(out0,  arrangement, codec,  4, ind0);
 9782     __ tbl(out1,  arrangement, codec,  4, ind1);
 9783     __ tbl(out2,  arrangement, codec,  4, ind2);
 9784     __ tbl(out3,  arrangement, codec,  4, ind3);
 9785 
 9786     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9787   }
 9788 
  /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - src_start
   *  c_rarg1   - src_offset
   *  c_rarg2   - src_end (exclusive; the length processed is src_end - src_offset)
   *  c_rarg3   - dest_start
   *  c_rarg4   - dest_offset
   *  c_rarg5   - isURL
   *
   */
 9801   address generate_base64_encodeBlock() {
 9802 
 9803     static const char toBase64[64] = {
 9804       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9805       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9806       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9807       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9808       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9809     };
 9810 
 9811     static const char toBase64URL[64] = {
 9812       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9813       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9814       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9815       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9816       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9817     };
 9818 
 9819     __ align(CodeEntryAlignment);
 9820     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
 9821     StubCodeMark mark(this, stub_id);
 9822     address start = __ pc();
 9823 
 9824     Register src   = c_rarg0;  // source array
 9825     Register soff  = c_rarg1;  // source start offset
 9826     Register send  = c_rarg2;  // source end offset
 9827     Register dst   = c_rarg3;  // dest array
 9828     Register doff  = c_rarg4;  // position for writing to dest array
 9829     Register isURL = c_rarg5;  // Base64 or URL character set
 9830 
 9831     // c_rarg6 and c_rarg7 are free to use as temps
 9832     Register codec  = c_rarg6;
 9833     Register length = c_rarg7;
 9834 
 9835     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9836 
 9837     __ add(src, src, soff);
 9838     __ add(dst, dst, doff);
 9839     __ sub(length, send, soff);
 9840 
 9841     // load the codec base address
 9842     __ lea(codec, ExternalAddress((address) toBase64));
 9843     __ cbz(isURL, ProcessData);
 9844     __ lea(codec, ExternalAddress((address) toBase64URL));
 9845 
 9846     __ BIND(ProcessData);
 9847 
    // too short to set up a SIMD loop; fall back to scalar 3-byte processing
 9849     __ cmp(length, (u1)24);
 9850     __ br(Assembler::LT, Process3B);
 9851 
 9852     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9853 
 9854     __ BIND(Process48B);
 9855     __ cmp(length, (u1)48);
 9856     __ br(Assembler::LT, Process24B);
 9857     generate_base64_encode_simdround(src, dst, v0, 16);
 9858     __ sub(length, length, 48);
 9859     __ b(Process48B);
 9860 
 9861     __ BIND(Process24B);
 9862     __ cmp(length, (u1)24);
 9863     __ br(Assembler::LT, SIMDExit);
 9864     generate_base64_encode_simdround(src, dst, v0, 8);
 9865     __ sub(length, length, 24);
 9866 
 9867     __ BIND(SIMDExit);
 9868     __ cbz(length, Exit);
 9869 
 9870     __ BIND(Process3B);
 9871     //  3 src bytes, 24 bits
 9872     __ ldrb(r10, __ post(src, 1));
 9873     __ ldrb(r11, __ post(src, 1));
 9874     __ ldrb(r12, __ post(src, 1));
 9875     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9876     __ orrw(r12, r12, r11, Assembler::LSL, 8);
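    // r12 now holds the 24-bit group (b0 << 16) | (b1 << 8) | b2; the ubfm
    // extractions below pull out its four 6-bit codec indices.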
 9877     // codec index
 9878     __ ubfmw(r15, r12, 18, 23);
 9879     __ ubfmw(r14, r12, 12, 17);
 9880     __ ubfmw(r13, r12, 6,  11);
 9881     __ andw(r12,  r12, 63);
 9882     // get the code based on the codec
 9883     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9884     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9885     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9886     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9887     __ strb(r15, __ post(dst, 1));
 9888     __ strb(r14, __ post(dst, 1));
 9889     __ strb(r13, __ post(dst, 1));
 9890     __ strb(r12, __ post(dst, 1));
 9891     __ sub(length, length, 3);
 9892     __ cbnz(length, Process3B);
 9893 
 9894     __ BIND(Exit);
 9895     __ ret(lr);
 9896 
 9897     return start;
 9898   }
 9899 
 9900   void generate_base64_decode_simdround(Register src, Register dst,
 9901         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9902 
 9903     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9904     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9905 
 9906     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9907     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9908 
 9909     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9910 
 9911     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9912 
 9913     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9914 
    // we need an unsigned saturating subtract so that every input value in
    // the range [0, 63] yields index 0 for the higher-half lookup
 9917     __ uqsubv(decH0, __ T16B, in0, v27);
 9918     __ uqsubv(decH1, __ T16B, in1, v27);
 9919     __ uqsubv(decH2, __ T16B, in2, v27);
 9920     __ uqsubv(decH3, __ T16B, in3, v27);
 9921 
 9922     // lower half lookup
 9923     __ tbl(decL0, arrangement, codecL, 4, in0);
 9924     __ tbl(decL1, arrangement, codecL, 4, in1);
 9925     __ tbl(decL2, arrangement, codecL, 4, in2);
 9926     __ tbl(decL3, arrangement, codecL, 4, in3);
 9927 
 9928     // higher half lookup
 9929     __ tbx(decH0, arrangement, codecH, 4, decH0);
 9930     __ tbx(decH1, arrangement, codecH, 4, decH1);
 9931     __ tbx(decH2, arrangement, codecH, 4, decH2);
 9932     __ tbx(decH3, arrangement, codecH, 4, decH3);
 9933 
 9934     // combine lower and higher
 9935     __ orr(decL0, arrangement, decL0, decH0);
 9936     __ orr(decL1, arrangement, decL1, decH1);
 9937     __ orr(decL2, arrangement, decL2, decH2);
 9938     __ orr(decL3, arrangement, decL3, decH3);
 9939 
 9940     // check illegal inputs, value larger than 63 (maximum of 6 bits)
 9941     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 9942     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 9943     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 9944     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 9945     __ orr(in0, arrangement, decH0, decH1);
 9946     __ orr(in1, arrangement, decH2, decH3);
 9947     __ orr(in2, arrangement, in0,   in1);
 9948     __ umaxv(in3, arrangement, in2);
 9949     __ umov(rscratch2, in3, __ B, 0);
 9950 
 9951     // get the data to output
 9952     __ shl(out0,  arrangement, decL0, 2);
 9953     __ ushr(out1, arrangement, decL1, 4);
 9954     __ orr(out0,  arrangement, out0,  out1);
 9955     __ shl(out1,  arrangement, decL1, 4);
 9956     __ ushr(out2, arrangement, decL2, 2);
 9957     __ orr(out1,  arrangement, out1,  out2);
 9958     __ shl(out2,  arrangement, decL2, 6);
 9959     __ orr(out2,  arrangement, out2,  decL3);
 9960 
 9961     __ cbz(rscratch2, NoIllegalData);
 9962 
 9963     // handle illegal input
 9964     __ umov(r10, in2, __ D, 0);
 9965     if (size == 16) {
 9966       __ cbnz(r10, ErrorInLowerHalf);
 9967 
 9968       // illegal input is in higher half, store the lower half now.
 9969       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 9970 
 9971       __ umov(r10, in2,  __ D, 1);
 9972       __ umov(r11, out0, __ D, 1);
 9973       __ umov(r12, out1, __ D, 1);
 9974       __ umov(r13, out2, __ D, 1);
 9975       __ b(StoreLegalData);
 9976 
 9977       __ BIND(ErrorInLowerHalf);
 9978     }
 9979     __ umov(r11, out0, __ D, 0);
 9980     __ umov(r12, out1, __ D, 0);
 9981     __ umov(r13, out2, __ D, 0);
 9982 
 9983     __ BIND(StoreLegalData);
 9984     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
 9985     __ strb(r11, __ post(dst, 1));
 9986     __ strb(r12, __ post(dst, 1));
 9987     __ strb(r13, __ post(dst, 1));
 9988     __ lsr(r10, r10, 8);
 9989     __ lsr(r11, r11, 8);
 9990     __ lsr(r12, r12, 8);
 9991     __ lsr(r13, r13, 8);
 9992     __ b(StoreLegalData);
 9993 
 9994     __ BIND(NoIllegalData);
 9995     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
 9996   }
 9997 
 9998 
  /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - src_start
   *  c_rarg1   - src_offset
   *  c_rarg2   - src_end (exclusive; the length processed is src_end - src_offset)
   *  c_rarg3   - dest_start
   *  c_rarg4   - dest_offset
   *  c_rarg5   - isURL
   *  c_rarg6   - isMIME
   *
   */
10012   address generate_base64_decodeBlock() {
10013 
10014     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10015     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10016     // titled "Base64 decoding".
10017 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array
    // used in java.util.Base64, except that the trailing character '=' is also
    // treated as an illegal value in this intrinsic. That is,
    // java.util.Base64.fromBase64['='] == -2, while fromBase(URL)64ForNoSIMD['='] == 255 here.
10021     static const uint8_t fromBase64ForNoSIMD[256] = {
10022       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10023       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10024       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10025        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10026       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10027        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
10028       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10029        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10030       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10031       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10032       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10033       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10034       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10035       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10036       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10037       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10038     };
10039 
10040     static const uint8_t fromBase64URLForNoSIMD[256] = {
10041       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10042       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10043       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10044        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10045       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10046        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
10047       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10048        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10049       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10050       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10051       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10052       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10053       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10054       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10055       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10056       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10057     };
10058 
    // A legal base64 code value is in the range [0, 127]. We need two lookups
    // with tbl/tbx and combine them to get the decoded data. The 1st table
    // vector lookup uses tbl: out-of-range indices are set to 0 in the
    // destination. The 2nd table vector lookup uses tbx: out-of-range indices
    // leave the destination unchanged. Input [64, 126] is mapped to index
    // [65, 127] in the second lookup. The value at index 64 is set to 0, so
    // that we know we already got the decoded data from the 1st lookup.
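    //
    // Worked example (illustration only): input byte 'a' == 97. The 1st
    // lookup is out of range (97 > 63) and yields 0; the saturating subtract
    // gives 97 - 63 == 34, and the 2nd (tbx) lookup returns
    // fromBase64ForSIMD[64 + 34] == 26, the decoded value of 'a'. For an
    // input <= 63 the subtract saturates to 0 and the 2nd lookup returns the
    // 0 stored at index 64, so the OR leaves the 1st lookup's result intact.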
10066     static const uint8_t fromBase64ForSIMD[128] = {
10067       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10068       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10069       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10070        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10071         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10072        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10073       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10074        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10075     };
10076 
10077     static const uint8_t fromBase64URLForSIMD[128] = {
10078       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10079       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10080       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10081        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10082         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10083        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10084        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10085        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10086     };
10087 
10088     __ align(CodeEntryAlignment);
10089     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10090     StubCodeMark mark(this, stub_id);
10091     address start = __ pc();
10092 
10093     Register src    = c_rarg0;  // source array
10094     Register soff   = c_rarg1;  // source start offset
10095     Register send   = c_rarg2;  // source end offset
10096     Register dst    = c_rarg3;  // dest array
10097     Register doff   = c_rarg4;  // position for writing to dest array
10098     Register isURL  = c_rarg5;  // Base64 or URL character set
10099     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10100 
10101     Register length = send;    // reuse send as length of source data to process
10102 
10103     Register simd_codec   = c_rarg6;
10104     Register nosimd_codec = c_rarg7;
10105 
10106     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10107 
10108     __ enter();
10109 
10110     __ add(src, src, soff);
10111     __ add(dst, dst, doff);
10112 
10113     __ mov(doff, dst);
10114 
10115     __ sub(length, send, soff);
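    // Round length down to a multiple of 4 by clearing its two low bits;
    // base64 input is consumed 4 characters at a time.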
10116     __ bfm(length, zr, 0, 1);
10117 
10118     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10119     __ cbz(isURL, ProcessData);
10120     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10121 
10122     __ BIND(ProcessData);
10123     __ mov(rscratch1, length);
10124     __ cmp(length, (u1)144); // 144 = 80 + 64
10125     __ br(Assembler::LT, Process4B);
10126 
10127     // In the MIME case, the line length cannot be more than 76
10128     // bytes (see RFC 2045). This is too short a block for SIMD
10129     // to be worthwhile, so we use non-SIMD here.
10130     __ movw(rscratch1, 79);
10131 
10132     __ BIND(Process4B);
10133     __ ldrw(r14, __ post(src, 4));
10134     __ ubfxw(r10, r14, 0,  8);
10135     __ ubfxw(r11, r14, 8,  8);
10136     __ ubfxw(r12, r14, 16, 8);
10137     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
10139     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10140     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10141     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10142     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10143     // error detection, 255u indicates an illegal input
10144     __ orrw(r14, r10, r11);
10145     __ orrw(r15, r12, r13);
10146     __ orrw(r14, r14, r15);
10147     __ tbnz(r14, 7, Exit);
10148     // recover the data
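    // The four 6-bit values in r10..r13 (call them a, b, c, d) combine into
    // three output bytes: byte0 = a << 2 | b >> 4, byte1 = (b & 0xf) << 4 | c >> 2,
    // byte2 = (c & 3) << 6 | d. The bfi/bfm/rev16 sequence below assembles
    // byte0 and byte1 in r14 (stored with strh) and byte2 in r13 (strb).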
10149     __ lslw(r14, r10, 10);
10150     __ bfiw(r14, r11, 4, 6);
10151     __ bfmw(r14, r12, 2, 5);
10152     __ rev16w(r14, r14);
10153     __ bfiw(r13, r12, 6, 2);
10154     __ strh(r14, __ post(dst, 2));
10155     __ strb(r13, __ post(dst, 1));
10156     // non-simd loop
10157     __ subsw(rscratch1, rscratch1, 4);
10158     __ br(Assembler::GT, Process4B);
10159 
    // If we got here from the 80-byte (MIME) pre-processing pass above,
    // rscratch1 == -1; otherwise, rscratch1 == 0.
10162     __ cbzw(rscratch1, Exit);
10163     __ sub(length, length, 80);
10164 
10165     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10166     __ cbz(isURL, SIMDEnter);
10167     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10168 
10169     __ BIND(SIMDEnter);
10170     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10171     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10172     __ mov(rscratch1, 63);
10173     __ dup(v27, __ T16B, rscratch1);
10174 
10175     __ BIND(Process64B);
10176     __ cmp(length, (u1)64);
10177     __ br(Assembler::LT, Process32B);
10178     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10179     __ sub(length, length, 64);
10180     __ b(Process64B);
10181 
10182     __ BIND(Process32B);
10183     __ cmp(length, (u1)32);
10184     __ br(Assembler::LT, SIMDExit);
10185     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10186     __ sub(length, length, 32);
10187     __ b(Process32B);
10188 
10189     __ BIND(SIMDExit);
10190     __ cbz(length, Exit);
10191     __ movw(rscratch1, length);
10192     __ b(Process4B);
10193 
10194     __ BIND(Exit);
10195     __ sub(c_rarg0, dst, doff);
10196 
10197     __ leave();
10198     __ ret(lr);
10199 
10200     return start;
10201   }
10202 
10203   // Support for spin waits.
10204   address generate_spin_wait() {
10205     __ align(CodeEntryAlignment);
10206     StubId stub_id = StubId::stubgen_spin_wait_id;
10207     StubCodeMark mark(this, stub_id);
10208     address start = __ pc();
10209 
10210     __ spin_wait();
10211     __ ret(lr);
10212 
10213     return start;
10214   }
10215 
10216   void generate_lookup_secondary_supers_table_stub() {
10217     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10218     StubCodeMark mark(this, stub_id);
10219 
10220     const Register
10221       r_super_klass  = r0,
10222       r_array_base   = r1,
10223       r_array_length = r2,
10224       r_array_index  = r3,
10225       r_sub_klass    = r4,
10226       r_bitmap       = rscratch2,
10227       result         = r5;
10228     const FloatRegister
10229       vtemp          = v0;
10230 
10231     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10232       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10233       Label L_success;
10234       __ enter();
10235       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10236                                              r_array_base, r_array_length, r_array_index,
10237                                              vtemp, result, slot,
10238                                              /*stub_is_near*/true);
10239       __ leave();
10240       __ ret(lr);
10241     }
10242   }
10243 
10244   // Slow path implementation for UseSecondarySupersTable.
10245   address generate_lookup_secondary_supers_table_slow_path_stub() {
10246     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10247     StubCodeMark mark(this, stub_id);
10248 
10249     address start = __ pc();
10250     const Register
10251       r_super_klass  = r0,        // argument
10252       r_array_base   = r1,        // argument
10253       temp1          = r2,        // temp
10254       r_array_index  = r3,        // argument
10255       r_bitmap       = rscratch2, // argument
10256       result         = r5;        // argument
10257 
10258     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10259     __ ret(lr);
10260 
10261     return start;
10262   }
10263 
10264 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10265 
10266   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10267   //
10268   // If LSE is in use, generate LSE versions of all the stubs. The
10269   // non-LSE versions are in atomic_aarch64.S.
10270 
10271   // class AtomicStubMark records the entry point of a stub and the
10272   // stub pointer which will point to it. The stub pointer is set to
10273   // the entry point when ~AtomicStubMark() is called, which must be
10274   // after ICache::invalidate_range. This ensures safe publication of
10275   // the generated code.
10276   class AtomicStubMark {
10277     address _entry_point;
10278     aarch64_atomic_stub_t *_stub;
10279     MacroAssembler *_masm;
10280   public:
10281     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10282       _masm = masm;
10283       __ align(32);
10284       _entry_point = __ pc();
10285       _stub = stub;
10286     }
10287     ~AtomicStubMark() {
10288       *_stub = (aarch64_atomic_stub_t)_entry_point;
10289     }
10290   };
10291 
10292   // NB: For memory_order_conservative we need a trailing membar after
10293   // LSE atomic operations but not a leading membar.
10294   //
10295   // We don't need a leading membar because a clause in the Arm ARM
10296   // says:
10297   //
10298   //   Barrier-ordered-before
10299   //
  //   Barrier instructions order prior Memory effects before subsequent
  //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
  //   instruction with both Acquire and Release semantics.
10306   //
10307   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10308   // and Release semantics, therefore we don't need a leading
10309   // barrier. However, there is no corresponding Barrier-ordered-after
10310   // relationship, therefore we need a trailing membar to prevent a
10311   // later store or load from being reordered with the store in an
10312   // atomic instruction.
10313   //
10314   // This was checked by using the herd7 consistency model simulator
10315   // (http://diy.inria.fr/) with this test case:
10316   //
10317   // AArch64 LseCas
10318   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10319   // P0 | P1;
10320   // LDR W4, [X2] | MOV W3, #0;
10321   // DMB LD       | MOV W4, #1;
10322   // LDR W3, [X1] | CASAL W3, W4, [X1];
10323   //              | DMB ISH;
10324   //              | STR W4, [X2];
10325   // exists
10326   // (0:X3=0 /\ 0:X4=1)
10327   //
10328   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10329   // with the store to x in P1. Without the DMB in P1 this may happen.
10330   //
10331   // At the time of writing we don't know of any AArch64 hardware that
10332   // reorders stores in this way, but the Reference Manual permits it.
10333 
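  // For reference only (not generated code): each CAS stub emitted by
  // gen_cas_entry below implements the usual compare-and-swap contract and
  // returns the value previously held in memory, i.e. roughly:
  //
  //   uint64_t cmpxchg(volatile uint64_t* ptr, uint64_t compare_val,
  //                    uint64_t exchange_val) {
  //     uint64_t prev = *ptr;              // done atomically by CAS/CASAL
  //     if (prev == compare_val) {
  //       *ptr = exchange_val;
  //     }
  //     return prev;
  //   }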
10334   void gen_cas_entry(Assembler::operand_size size,
10335                      atomic_memory_order order) {
10336     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10337       exchange_val = c_rarg2;
10338     bool acquire, release;
10339     switch (order) {
10340       case memory_order_relaxed:
10341         acquire = false;
10342         release = false;
10343         break;
10344       case memory_order_release:
10345         acquire = false;
10346         release = true;
10347         break;
10348       default:
10349         acquire = true;
10350         release = true;
10351         break;
10352     }
10353     __ mov(prev, compare_val);
10354     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10355     if (order == memory_order_conservative) {
10356       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10357     }
10358     if (size == Assembler::xword) {
10359       __ mov(r0, prev);
10360     } else {
10361       __ movw(r0, prev);
10362     }
10363     __ ret(lr);
10364   }
10365 
10366   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10367     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10368     // If not relaxed, then default to conservative.  Relaxed is the only
10369     // case we use enough to be worth specializing.
10370     if (order == memory_order_relaxed) {
10371       __ ldadd(size, incr, prev, addr);
10372     } else {
10373       __ ldaddal(size, incr, prev, addr);
10374       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10375     }
10376     if (size == Assembler::xword) {
10377       __ mov(r0, prev);
10378     } else {
10379       __ movw(r0, prev);
10380     }
10381     __ ret(lr);
10382   }
10383 
10384   void gen_swpal_entry(Assembler::operand_size size) {
10385     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10386     __ swpal(size, incr, prev, addr);
10387     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10388     if (size == Assembler::xword) {
10389       __ mov(r0, prev);
10390     } else {
10391       __ movw(r0, prev);
10392     }
10393     __ ret(lr);
10394   }
10395 
10396   void generate_atomic_entry_points() {
10397     if (! UseLSE) {
10398       return;
10399     }
10400     __ align(CodeEntryAlignment);
10401     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10402     StubCodeMark mark(this, stub_id);
10403     address first_entry = __ pc();
10404 
10405     // ADD, memory_order_conservative
10406     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10407     gen_ldadd_entry(Assembler::word, memory_order_conservative);
10408     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10409     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10410 
10411     // ADD, memory_order_relaxed
10412     AtomicStubMark mark_fetch_add_4_relaxed
10413       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10414     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10415     AtomicStubMark mark_fetch_add_8_relaxed
10416       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10417     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10418 
10419     // XCHG, memory_order_conservative
10420     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10421     gen_swpal_entry(Assembler::word);
10422     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10423     gen_swpal_entry(Assembler::xword);
10424 
10425     // CAS, memory_order_conservative
10426     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10427     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10428     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10429     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10430     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10431     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10432 
10433     // CAS, memory_order_relaxed
10434     AtomicStubMark mark_cmpxchg_1_relaxed
10435       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10436     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10437     AtomicStubMark mark_cmpxchg_4_relaxed
10438       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10439     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10440     AtomicStubMark mark_cmpxchg_8_relaxed
10441       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10442     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10443 
10444     AtomicStubMark mark_cmpxchg_4_release
10445       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10446     gen_cas_entry(MacroAssembler::word, memory_order_release);
10447     AtomicStubMark mark_cmpxchg_8_release
10448       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10449     gen_cas_entry(MacroAssembler::xword, memory_order_release);
10450 
10451     AtomicStubMark mark_cmpxchg_4_seq_cst
10452       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10453     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10454     AtomicStubMark mark_cmpxchg_8_seq_cst
10455       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10456     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10457 
10458     ICache::invalidate_range(first_entry, __ pc() - first_entry);
10459   }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
10461 
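  // Continuation thaw support. Thawing (roughly) copies frames that were
  // previously frozen to the heap back onto the platform thread's stack and
  // then returns into the topmost thawed frame.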
10462   address generate_cont_thaw(Continuation::thaw_kind kind) {
10463     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10464     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10465 
10466     address start = __ pc();
10467 
10468     if (return_barrier) {
10469       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10470       __ mov(sp, rscratch1);
10471     }
10472     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10473 
10474     if (return_barrier) {
10475       // preserve possible return value from a method returning to the return barrier
10476       __ fmovd(rscratch1, v0);
10477       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10478     }
10479 
10480     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10481     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10482     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10483 
10484     if (return_barrier) {
10485       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10486       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10487       __ fmovd(v0, rscratch1);
10488     }
10489     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10490 
10491 
10492     Label thaw_success;
10493     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10494     __ cbnz(rscratch2, thaw_success);
10495     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10496     __ br(rscratch1);
10497     __ bind(thaw_success);
10498 
10499     // make room for the thawed frames
10500     __ sub(rscratch1, sp, rscratch2);
10501     __ andr(rscratch1, rscratch1, -16); // align
10502     __ mov(sp, rscratch1);
10503 
10504     if (return_barrier) {
10505       // save original return value -- again
10506       __ fmovd(rscratch1, v0);
10507       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10508     }
10509 
10510     // If we want, we can templatize thaw by kind, and have three different entries
10511     __ movw(c_rarg1, (uint32_t)kind);
10512 
10513     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10514     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10515 
10516     if (return_barrier) {
10517       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10518       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10519       __ fmovd(v0, rscratch1);
10520     } else {
10521       __ mov(r0, zr); // return 0 (success) from doYield
10522     }
10523 
    // We're now on the yield frame (which is at an address above us because
    // sp has been pushed down).
10525     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10526     __ mov(rfp, sp);
10527 
10528     if (return_barrier_exception) {
10529       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10530       __ authenticate_return_address(c_rarg1);
10531       __ verify_oop(r0);
10532       // save return value containing the exception oop in callee-saved R19
10533       __ mov(r19, r0);
10534 
10535       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10536 
10537       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10538       // __ reinitialize_ptrue();
10539 
10540       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10541 
10542       __ mov(r1, r0); // the exception handler
10543       __ mov(r0, r19); // restore return value containing the exception oop
10544       __ verify_oop(r0);
10545 
10546       __ leave();
10547       __ mov(r3, lr);
10548       __ br(r1); // the exception handler
10549     } else {
10550       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10551       __ leave();
10552       __ ret(lr);
10553     }
10554 
10555     return start;
10556   }
10557 
10558   address generate_cont_thaw() {
10559     if (!Continuations::enabled()) return nullptr;
10560 
10561     StubId stub_id = StubId::stubgen_cont_thaw_id;
10562     StubCodeMark mark(this, stub_id);
10563     address start = __ pc();
10564     generate_cont_thaw(Continuation::thaw_top);
10565     return start;
10566   }
10567 
10568   address generate_cont_returnBarrier() {
10569     if (!Continuations::enabled()) return nullptr;
10570 
10571     // TODO: will probably need multiple return barriers depending on return type
10572     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10573     StubCodeMark mark(this, stub_id);
10574     address start = __ pc();
10575 
10576     generate_cont_thaw(Continuation::thaw_return_barrier);
10577 
10578     return start;
10579   }
10580 
10581   address generate_cont_returnBarrier_exception() {
10582     if (!Continuations::enabled()) return nullptr;
10583 
10584     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10585     StubCodeMark mark(this, stub_id);
10586     address start = __ pc();
10587 
10588     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10589 
10590     return start;
10591   }
10592 
10593   address generate_cont_preempt_stub() {
10594     if (!Continuations::enabled()) return nullptr;
10595     StubId stub_id = StubId::stubgen_cont_preempt_id;
10596     StubCodeMark mark(this, stub_id);
10597     address start = __ pc();
10598 
10599     __ reset_last_Java_frame(true);
10600 
10601     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10602     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10603     __ mov(sp, rscratch2);
10604 
10605     Label preemption_cancelled;
10606     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10607     __ cbnz(rscratch1, preemption_cancelled);
10608 
10609     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10610     SharedRuntime::continuation_enter_cleanup(_masm);
10611     __ leave();
10612     __ ret(lr);
10613 
10614     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10615     __ bind(preemption_cancelled);
10616     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10617     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10618     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10619     __ ldr(rscratch1, Address(rscratch1));
10620     __ br(rscratch1);
10621 
10622     return start;
10623   }
10624 
10625   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10626   // are represented as long[5], with BITS_PER_LIMB = 26.
10627   // Pack five 26-bit limbs into three 64-bit registers.
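  //
  // Resulting layout, assuming each limb holds at most 26 significant bits:
  //   dest0 = limb0 | limb1 << 26 | (low 12 bits of limb2) << 52
  //   dest1 = limb2 >> 12 | limb3 << 14 | (low 24 bits of limb4) << 40
  //   dest2 = limb4 >> 24   (at most 2 bits)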
10628   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10629     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10630     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10631     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10632     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10633 
10634     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10635     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10636     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10637     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10638 
10639     if (dest2->is_valid()) {
10640       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10641     } else {
10642 #ifdef ASSERT
10643       Label OK;
10644       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10645       __ br(__ EQ, OK);
10646       __ stop("high bits of Poly1305 integer should be zero");
10647       __ should_not_reach_here();
10648       __ bind(OK);
10649 #endif
10650     }
10651   }
10652 
10653   // As above, but return only a 128-bit integer, packed into two
10654   // 64-bit registers.
10655   void pack_26(Register dest0, Register dest1, Register src) {
10656     pack_26(dest0, dest1, noreg, src);
10657   }
10658 
10659   // Multiply and multiply-accumulate unsigned 64-bit registers.
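  // wide_mul computes the full 128-bit product prod_hi:prod_lo = n * m;
  // wide_madd adds the 128-bit product of n and m into sum_hi:sum_lo,
  // propagating the carry with adds/adc.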
10660   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10661     __ mul(prod_lo, n, m);
10662     __ umulh(prod_hi, n, m);
10663   }
10664   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10665     wide_mul(rscratch1, rscratch2, n, m);
10666     __ adds(sum_lo, sum_lo, rscratch1);
10667     __ adc(sum_hi, sum_hi, rscratch2);
10668   }
10669 
10670   // Poly1305, RFC 7539
10671 
10672   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10673   // description of the tricks used to simplify and accelerate this
10674   // computation.
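  //
  // In outline (illustration only), with p = 2^130 - 5, each 16-byte block m
  // updates the accumulator as
  //
  //   acc = ((acc + m + 2^128) * r) mod p
  //
  // The partial reduction below relies on the identity
  // c * 2^130 == 5 * c (mod p), which is why the precomputed values
  // RR_n = (R_n >> 2) * 5 appear in the multiply-accumulate steps.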
10675 
10676   address generate_poly1305_processBlocks() {
10677     __ align(CodeEntryAlignment);
10678     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10679     StubCodeMark mark(this, stub_id);
10680     address start = __ pc();
10681     Label here;
10682     __ enter();
10683     RegSet callee_saved = RegSet::range(r19, r28);
10684     __ push(callee_saved, sp);
10685 
10686     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10687 
10688     // Arguments
10689     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10690 
10691     // R_n is the 128-bit randomly-generated key, packed into two
10692     // registers.  The caller passes this key to us as long[5], with
10693     // BITS_PER_LIMB = 26.
10694     const Register R_0 = *++regs, R_1 = *++regs;
10695     pack_26(R_0, R_1, r_start);
10696 
10697     // RR_n is (R_n >> 2) * 5
10698     const Register RR_0 = *++regs, RR_1 = *++regs;
10699     __ lsr(RR_0, R_0, 2);
10700     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10701     __ lsr(RR_1, R_1, 2);
10702     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10703 
10704     // U_n is the current checksum
10705     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10706     pack_26(U_0, U_1, U_2, acc_start);
10707 
10708     static constexpr int BLOCK_LENGTH = 16;
10709     Label DONE, LOOP;
10710 
10711     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10712     __ br(Assembler::LT, DONE); {
10713       __ bind(LOOP);
10714 
10715       // S_n is to be the sum of U_n and the next block of data
10716       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10717       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10718       __ adds(S_0, U_0, S_0);
10719       __ adcs(S_1, U_1, S_1);
10720       __ adc(S_2, U_2, zr);
10721       __ add(S_2, S_2, 1);
10722 
10723       const Register U_0HI = *++regs, U_1HI = *++regs;
10724 
10725       // NB: this logic depends on some of the special properties of
10726       // Poly1305 keys. In particular, because we know that the top
10727       // four bits of R_0 and R_1 are zero, we can add together
10728       // partial products without any risk of needing to propagate a
10729       // carry out.
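      //
      // Roughly: with S = S_2:S_1:S_0 and R = R_1:R_0, any partial
      // product of weight 2^128 or more wraps around, because
      // 2^130 == 5 (mod 2^130 - 5).  Clamping clears the low two bits
      // of R_1, so S_1*R_1 and S_2*R_1 fold into U_0 and U_1 via
      // RR_1; S_2*R_0 folds into U_0 via RR_0, and the leftover
      // S_2*(R_0 & 3) lands at weight 2^128 in U_2.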
10730       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10731       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10732       __ andr(U_2, R_0, 3);
10733       __ mul(U_2, S_2, U_2);
10734 
10735       // Recycle registers S_0, S_1, S_2
10736       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10737 
10738       // Partial reduction mod 2**130 - 5
10739       __ adds(U_1, U_0HI, U_1);
10740       __ adc(U_2, U_1HI, U_2);
10741       // Sum now in U_2:U_1:U_0.
10742       // Dead: U_0HI, U_1HI.
10743       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10744 
10745       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
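      // (because U_2 >> 2 sits at weight 2^130, and 2^130 == 5 modulo 2^130 - 5)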
10746 
10747       // First, U_2:U_1:U_0 += (U_2 >> 2)
10748       __ lsr(rscratch1, U_2, 2);
10749       __ andr(U_2, U_2, (u8)3);
10750       __ adds(U_0, U_0, rscratch1);
10751       __ adcs(U_1, U_1, zr);
10752       __ adc(U_2, U_2, zr);
10753       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10754       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10755       __ adcs(U_1, U_1, zr);
10756       __ adc(U_2, U_2, zr);
10757 
10758       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10759       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10760       __ br(~ Assembler::LT, LOOP);
10761     }
10762 
10763     // Further reduce modulo 2^130 - 5
10764     __ lsr(rscratch1, U_2, 2);
    __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = (U_2 >> 2) * 5
    __ adds(U_0, U_0, rscratch1); // U_0 += (U_2 >> 2) * 5
10767     __ adcs(U_1, U_1, zr);
10768     __ andr(U_2, U_2, (u1)3);
10769     __ adc(U_2, U_2, zr);
10770 
10771     // Unpack the sum into five 26-bit limbs and write to memory.
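    // In C, approximately (M26 = (1 << 26) - 1):
    //
    //   acc[0] =  U_0        & M26;
    //   acc[1] = (U_0 >> 26) & M26;
    //   acc[2] = (U_0 >> 52) | ((U_1 & 0x3fff) << 12);
    //   acc[3] = (U_1 >> 14) & M26;
    //   acc[4] = (U_1 >> 40) | ((U_2 & 7)      << 24);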
10772     __ ubfiz(rscratch1, U_0, 0, 26);
10773     __ ubfx(rscratch2, U_0, 26, 26);
10774     __ stp(rscratch1, rscratch2, Address(acc_start));
10775     __ ubfx(rscratch1, U_0, 52, 12);
10776     __ bfi(rscratch1, U_1, 12, 14);
10777     __ ubfx(rscratch2, U_1, 14, 26);
10778     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10779     __ ubfx(rscratch1, U_1, 40, 24);
10780     __ bfi(rscratch1, U_2, 24, 3);
10781     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10782 
10783     __ bind(DONE);
10784     __ pop(callee_saved, sp);
10785     __ leave();
10786     __ ret(lr);
10787 
10788     return start;
10789   }
10790 
10791   // exception handler for upcall stubs
10792   address generate_upcall_stub_exception_handler() {
10793     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10794     StubCodeMark mark(this, stub_id);
10795     address start = __ pc();
10796 
    // The native caller has no way of handling exceptions,
    // so we just crash here. It is up to the callee to catch exceptions.
10799     __ verify_oop(r0);
10800     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10801     __ blr(rscratch1);
10802     __ should_not_reach_here();
10803 
10804     return start;
10805   }
10806 
10807   // load Method* target of MethodHandle
10808   // j_rarg0 = jobject receiver
10809   // rmethod = result
10810   address generate_upcall_stub_load_target() {
10811     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10812     StubCodeMark mark(this, stub_id);
10813     address start = __ pc();
10814 
10815     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
    // Load target method from receiver
10817     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10818     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10819     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10820     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10821                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10822                       noreg, noreg);
10823     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10824 
10825     __ ret(lr);
10826 
10827     return start;
10828   }
10829 
10830 #undef __
10831 #define __ masm->
10832 
10833   class MontgomeryMultiplyGenerator : public MacroAssembler {
10834 
10835     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10836       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10837 
10838     RegSet _toSave;
10839     bool _squaring;
10840 
10841   public:
10842     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10843       : MacroAssembler(as->code()), _squaring(squaring) {
10844 
10845       // Register allocation
10846 
10847       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10848       Pa_base = *regs;       // Argument registers
10849       if (squaring)
10850         Pb_base = Pa_base;
10851       else
10852         Pb_base = *++regs;
10853       Pn_base = *++regs;
      Rlen = *++regs;
10855       inv = *++regs;
10856       Pm_base = *++regs;
10857 
10858                           // Working registers:
10859       Ra =  *++regs;        // The current digit of a, b, n, and m.
10860       Rb =  *++regs;
10861       Rm =  *++regs;
10862       Rn =  *++regs;
10863 
10864       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10865       Pb =  *++regs;
10866       Pm =  *++regs;
10867       Pn =  *++regs;
10868 
10869       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
10871       t2 =  *++regs;
10872 
10873       Ri =  *++regs;        // Inner and outer loop indexes.
10874       Rj =  *++regs;
10875 
10876       Rhi_ab = *++regs;     // Product registers: low and high parts
10877       Rlo_ab = *++regs;     // of a*b and m*n.
10878       Rhi_mn = *++regs;
10879       Rlo_mn = *++regs;
10880 
10881       // r19 and up are callee-saved.
10882       _toSave = RegSet::range(r19, *regs) + Pm_base;
10883     }
10884 
10885   private:
10886     void save_regs() {
10887       push(_toSave, sp);
10888     }
10889 
10890     void restore_regs() {
10891       pop(_toSave, sp);
10892     }
10893 
10894     template <typename T>
10895     void unroll_2(Register count, T block) {
10896       Label loop, end, odd;
10897       tbnz(count, 0, odd);
10898       cbz(count, end);
10899       align(16);
10900       bind(loop);
10901       (this->*block)();
10902       bind(odd);
10903       (this->*block)();
10904       subs(count, count, 2);
10905       br(Assembler::GT, loop);
10906       bind(end);
10907     }
10908 
10909     template <typename T>
10910     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10911       Label loop, end, odd;
10912       tbnz(count, 0, odd);
10913       cbz(count, end);
10914       align(16);
10915       bind(loop);
10916       (this->*block)(d, s, tmp);
10917       bind(odd);
10918       (this->*block)(d, s, tmp);
10919       subs(count, count, 2);
10920       br(Assembler::GT, loop);
10921       bind(end);
10922     }
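
    // Both variants of unroll_2 emit the block twice per loop
    // iteration; an odd count enters the loop at the second copy.
    // In C, approximately (count treated as signed):
    //
    //   if (count & 1) goto odd;
    //   if (count == 0) goto end;
    //   loop: block();
    //   odd:  block(); count -= 2;
    //   if (count > 0) goto loop;
    //   end: ;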
10923 
10924     void pre1(RegisterOrConstant i) {
10925       block_comment("pre1");
10926       // Pa = Pa_base;
10927       // Pb = Pb_base + i;
10928       // Pm = Pm_base;
10929       // Pn = Pn_base + i;
10930       // Ra = *Pa;
10931       // Rb = *Pb;
10932       // Rm = *Pm;
10933       // Rn = *Pn;
10934       ldr(Ra, Address(Pa_base));
10935       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10936       ldr(Rm, Address(Pm_base));
10937       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10938       lea(Pa, Address(Pa_base));
10939       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10940       lea(Pm, Address(Pm_base));
10941       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10942 
10943       // Zero the m*n result.
10944       mov(Rhi_mn, zr);
10945       mov(Rlo_mn, zr);
10946     }
10947 
10948     // The core multiply-accumulate step of a Montgomery
10949     // multiplication.  The idea is to schedule operations as a
10950     // pipeline so that instructions with long latencies (loads and
10951     // multiplies) have time to complete before their results are
    // used.  This helps most on in-order implementations of the
    // architecture, but out-of-order ones benefit too.
10954     void step() {
10955       block_comment("step");
10956       // MACC(Ra, Rb, t0, t1, t2);
10957       // Ra = *++Pa;
10958       // Rb = *--Pb;
10959       umulh(Rhi_ab, Ra, Rb);
10960       mul(Rlo_ab, Ra, Rb);
10961       ldr(Ra, pre(Pa, wordSize));
10962       ldr(Rb, pre(Pb, -wordSize));
10963       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
10964                                        // previous iteration.
10965       // MACC(Rm, Rn, t0, t1, t2);
10966       // Rm = *++Pm;
10967       // Rn = *--Pn;
10968       umulh(Rhi_mn, Rm, Rn);
10969       mul(Rlo_mn, Rm, Rn);
10970       ldr(Rm, pre(Pm, wordSize));
10971       ldr(Rn, pre(Pn, -wordSize));
10972       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10973     }
10974 
10975     void post1() {
10976       block_comment("post1");
10977 
10978       // MACC(Ra, Rb, t0, t1, t2);
10979       // Ra = *++Pa;
10980       // Rb = *--Pb;
10981       umulh(Rhi_ab, Ra, Rb);
10982       mul(Rlo_ab, Ra, Rb);
10983       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10984       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10985 
10986       // *Pm = Rm = t0 * inv;
10987       mul(Rm, t0, inv);
10988       str(Rm, Address(Pm));
10989 
10990       // MACC(Rm, Rn, t0, t1, t2);
10991       // t0 = t1; t1 = t2; t2 = 0;
10992       umulh(Rhi_mn, Rm, Rn);
10993 
10994 #ifndef PRODUCT
10995       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
10996       {
10997         mul(Rlo_mn, Rm, Rn);
10998         add(Rlo_mn, t0, Rlo_mn);
10999         Label ok;
11000         cbz(Rlo_mn, ok); {
11001           stop("broken Montgomery multiply");
11002         } bind(ok);
11003       }
11004 #endif
11005       // We have very carefully set things up so that
11006       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11007       // the lower half of Rm * Rn because we know the result already:
11008       // it must be -t0.  t0 + (-t0) must generate a carry iff
11009       // t0 != 0.  So, rather than do a mul and an adds we just set
11010       // the carry flag iff t0 is nonzero.
11011       //
11012       // mul(Rlo_mn, Rm, Rn);
11013       // adds(zr, t0, Rlo_mn);
11014       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11015       adcs(t0, t1, Rhi_mn);
11016       adc(t1, t2, zr);
11017       mov(t2, zr);
11018     }
11019 
11020     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11021       block_comment("pre2");
11022       // Pa = Pa_base + i-len;
11023       // Pb = Pb_base + len;
11024       // Pm = Pm_base + i-len;
11025       // Pn = Pn_base + len;
11026 
11027       if (i.is_register()) {
11028         sub(Rj, i.as_register(), len);
11029       } else {
11030         mov(Rj, i.as_constant());
11031         sub(Rj, Rj, len);
11032       }
11033       // Rj == i-len
11034 
11035       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11036       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11037       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11038       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11039 
11040       // Ra = *++Pa;
11041       // Rb = *--Pb;
11042       // Rm = *++Pm;
11043       // Rn = *--Pn;
11044       ldr(Ra, pre(Pa, wordSize));
11045       ldr(Rb, pre(Pb, -wordSize));
11046       ldr(Rm, pre(Pm, wordSize));
11047       ldr(Rn, pre(Pn, -wordSize));
11048 
11049       mov(Rhi_mn, zr);
11050       mov(Rlo_mn, zr);
11051     }
11052 
11053     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11054       block_comment("post2");
11055       if (i.is_constant()) {
11056         mov(Rj, i.as_constant()-len.as_constant());
11057       } else {
11058         sub(Rj, i.as_register(), len);
11059       }
11060 
11061       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11062 
11063       // As soon as we know the least significant digit of our result,
11064       // store it.
11065       // Pm_base[i-len] = t0;
11066       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11067 
11068       // t0 = t1; t1 = t2; t2 = 0;
11069       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11070       adc(t1, t2, zr);
11071       mov(t2, zr);
11072     }
11073 
11074     // A carry in t0 after Montgomery multiplication means that we
11075     // should subtract multiples of n from our result in m.  We'll
11076     // keep doing that until there is no carry.
11077     void normalize(RegisterOrConstant len) {
11078       block_comment("normalize");
11079       // while (t0)
11080       //   t0 = sub(Pm_base, Pn_base, t0, len);
11081       Label loop, post, again;
11082       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11083       cbz(t0, post); {
11084         bind(again); {
11085           mov(i, zr);
11086           mov(cnt, len);
11087           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11088           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11089           subs(zr, zr, zr); // set carry flag, i.e. no borrow
11090           align(16);
11091           bind(loop); {
11092             sbcs(Rm, Rm, Rn);
11093             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11094             add(i, i, 1);
11095             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11096             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11097             sub(cnt, cnt, 1);
11098           } cbnz(cnt, loop);
11099           sbc(t0, t0, zr);
11100         } cbnz(t0, again);
11101       } bind(post);
11102     }
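
    // The sub() referred to above is, approximately, a multi-word
    // subtract of n from m that propagates the borrow into t0:
    //
    //   julong sub(julong Pm_base[], julong Pn_base[], julong t0, int len) {
    //     julong borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       julong n = Pn_base[i] + borrow;
    //       julong b = (n < borrow) | (Pm_base[i] < n);
    //       Pm_base[i] -= n;
    //       borrow = b;
    //     }
    //     return t0 - borrow;
    //   }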
11103 
11104     // Move memory at s to d, reversing words.
11105     //    Increments d to end of copied memory
11106     //    Destroys tmp1, tmp2
11107     //    Preserves len
11108     //    Leaves s pointing to the address which was in d at start
11109     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11110       assert(tmp1->encoding() < r19->encoding(), "register corruption");
11111       assert(tmp2->encoding() < r19->encoding(), "register corruption");
11112 
11113       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11114       mov(tmp1, len);
11115       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11116       sub(s, d, len, ext::uxtw, LogBytesPerWord);
11117     }
11118     // where
11119     void reverse1(Register d, Register s, Register tmp) {
11120       ldr(tmp, pre(s, -wordSize));
11121       ror(tmp, tmp, 32);
11122       str(tmp, post(d, wordSize));
11123     }
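
    // Viewed as an array of 32-bit words, reverse() is approximately:
    //
    //   juint *from = (juint *)s_original, *to = (juint *)d_original;
    //   for (int i = 0; i < 2 * len; i++)
    //     to[i] = from[2 * len - 1 - i];
    //
    // (plus the pointer side effects described above).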
11124 
11125     void step_squaring() {
11126       // An extra ACC
11127       step();
11128       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11129     }
11130 
11131     void last_squaring(RegisterOrConstant i) {
11132       Label dont;
11133       // if ((i & 1) == 0) {
11134       tbnz(i.as_register(), 0, dont); {
11135         // MACC(Ra, Rb, t0, t1, t2);
11136         // Ra = *++Pa;
11137         // Rb = *--Pb;
11138         umulh(Rhi_ab, Ra, Rb);
11139         mul(Rlo_ab, Ra, Rb);
11140         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11141       } bind(dont);
11142     }
11143 
11144     void extra_step_squaring() {
11145       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11146 
11147       // MACC(Rm, Rn, t0, t1, t2);
11148       // Rm = *++Pm;
11149       // Rn = *--Pn;
11150       umulh(Rhi_mn, Rm, Rn);
11151       mul(Rlo_mn, Rm, Rn);
11152       ldr(Rm, pre(Pm, wordSize));
11153       ldr(Rn, pre(Pn, -wordSize));
11154     }
11155 
11156     void post1_squaring() {
11157       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11158 
11159       // *Pm = Rm = t0 * inv;
11160       mul(Rm, t0, inv);
11161       str(Rm, Address(Pm));
11162 
11163       // MACC(Rm, Rn, t0, t1, t2);
11164       // t0 = t1; t1 = t2; t2 = 0;
11165       umulh(Rhi_mn, Rm, Rn);
11166 
11167 #ifndef PRODUCT
11168       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11169       {
11170         mul(Rlo_mn, Rm, Rn);
11171         add(Rlo_mn, t0, Rlo_mn);
11172         Label ok;
11173         cbz(Rlo_mn, ok); {
11174           stop("broken Montgomery multiply");
11175         } bind(ok);
11176       }
11177 #endif
11178       // We have very carefully set things up so that
11179       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11180       // the lower half of Rm * Rn because we know the result already:
11181       // it must be -t0.  t0 + (-t0) must generate a carry iff
11182       // t0 != 0.  So, rather than do a mul and an adds we just set
11183       // the carry flag iff t0 is nonzero.
11184       //
11185       // mul(Rlo_mn, Rm, Rn);
11186       // adds(zr, t0, Rlo_mn);
11187       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11188       adcs(t0, t1, Rhi_mn);
11189       adc(t1, t2, zr);
11190       mov(t2, zr);
11191     }
11192 
11193     void acc(Register Rhi, Register Rlo,
11194              Register t0, Register t1, Register t2) {
11195       adds(t0, t0, Rlo);
11196       adcs(t1, t1, Rhi);
11197       adc(t2, t2, zr);
11198     }
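
    // The MACC(A, B, t0, t1, t2) used in the comments above adds the
    // full 128-bit product A*B into the triple-precision accumulator
    // t2:t1:t0.  In C, approximately:
    //
    //   unsigned __int128 p = (unsigned __int128)A * B;
    //   unsigned __int128 s = ((unsigned __int128)t1 << 64 | t0) + p;
    //   t2 += (s < p);              // carry out of the low 128 bits
    //   t0 = (julong)s;
    //   t1 = (julong)(s >> 64);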
11199 
11200   public:
11201     /**
11202      * Fast Montgomery multiplication.  The derivation of the
11203      * algorithm is in A Cryptographic Library for the Motorola
11204      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11205      *
11206      * Arguments:
11207      *
11208      * Inputs for multiplication:
11209      *   c_rarg0   - int array elements a
11210      *   c_rarg1   - int array elements b
11211      *   c_rarg2   - int array elements n (the modulus)
11212      *   c_rarg3   - int length
11213      *   c_rarg4   - int inv
11214      *   c_rarg5   - int array elements m (the result)
11215      *
11216      * Inputs for squaring:
11217      *   c_rarg0   - int array elements a
11218      *   c_rarg1   - int array elements n (the modulus)
11219      *   c_rarg2   - int length
11220      *   c_rarg3   - int inv
11221      *   c_rarg4   - int array elements m (the result)
11222      *
11223      */
11224     address generate_multiply() {
11225       Label argh, nothing;
11226       bind(argh);
11227       stop("MontgomeryMultiply total_allocation must be <= 8192");
11228 
11229       align(CodeEntryAlignment);
11230       address entry = pc();
11231 
11232       cbzw(Rlen, nothing);
11233 
11234       enter();
11235 
11236       // Make room.
11237       cmpw(Rlen, 512);
11238       br(Assembler::HI, argh);
11239       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11240       andr(sp, Ra, -2 * wordSize);
11241 
11242       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11243 
11244       {
11245         // Copy input args, reversing as we go.  We use Ra as a
11246         // temporary variable.
11247         reverse(Ra, Pa_base, Rlen, t0, t1);
11248         if (!_squaring)
11249           reverse(Ra, Pb_base, Rlen, t0, t1);
11250         reverse(Ra, Pn_base, Rlen, t0, t1);
11251       }
11252 
11253       // Push all call-saved registers and also Pm_base which we'll need
11254       // at the end.
11255       save_regs();
11256 
11257 #ifndef PRODUCT
11258       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11259       {
11260         ldr(Rn, Address(Pn_base, 0));
11261         mul(Rlo_mn, Rn, inv);
11262         subs(zr, Rlo_mn, -1);
11263         Label ok;
11264         br(EQ, ok); {
11265           stop("broken inverse in Montgomery multiply");
11266         } bind(ok);
11267       }
11268 #endif
11269 
11270       mov(Pm_base, Ra);
11271 
11272       mov(t0, zr);
11273       mov(t1, zr);
11274       mov(t2, zr);
11275 
11276       block_comment("for (int i = 0; i < len; i++) {");
11277       mov(Ri, zr); {
11278         Label loop, end;
11279         cmpw(Ri, Rlen);
11280         br(Assembler::GE, end);
11281 
11282         bind(loop);
11283         pre1(Ri);
11284 
11285         block_comment("  for (j = i; j; j--) {"); {
11286           movw(Rj, Ri);
11287           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11288         } block_comment("  } // j");
11289 
11290         post1();
11291         addw(Ri, Ri, 1);
11292         cmpw(Ri, Rlen);
11293         br(Assembler::LT, loop);
11294         bind(end);
11295         block_comment("} // i");
11296       }
11297 
11298       block_comment("for (int i = len; i < 2*len; i++) {");
11299       mov(Ri, Rlen); {
11300         Label loop, end;
11301         cmpw(Ri, Rlen, Assembler::LSL, 1);
11302         br(Assembler::GE, end);
11303 
11304         bind(loop);
11305         pre2(Ri, Rlen);
11306 
11307         block_comment("  for (j = len*2-i-1; j; j--) {"); {
11308           lslw(Rj, Rlen, 1);
11309           subw(Rj, Rj, Ri);
11310           subw(Rj, Rj, 1);
11311           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11312         } block_comment("  } // j");
11313 
11314         post2(Ri, Rlen);
11315         addw(Ri, Ri, 1);
11316         cmpw(Ri, Rlen, Assembler::LSL, 1);
11317         br(Assembler::LT, loop);
11318         bind(end);
11319       }
11320       block_comment("} // i");
11321 
11322       normalize(Rlen);
11323 
11324       mov(Ra, Pm_base);  // Save Pm_base in Ra
11325       restore_regs();  // Restore caller's Pm_base
11326 
11327       // Copy our result into caller's Pm_base
11328       reverse(Pm_base, Ra, Rlen, t0, t1);
11329 
11330       leave();
11331       bind(nothing);
11332       ret(lr);
11333 
11334       return entry;
11335     }
11336     // In C, approximately:
11337 
11338     // void
11339     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11340     //                     julong Pn_base[], julong Pm_base[],
11341     //                     julong inv, int len) {
11342     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11343     //   julong *Pa, *Pb, *Pn, *Pm;
11344     //   julong Ra, Rb, Rn, Rm;
11345 
11346     //   int i;
11347 
11348     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11349 
11350     //   for (i = 0; i < len; i++) {
11351     //     int j;
11352 
11353     //     Pa = Pa_base;
11354     //     Pb = Pb_base + i;
11355     //     Pm = Pm_base;
11356     //     Pn = Pn_base + i;
11357 
11358     //     Ra = *Pa;
11359     //     Rb = *Pb;
11360     //     Rm = *Pm;
11361     //     Rn = *Pn;
11362 
11363     //     int iters = i;
11364     //     for (j = 0; iters--; j++) {
11365     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11366     //       MACC(Ra, Rb, t0, t1, t2);
11367     //       Ra = *++Pa;
11368     //       Rb = *--Pb;
11369     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11370     //       MACC(Rm, Rn, t0, t1, t2);
11371     //       Rm = *++Pm;
11372     //       Rn = *--Pn;
11373     //     }
11374 
11375     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11376     //     MACC(Ra, Rb, t0, t1, t2);
11377     //     *Pm = Rm = t0 * inv;
11378     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11379     //     MACC(Rm, Rn, t0, t1, t2);
11380 
11381     //     assert(t0 == 0, "broken Montgomery multiply");
11382 
11383     //     t0 = t1; t1 = t2; t2 = 0;
11384     //   }
11385 
11386     //   for (i = len; i < 2*len; i++) {
11387     //     int j;
11388 
11389     //     Pa = Pa_base + i-len;
11390     //     Pb = Pb_base + len;
11391     //     Pm = Pm_base + i-len;
11392     //     Pn = Pn_base + len;
11393 
11394     //     Ra = *++Pa;
11395     //     Rb = *--Pb;
11396     //     Rm = *++Pm;
11397     //     Rn = *--Pn;
11398 
11399     //     int iters = len*2-i-1;
11400     //     for (j = i-len+1; iters--; j++) {
11401     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11402     //       MACC(Ra, Rb, t0, t1, t2);
11403     //       Ra = *++Pa;
11404     //       Rb = *--Pb;
11405     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11406     //       MACC(Rm, Rn, t0, t1, t2);
11407     //       Rm = *++Pm;
11408     //       Rn = *--Pn;
11409     //     }
11410 
11411     //     Pm_base[i-len] = t0;
11412     //     t0 = t1; t1 = t2; t2 = 0;
11413     //   }
11414 
11415     //   while (t0)
11416     //     t0 = sub(Pm_base, Pn_base, t0, len);
11417     // }
11418 
11419     /**
11420      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
11421      * multiplies than Montgomery multiplication so it should be up to
11422      * 25% faster.  However, its loop control is more complex and it
11423      * may actually run slower on some machines.
11424      *
11425      * Arguments:
11426      *
11427      * Inputs:
11428      *   c_rarg0   - int array elements a
11429      *   c_rarg1   - int array elements n (the modulus)
11430      *   c_rarg2   - int length
11431      *   c_rarg3   - int inv
11432      *   c_rarg4   - int array elements m (the result)
11433      *
11434      */
11435     address generate_square() {
11436       Label argh;
11437       bind(argh);
11438       stop("MontgomeryMultiply total_allocation must be <= 8192");
11439 
11440       align(CodeEntryAlignment);
11441       address entry = pc();
11442 
11443       enter();
11444 
11445       // Make room.
11446       cmpw(Rlen, 512);
11447       br(Assembler::HI, argh);
11448       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11449       andr(sp, Ra, -2 * wordSize);
11450 
11451       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11452 
11453       {
11454         // Copy input args, reversing as we go.  We use Ra as a
11455         // temporary variable.
11456         reverse(Ra, Pa_base, Rlen, t0, t1);
11457         reverse(Ra, Pn_base, Rlen, t0, t1);
11458       }
11459 
11460       // Push all call-saved registers and also Pm_base which we'll need
11461       // at the end.
11462       save_regs();
11463 
11464       mov(Pm_base, Ra);
11465 
11466       mov(t0, zr);
11467       mov(t1, zr);
11468       mov(t2, zr);
11469 
11470       block_comment("for (int i = 0; i < len; i++) {");
11471       mov(Ri, zr); {
11472         Label loop, end;
11473         bind(loop);
11474         cmp(Ri, Rlen);
11475         br(Assembler::GE, end);
11476 
11477         pre1(Ri);
11478 
11479         block_comment("for (j = (i+1)/2; j; j--) {"); {
11480           add(Rj, Ri, 1);
11481           lsr(Rj, Rj, 1);
11482           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11483         } block_comment("  } // j");
11484 
11485         last_squaring(Ri);
11486 
11487         block_comment("  for (j = i/2; j; j--) {"); {
11488           lsr(Rj, Ri, 1);
11489           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11490         } block_comment("  } // j");
11491 
11492         post1_squaring();
11493         add(Ri, Ri, 1);
11494         cmp(Ri, Rlen);
11495         br(Assembler::LT, loop);
11496 
11497         bind(end);
11498         block_comment("} // i");
11499       }
11500 
11501       block_comment("for (int i = len; i < 2*len; i++) {");
11502       mov(Ri, Rlen); {
11503         Label loop, end;
11504         bind(loop);
11505         cmp(Ri, Rlen, Assembler::LSL, 1);
11506         br(Assembler::GE, end);
11507 
11508         pre2(Ri, Rlen);
11509 
11510         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11511           lsl(Rj, Rlen, 1);
11512           sub(Rj, Rj, Ri);
11513           sub(Rj, Rj, 1);
11514           lsr(Rj, Rj, 1);
11515           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11516         } block_comment("  } // j");
11517 
11518         last_squaring(Ri);
11519 
11520         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11521           lsl(Rj, Rlen, 1);
11522           sub(Rj, Rj, Ri);
11523           lsr(Rj, Rj, 1);
11524           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11525         } block_comment("  } // j");
11526 
11527         post2(Ri, Rlen);
11528         add(Ri, Ri, 1);
11529         cmp(Ri, Rlen, Assembler::LSL, 1);
11530 
11531         br(Assembler::LT, loop);
11532         bind(end);
11533         block_comment("} // i");
11534       }
11535 
11536       normalize(Rlen);
11537 
11538       mov(Ra, Pm_base);  // Save Pm_base in Ra
11539       restore_regs();  // Restore caller's Pm_base
11540 
11541       // Copy our result into caller's Pm_base
11542       reverse(Pm_base, Ra, Rlen, t0, t1);
11543 
11544       leave();
11545       ret(lr);
11546 
11547       return entry;
11548     }
11549     // In C, approximately:
11550 
11551     // void
11552     // montgomery_square(julong Pa_base[], julong Pn_base[],
11553     //                   julong Pm_base[], julong inv, int len) {
11554     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11555     //   julong *Pa, *Pb, *Pn, *Pm;
11556     //   julong Ra, Rb, Rn, Rm;
11557 
11558     //   int i;
11559 
11560     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11561 
11562     //   for (i = 0; i < len; i++) {
11563     //     int j;
11564 
11565     //     Pa = Pa_base;
11566     //     Pb = Pa_base + i;
11567     //     Pm = Pm_base;
11568     //     Pn = Pn_base + i;
11569 
11570     //     Ra = *Pa;
11571     //     Rb = *Pb;
11572     //     Rm = *Pm;
11573     //     Rn = *Pn;
11574 
11575     //     int iters = (i+1)/2;
11576     //     for (j = 0; iters--; j++) {
11577     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11578     //       MACC2(Ra, Rb, t0, t1, t2);
11579     //       Ra = *++Pa;
11580     //       Rb = *--Pb;
11581     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11582     //       MACC(Rm, Rn, t0, t1, t2);
11583     //       Rm = *++Pm;
11584     //       Rn = *--Pn;
11585     //     }
11586     //     if ((i & 1) == 0) {
11587     //       assert(Ra == Pa_base[j], "must be");
11588     //       MACC(Ra, Ra, t0, t1, t2);
11589     //     }
11590     //     iters = i/2;
11591     //     assert(iters == i-j, "must be");
11592     //     for (; iters--; j++) {
11593     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11594     //       MACC(Rm, Rn, t0, t1, t2);
11595     //       Rm = *++Pm;
11596     //       Rn = *--Pn;
11597     //     }
11598 
11599     //     *Pm = Rm = t0 * inv;
11600     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11601     //     MACC(Rm, Rn, t0, t1, t2);
11602 
11603     //     assert(t0 == 0, "broken Montgomery multiply");
11604 
11605     //     t0 = t1; t1 = t2; t2 = 0;
11606     //   }
11607 
11608     //   for (i = len; i < 2*len; i++) {
11609     //     int start = i-len+1;
11610     //     int end = start + (len - start)/2;
11611     //     int j;
11612 
11613     //     Pa = Pa_base + i-len;
11614     //     Pb = Pa_base + len;
11615     //     Pm = Pm_base + i-len;
11616     //     Pn = Pn_base + len;
11617 
11618     //     Ra = *++Pa;
11619     //     Rb = *--Pb;
11620     //     Rm = *++Pm;
11621     //     Rn = *--Pn;
11622 
11623     //     int iters = (2*len-i-1)/2;
11624     //     assert(iters == end-start, "must be");
11625     //     for (j = start; iters--; j++) {
11626     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11627     //       MACC2(Ra, Rb, t0, t1, t2);
11628     //       Ra = *++Pa;
11629     //       Rb = *--Pb;
11630     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11631     //       MACC(Rm, Rn, t0, t1, t2);
11632     //       Rm = *++Pm;
11633     //       Rn = *--Pn;
11634     //     }
11635     //     if ((i & 1) == 0) {
11636     //       assert(Ra == Pa_base[j], "must be");
11637     //       MACC(Ra, Ra, t0, t1, t2);
11638     //     }
11639     //     iters =  (2*len-i)/2;
11640     //     assert(iters == len-j, "must be");
11641     //     for (; iters--; j++) {
11642     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11643     //       MACC(Rm, Rn, t0, t1, t2);
11644     //       Rm = *++Pm;
11645     //       Rn = *--Pn;
11646     //     }
11647     //     Pm_base[i-len] = t0;
11648     //     t0 = t1; t1 = t2; t2 = 0;
11649     //   }
11650 
11651     //   while (t0)
11652     //     t0 = sub(Pm_base, Pn_base, t0, len);
11653     // }
11654   };
11655 
11656   // Initialization
11657   void generate_preuniverse_stubs() {
    // Preuniverse stubs are not needed on aarch64
11659   }
11660 
11661   void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points.

    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure.  See also the
    // comment in stubRoutines.hpp.
11669 
11670     StubRoutines::_forward_exception_entry = generate_forward_exception();
11671 
11672     StubRoutines::_call_stub_entry =
11673       generate_call_stub(StubRoutines::_call_stub_return_address);
11674 
    // This entry is referenced by megamorphic calls.
11676     StubRoutines::_catch_exception_entry = generate_catch_exception();
11677 
11678     // Initialize table for copy memory (arraycopy) check.
11679     if (UnsafeMemoryAccess::_table == nullptr) {
11680       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11681     }
11682 
11683     if (UseCRC32Intrinsics) {
11684       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11685     }
11686 
11687     if (UseCRC32CIntrinsics) {
11688       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11689     }
11690 
11691     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11692       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11693     }
11694 
11695     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11696       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11697     }
11698 
11699     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11700         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11701       StubRoutines::_hf2f = generate_float16ToFloat();
11702       StubRoutines::_f2hf = generate_floatToFloat16();
11703     }
11704   }
11705 
11706   void generate_continuation_stubs() {
11707     // Continuation stubs:
11708     StubRoutines::_cont_thaw          = generate_cont_thaw();
11709     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11710     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11711     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11712   }
11713 
11714   void generate_final_stubs() {
11715     // support for verify_oop (must happen after universe_init)
11716     if (VerifyOops) {
11717       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11718     }
11719 
11720     // arraycopy stubs used by compilers
11721     generate_arraycopy_stubs();
11722 
11723     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11724 
11725     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11726 
11727     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11728     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11729 
11730 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11731 
11732     generate_atomic_entry_points();
11733 
11734 #endif // LINUX
11735 
11736 #ifdef COMPILER2
11737     if (UseSecondarySupersTable) {
11738       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11739       if (! InlineSecondarySupersTest) {
11740         generate_lookup_secondary_supers_table_stub();
11741       }
11742     }
11743 #endif
11744 
11745     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11746 
    StubRoutines::aarch64::set_completed(); // Indicate that the arraycopy and zero_blocks stubs have been generated
11748   }
11749 
11750   void generate_compiler_stubs() {
11751 #if COMPILER2_OR_JVMCI
11752 
11753     if (UseSVE == 0) {
11754       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11755     }
11756 
11757     // array equals stub for large arrays.
11758     if (!UseSimpleArrayEquals) {
11759       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11760     }
11761 
    // arrays_hashcode stubs for large arrays.
11763     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11764     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11765     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11766     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11767     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11768 
11769     // byte_array_inflate stub for large arrays.
11770     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11771 
11772     // countPositives stub for large arrays.
11773     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11774 
11775     generate_compare_long_strings();
11776 
11777     generate_string_indexof_stubs();
11778 
11779 #ifdef COMPILER2
11780     if (UseMultiplyToLenIntrinsic) {
11781       StubRoutines::_multiplyToLen = generate_multiplyToLen();
11782     }
11783 
11784     if (UseSquareToLenIntrinsic) {
11785       StubRoutines::_squareToLen = generate_squareToLen();
11786     }
11787 
11788     if (UseMulAddIntrinsic) {
11789       StubRoutines::_mulAdd = generate_mulAdd();
11790     }
11791 
11792     if (UseSIMDForBigIntegerShiftIntrinsics) {
11793       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11794       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
11795     }
11796 
11797     if (UseMontgomeryMultiplyIntrinsic) {
11798       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
11799       StubCodeMark mark(this, stub_id);
11800       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11801       StubRoutines::_montgomeryMultiply = g.generate_multiply();
11802     }
11803 
11804     if (UseMontgomerySquareIntrinsic) {
11805       StubId stub_id = StubId::stubgen_montgomerySquare_id;
11806       StubCodeMark mark(this, stub_id);
11807       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11808       // We use generate_multiply() rather than generate_square()
11809       // because it's faster for the sizes of modulus we care about.
11810       StubRoutines::_montgomerySquare = g.generate_multiply();
11811     }
11812 
11813 #endif // COMPILER2
11814 
11815     if (UseChaCha20Intrinsics) {
11816       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11817     }
11818 
11819     if (UseKyberIntrinsics) {
11820       StubRoutines::_kyberNtt = generate_kyberNtt();
11821       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11822       StubRoutines::_kyberNttMult = generate_kyberNttMult();
11823       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11824       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11825       StubRoutines::_kyber12To16 = generate_kyber12To16();
11826       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11827     }
11828 
11829     if (UseDilithiumIntrinsics) {
11830       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11831       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11832       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11833       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11834       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11835     }
11836 
11837     if (UseBASE64Intrinsics) {
11838         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11839         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11840     }
11841 
11842     // data cache line writeback
11843     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11844     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11845 
11846     if (UseAESIntrinsics) {
11847       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11848       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11849       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11850       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11851       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11852     }
11853     if (UseGHASHIntrinsics) {
11854       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11855       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11856     }
11857     if (UseAESIntrinsics && UseGHASHIntrinsics) {
11858       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11859     }
11860 
11861     if (UseMD5Intrinsics) {
11862       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
11863       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
11864     }
11865     if (UseSHA1Intrinsics) {
11866       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
11867       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
11868     }
11869     if (UseSHA256Intrinsics) {
11870       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
11871       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
11872     }
11873     if (UseSHA512Intrinsics) {
11874       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
11875       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
11876     }
11877     if (UseSHA3Intrinsics) {
11878 
11879       StubRoutines::_double_keccak         = generate_double_keccak();
11880       if (UseSIMDForSHA3Intrinsic) {
11881          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
11882          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
11883       } else {
11884          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
11885          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
11886       }
11887     }
11888 
11889     if (UsePoly1305Intrinsics) {
11890       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11891     }
11892 
11893     // generate Adler32 intrinsics code
11894     if (UseAdler32Intrinsics) {
11895       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11896     }
11897 
11898 #endif // COMPILER2_OR_JVMCI
11899   }
11900 
11901  public:
11902   StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
11903     switch(blob_id) {
11904     case BlobId::stubgen_preuniverse_id:
11905       generate_preuniverse_stubs();
11906       break;
11907     case BlobId::stubgen_initial_id:
11908       generate_initial_stubs();
11909       break;
    case BlobId::stubgen_continuation_id:
11911       generate_continuation_stubs();
11912       break;
11913     case BlobId::stubgen_compiler_id:
11914       generate_compiler_stubs();
11915       break;
11916     case BlobId::stubgen_final_id:
11917       generate_final_stubs();
11918       break;
11919     default:
11920       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
11921       break;
11922     };
11923   }
11924 }; // end class declaration
11925 
11926 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
11927   StubGenerator g(code, blob_id);
11928 }
11929 
11930 
11931 #if defined (LINUX)
11932 
11933 // Define pointers to atomic stubs and initialize them to point to the
11934 // code in atomic_aarch64.S.
11935 
11936 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
11937   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
11938     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
11939   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
11940     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
11941 
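// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands (roughly) to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// The stub generator may later repoint the _impl pointers at
// generated code.
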
11942 DEFAULT_ATOMIC_OP(fetch_add, 4, )
11943 DEFAULT_ATOMIC_OP(fetch_add, 8, )
11944 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
11945 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
11946 DEFAULT_ATOMIC_OP(xchg, 4, )
11947 DEFAULT_ATOMIC_OP(xchg, 8, )
11948 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
11949 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
11950 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
11951 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
11952 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
11953 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
11954 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
11955 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
11956 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
11957 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
11958 
11959 #undef DEFAULT_ATOMIC_OP
11960 
11961 #endif // LINUX