1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomicAccess.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
   87 #ifdef PRODUCT
   88 #define inc_counter_np(counter) ((void)0)
   89 #else
   90   void inc_counter_np_(uint& counter) {
   91     __ incrementw(ExternalAddress((address)&counter));
   92   }
   93 #define inc_counter_np(counter) \
   94   BLOCK_COMMENT("inc_counter " #counter); \
   95   inc_counter_np_(counter);
   96 #endif
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
  112   //
  113   // we save r30 (lr) as the return PC at the base of the frame and
  114   // link r29 (fp) below it as the frame pointer installing sp (r31)
  115   // into fp.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  124   // we don't need to save r8 which C uses as an indirect result location
  125   // return register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
  130   // we don't need to save r16-18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
  144   // -28 [ saved Floating-point Control Register ]
  145   // -26 [ saved v15            ] <--- sp_after_call
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
  173 
  174   // Call stub stack layout word offsets from fp
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
  199 
  200   address generate_call_stub(address& return_address) {
  201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  203            "adjust this code");
  204 
  205     StubId stub_id = StubId::stubgen_call_stub_id;
  206     StubCodeMark mark(this, stub_id);
  207     address start = __ pc();
  208 
  209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  210 
  211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  213     const Address result        (rfp, result_off         * wordSize);
  214     const Address result_type   (rfp, result_type_off    * wordSize);
  215     const Address method        (rfp, method_off         * wordSize);
  216     const Address entry_point   (rfp, entry_point_off    * wordSize);
  217     const Address parameter_size(rfp, parameter_size_off * wordSize);
  218 
  219     const Address thread        (rfp, thread_off         * wordSize);
  220 
  221     const Address d15_save      (rfp, d15_off * wordSize);
  222     const Address d13_save      (rfp, d13_off * wordSize);
  223     const Address d11_save      (rfp, d11_off * wordSize);
  224     const Address d9_save       (rfp, d9_off * wordSize);
  225 
  226     const Address r28_save      (rfp, r28_off * wordSize);
  227     const Address r26_save      (rfp, r26_off * wordSize);
  228     const Address r24_save      (rfp, r24_off * wordSize);
  229     const Address r22_save      (rfp, r22_off * wordSize);
  230     const Address r20_save      (rfp, r20_off * wordSize);
  231 
  232     // stub code
  233 
  234     address aarch64_entry = __ pc();
  235 
  236     // set up frame and move sp to end of save area
  237     __ enter();
  238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  239 
  240     // save register parameters and Java scratch/global registers
  241     // n.b. we save thread even though it gets installed in
  242     // rthread because we want to sanity check rthread later
  243     __ str(c_rarg7,  thread);
  244     __ strw(c_rarg6, parameter_size);
  245     __ stp(c_rarg4, c_rarg5,  entry_point);
  246     __ stp(c_rarg2, c_rarg3,  result_type);
  247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  248 
  249     __ stp(r20, r19,   r20_save);
  250     __ stp(r22, r21,   r22_save);
  251     __ stp(r24, r23,   r24_save);
  252     __ stp(r26, r25,   r26_save);
  253     __ stp(r28, r27,   r28_save);
  254 
  255     __ stpd(v9,  v8,   d9_save);
  256     __ stpd(v11, v10,  d11_save);
  257     __ stpd(v13, v12,  d13_save);
  258     __ stpd(v15, v14,  d15_save);
  259 
  260     __ get_fpcr(rscratch1);
  261     __ str(rscratch1, fpcr_save);
  262     // Set FPCR to the state we need. We do want Round to Nearest. We
  263     // don't want non-IEEE rounding modes or floating-point traps.
  264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  266     __ set_fpcr(rscratch1);
  267 
  268     // install Java thread in global register now we have saved
  269     // whatever value it held
  270     __ mov(rthread, c_rarg7);
  271     // And method
  272     __ mov(rmethod, c_rarg3);
  273 
  274     // set up the heapbase register
  275     __ reinit_heapbase();
  276 
  277 #ifdef ASSERT
  278     // make sure we have no pending exceptions
  279     {
  280       Label L;
  281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  282       __ cmp(rscratch1, (u1)NULL_WORD);
  283       __ br(Assembler::EQ, L);
  284       __ stop("StubRoutines::call_stub: entered with pending exception");
  285       __ BIND(L);
  286     }
  287 #endif
  288     // pass parameters if any
  289     __ mov(esp, sp);
  290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  291     __ andr(sp, rscratch1, -2 * wordSize);
  292 
  293     BLOCK_COMMENT("pass parameters if any");
  294     Label parameters_done;
  295     // parameter count is still in c_rarg6
  296     // and parameter pointer identifying param 1 is in c_rarg5
  297     __ cbzw(c_rarg6, parameters_done);
  298 
  299     address loop = __ pc();
  300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  301     __ subsw(c_rarg6, c_rarg6, 1);
  302     __ push(rscratch1);
  303     __ br(Assembler::GT, loop);
  304 
  305     __ BIND(parameters_done);
  306 
  307     // call Java entry -- passing methdoOop, and current sp
  308     //      rmethod: Method*
  309     //      r19_sender_sp: sender sp
  310     BLOCK_COMMENT("call Java function");
  311     __ mov(r19_sender_sp, sp);
  312     __ blr(c_rarg4);
  313 
  314     // we do this here because the notify will already have been done
  315     // if we get to the next instruction via an exception
  316     //
  317     // n.b. adding this instruction here affects the calculation of
  318     // whether or not a routine returns to the call stub (used when
  319     // doing stack walks) since the normal test is to check the return
  320     // pc against the address saved below. so we may need to allow for
  321     // this extra instruction in the check.
  322 
  323     // save current address for use by exception handling code
  324 
  325     return_address = __ pc();
  326 
  327     // store result depending on type (everything that is not
  328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  329     // n.b. this assumes Java returns an integral result in r0
  330     // and a floating result in j_farg0
  331     __ ldr(j_rarg2, result);
  332     Label is_long, is_float, is_double, exit;
  333     __ ldr(j_rarg1, result_type);
  334     __ cmp(j_rarg1, (u1)T_OBJECT);
  335     __ br(Assembler::EQ, is_long);
  336     __ cmp(j_rarg1, (u1)T_LONG);
  337     __ br(Assembler::EQ, is_long);
  338     __ cmp(j_rarg1, (u1)T_FLOAT);
  339     __ br(Assembler::EQ, is_float);
  340     __ cmp(j_rarg1, (u1)T_DOUBLE);
  341     __ br(Assembler::EQ, is_double);
  342 
  343     // handle T_INT case
  344     __ strw(r0, Address(j_rarg2));
  345 
  346     __ BIND(exit);
  347 
  348     // pop parameters
  349     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  350 
  351 #ifdef ASSERT
  352     // verify that threads correspond
  353     {
  354       Label L, S;
  355       __ ldr(rscratch1, thread);
  356       __ cmp(rthread, rscratch1);
  357       __ br(Assembler::NE, S);
  358       __ get_thread(rscratch1);
  359       __ cmp(rthread, rscratch1);
  360       __ br(Assembler::EQ, L);
  361       __ BIND(S);
  362       __ stop("StubRoutines::call_stub: threads must correspond");
  363       __ BIND(L);
  364     }
  365 #endif
  366 
  367     __ pop_cont_fastpath(rthread);
  368 
  369     // restore callee-save registers
  370     __ ldpd(v15, v14,  d15_save);
  371     __ ldpd(v13, v12,  d13_save);
  372     __ ldpd(v11, v10,  d11_save);
  373     __ ldpd(v9,  v8,   d9_save);
  374 
  375     __ ldp(r28, r27,   r28_save);
  376     __ ldp(r26, r25,   r26_save);
  377     __ ldp(r24, r23,   r24_save);
  378     __ ldp(r22, r21,   r22_save);
  379     __ ldp(r20, r19,   r20_save);
  380 
  381     // restore fpcr
  382     __ ldr(rscratch1,  fpcr_save);
  383     __ set_fpcr(rscratch1);
  384 
  385     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  386     __ ldrw(c_rarg2, result_type);
  387     __ ldr(c_rarg3,  method);
  388     __ ldp(c_rarg4, c_rarg5,  entry_point);
  389     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  390 
  391     // leave frame and return to caller
  392     __ leave();
  393     __ ret(lr);
  394 
  395     // handle return types different from T_INT
  396 
  397     __ BIND(is_long);
  398     __ str(r0, Address(j_rarg2, 0));
  399     __ br(Assembler::AL, exit);
  400 
  401     __ BIND(is_float);
  402     __ strs(j_farg0, Address(j_rarg2, 0));
  403     __ br(Assembler::AL, exit);
  404 
  405     __ BIND(is_double);
  406     __ strd(j_farg0, Address(j_rarg2, 0));
  407     __ br(Assembler::AL, exit);
  408 
  409     return start;
  410   }
  411 
  412   // Return point for a Java call if there's an exception thrown in
  413   // Java code.  The exception is caught and transformed into a
  414   // pending exception stored in JavaThread that can be tested from
  415   // within the VM.
  416   //
  417   // Note: Usually the parameters are removed by the callee. In case
  418   // of an exception crossing an activation frame boundary, that is
  419   // not the case if the callee is compiled code => need to setup the
  420   // rsp.
  421   //
  422   // r0: exception oop
  423 
  424   address generate_catch_exception() {
  425     StubId stub_id = StubId::stubgen_catch_exception_id;
  426     StubCodeMark mark(this, stub_id);
  427     address start = __ pc();
  428 
  429     // same as in generate_call_stub():
  430     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  431     const Address thread        (rfp, thread_off         * wordSize);
  432 
  433 #ifdef ASSERT
  434     // verify that threads correspond
  435     {
  436       Label L, S;
  437       __ ldr(rscratch1, thread);
  438       __ cmp(rthread, rscratch1);
  439       __ br(Assembler::NE, S);
  440       __ get_thread(rscratch1);
  441       __ cmp(rthread, rscratch1);
  442       __ br(Assembler::EQ, L);
  443       __ bind(S);
  444       __ stop("StubRoutines::catch_exception: threads must correspond");
  445       __ bind(L);
  446     }
  447 #endif
  448 
  449     // set pending exception
  450     __ verify_oop(r0);
  451 
  452     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  453     __ mov(rscratch1, (address)__FILE__);
  454     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  455     __ movw(rscratch1, (int)__LINE__);
  456     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  457 
  458     // complete return to VM
  459     assert(StubRoutines::_call_stub_return_address != nullptr,
  460            "_call_stub_return_address must have been generated before");
  461     __ b(StubRoutines::_call_stub_return_address);
  462 
  463     return start;
  464   }
  465 
  466   // Continuation point for runtime calls returning with a pending
  467   // exception.  The pending exception check happened in the runtime
  468   // or native call stub.  The pending exception in Thread is
  469   // converted into a Java-level exception.
  470   //
  471   // Contract with Java-level exception handlers:
  472   // r0: exception
  473   // r3: throwing pc
  474   //
  475   // NOTE: At entry of this stub, exception-pc must be in LR !!
  476 
  477   // NOTE: this is always used as a jump target within generated code
  478   // so it just needs to be generated code with no x86 prolog
  479 
  480   address generate_forward_exception() {
  481     StubId stub_id = StubId::stubgen_forward_exception_id;
  482     StubCodeMark mark(this, stub_id);
  483     address start = __ pc();
  484 
  485     // Upon entry, LR points to the return address returning into
  486     // Java (interpreted or compiled) code; i.e., the return address
  487     // becomes the throwing pc.
  488     //
  489     // Arguments pushed before the runtime call are still on the stack
  490     // but the exception handler will reset the stack pointer ->
  491     // ignore them.  A potential result in registers can be ignored as
  492     // well.
  493 
  494 #ifdef ASSERT
  495     // make sure this code is only executed if there is a pending exception
  496     {
  497       Label L;
  498       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  499       __ cbnz(rscratch1, L);
  500       __ stop("StubRoutines::forward exception: no pending exception (1)");
  501       __ bind(L);
  502     }
  503 #endif
  504 
  505     // compute exception handler into r19
  506 
  507     // call the VM to find the handler address associated with the
  508     // caller address. pass thread in r0 and caller pc (ret address)
  509     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  510     // the stack.
  511     __ mov(c_rarg1, lr);
  512     // lr will be trashed by the VM call so we move it to R19
  513     // (callee-saved) because we also need to pass it to the handler
  514     // returned by this call.
  515     __ mov(r19, lr);
  516     BLOCK_COMMENT("call exception_handler_for_return_address");
  517     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  518                          SharedRuntime::exception_handler_for_return_address),
  519                     rthread, c_rarg1);
  520     // Reinitialize the ptrue predicate register, in case the external runtime
  521     // call clobbers ptrue reg, as we may return to SVE compiled code.
  522     __ reinitialize_ptrue();
  523 
  524     // we should not really care that lr is no longer the callee
  525     // address. we saved the value the handler needs in r19 so we can
  526     // just copy it to r3. however, the C2 handler will push its own
  527     // frame and then calls into the VM and the VM code asserts that
  528     // the PC for the frame above the handler belongs to a compiled
  529     // Java method. So, we restore lr here to satisfy that assert.
  530     __ mov(lr, r19);
  531     // setup r0 & r3 & clear pending exception
  532     __ mov(r3, r19);
  533     __ mov(r19, r0);
  534     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  535     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  536 
  537 #ifdef ASSERT
  538     // make sure exception is set
  539     {
  540       Label L;
  541       __ cbnz(r0, L);
  542       __ stop("StubRoutines::forward exception: no pending exception (2)");
  543       __ bind(L);
  544     }
  545 #endif
  546 
  547     // continue at exception handler
  548     // r0: exception
  549     // r3: throwing pc
  550     // r19: exception handler
  551     __ verify_oop(r0);
  552     __ br(r19);
  553 
  554     return start;
  555   }
  556 
  557   // Non-destructive plausibility checks for oops
  558   //
  559   // Arguments:
  560   //    r0: oop to verify
  561   //    rscratch1: error message
  562   //
  563   // Stack after saving c_rarg3:
  564   //    [tos + 0]: saved c_rarg3
  565   //    [tos + 1]: saved c_rarg2
  566   //    [tos + 2]: saved lr
  567   //    [tos + 3]: saved rscratch2
  568   //    [tos + 4]: saved r0
  569   //    [tos + 5]: saved rscratch1
  570   address generate_verify_oop() {
  571     StubId stub_id = StubId::stubgen_verify_oop_id;
  572     StubCodeMark mark(this, stub_id);
  573     address start = __ pc();
  574 
  575     Label exit, error;
  576 
  577     // save c_rarg2 and c_rarg3
  578     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  579 
  580     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  581     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  582     __ ldr(c_rarg3, Address(c_rarg2));
  583     __ add(c_rarg3, c_rarg3, 1);
  584     __ str(c_rarg3, Address(c_rarg2));
  585 
  586     // object is in r0
  587     // make sure object is 'reasonable'
  588     __ cbz(r0, exit); // if obj is null it is OK
  589 
  590     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  591     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  592 
  593     // return if everything seems ok
  594     __ bind(exit);
  595 
  596     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  597     __ ret(lr);
  598 
  599     // handle errors
  600     __ bind(error);
  601     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  602 
  603     __ push(RegSet::range(r0, r29), sp);
  604     // debug(char* msg, int64_t pc, int64_t regs[])
  605     __ mov(c_rarg0, rscratch1);      // pass address of error message
  606     __ mov(c_rarg1, lr);             // pass return address
  607     __ mov(c_rarg2, sp);             // pass address of regs on stack
  608 #ifndef PRODUCT
  609     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  610 #endif
  611     BLOCK_COMMENT("call MacroAssembler::debug");
  612     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  613     __ blr(rscratch1);
  614     __ hlt(0);
  615 
  616     return start;
  617   }
  618 
  619   // Generate indices for iota vector.
  620   address generate_iota_indices(StubId stub_id) {
  621     __ align(CodeEntryAlignment);
  622     StubCodeMark mark(this, stub_id);
  623     address start = __ pc();
  624     // B
  625     __ emit_data64(0x0706050403020100, relocInfo::none);
  626     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  627     // H
  628     __ emit_data64(0x0003000200010000, relocInfo::none);
  629     __ emit_data64(0x0007000600050004, relocInfo::none);
  630     // S
  631     __ emit_data64(0x0000000100000000, relocInfo::none);
  632     __ emit_data64(0x0000000300000002, relocInfo::none);
  633     // D
  634     __ emit_data64(0x0000000000000000, relocInfo::none);
  635     __ emit_data64(0x0000000000000001, relocInfo::none);
  636     // S - FP
  637     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  638     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  639     // D - FP
  640     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  641     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  642     return start;
  643   }
  644 
  645   // The inner part of zero_words().  This is the bulk operation,
  646   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  647   // caller is responsible for zeroing the last few words.
  648   //
  649   // Inputs:
  650   // r10: the HeapWord-aligned base address of an array to zero.
  651   // r11: the count in HeapWords, r11 > 0.
  652   //
  653   // Returns r10 and r11, adjusted for the caller to clear.
  654   // r10: the base address of the tail of words left to clear.
  655   // r11: the number of words in the tail.
  656   //      r11 < MacroAssembler::zero_words_block_size.
  657 
  658   address generate_zero_blocks() {
  659     Label done;
  660     Label base_aligned;
  661 
  662     Register base = r10, cnt = r11;
  663 
  664     __ align(CodeEntryAlignment);
  665     StubId stub_id = StubId::stubgen_zero_blocks_id;
  666     StubCodeMark mark(this, stub_id);
  667     address start = __ pc();
  668 
  669     if (UseBlockZeroing) {
  670       int zva_length = VM_Version::zva_length();
  671 
  672       // Ensure ZVA length can be divided by 16. This is required by
  673       // the subsequent operations.
  674       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  675 
  676       __ tbz(base, 3, base_aligned);
  677       __ str(zr, Address(__ post(base, 8)));
  678       __ sub(cnt, cnt, 1);
  679       __ bind(base_aligned);
  680 
  681       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  682       // alignment.
  683       Label small;
  684       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
  685       __ subs(rscratch1, cnt, low_limit >> 3);
  686       __ br(Assembler::LT, small);
  687       __ zero_dcache_blocks(base, cnt);
  688       __ bind(small);
  689     }
  690 
  691     {
  692       // Number of stp instructions we'll unroll
  693       const int unroll =
  694         MacroAssembler::zero_words_block_size / 2;
  695       // Clear the remaining blocks.
  696       Label loop;
  697       __ subs(cnt, cnt, unroll * 2);
  698       __ br(Assembler::LT, done);
  699       __ bind(loop);
  700       for (int i = 0; i < unroll; i++)
  701         __ stp(zr, zr, __ post(base, 16));
  702       __ subs(cnt, cnt, unroll * 2);
  703       __ br(Assembler::GE, loop);
  704       __ bind(done);
  705       __ add(cnt, cnt, unroll * 2);
  706     }
  707 
  708     __ ret(lr);
  709 
  710     return start;
  711   }
  712 
  713 
  714   typedef enum {
  715     copy_forwards = 1,
  716     copy_backwards = -1
  717   } copy_direction;
  718 
  719   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  720   // for arraycopy stubs.
  721   class ArrayCopyBarrierSetHelper : StackObj {
  722     BarrierSetAssembler* _bs_asm;
  723     MacroAssembler* _masm;
  724     DecoratorSet _decorators;
  725     BasicType _type;
  726     Register _gct1;
  727     Register _gct2;
  728     Register _gct3;
  729     FloatRegister _gcvt1;
  730     FloatRegister _gcvt2;
  731     FloatRegister _gcvt3;
  732 
  733   public:
  734     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  735                               DecoratorSet decorators,
  736                               BasicType type,
  737                               Register gct1,
  738                               Register gct2,
  739                               Register gct3,
  740                               FloatRegister gcvt1,
  741                               FloatRegister gcvt2,
  742                               FloatRegister gcvt3)
  743       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  744         _masm(masm),
  745         _decorators(decorators),
  746         _type(type),
  747         _gct1(gct1),
  748         _gct2(gct2),
  749         _gct3(gct3),
  750         _gcvt1(gcvt1),
  751         _gcvt2(gcvt2),
  752         _gcvt3(gcvt3) {
  753     }
  754 
  755     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  756       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  757                             dst1, dst2, src,
  758                             _gct1, _gct2, _gcvt1);
  759     }
  760 
  761     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  762       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  763                              dst, src1, src2,
  764                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  765     }
  766 
  767     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  768       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  769                             dst1, dst2, src,
  770                             _gct1);
  771     }
  772 
  773     void copy_store_at_16(Address dst, Register src1, Register src2) {
  774       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  775                              dst, src1, src2,
  776                              _gct1, _gct2, _gct3);
  777     }
  778 
  779     void copy_load_at_8(Register dst, Address src) {
  780       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  781                             dst, noreg, src,
  782                             _gct1);
  783     }
  784 
  785     void copy_store_at_8(Address dst, Register src) {
  786       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  787                              dst, src, noreg,
  788                              _gct1, _gct2, _gct3);
  789     }
  790   };
  791 
  792   // Bulk copy of blocks of 8 words.
  793   //
  794   // count is a count of words.
  795   //
  796   // Precondition: count >= 8
  797   //
  798   // Postconditions:
  799   //
  800   // The least significant bit of count contains the remaining count
  801   // of words to copy.  The rest of count is trash.
  802   //
  803   // s and d are adjusted to point to the remaining words to copy
  804   //
  805   address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
  806     BasicType type;
  807     copy_direction direction;
  808 
  809     switch (stub_id) {
  810     case StubId::stubgen_copy_byte_f_id:
  811       direction = copy_forwards;
  812       type = T_BYTE;
  813       break;
  814     case StubId::stubgen_copy_byte_b_id:
  815       direction = copy_backwards;
  816       type = T_BYTE;
  817       break;
  818     case StubId::stubgen_copy_oop_f_id:
  819       direction = copy_forwards;
  820       type = T_OBJECT;
  821       break;
  822     case StubId::stubgen_copy_oop_b_id:
  823       direction = copy_backwards;
  824       type = T_OBJECT;
  825       break;
  826     case StubId::stubgen_copy_oop_uninit_f_id:
  827       direction = copy_forwards;
  828       type = T_OBJECT;
  829       break;
  830     case StubId::stubgen_copy_oop_uninit_b_id:
  831       direction = copy_backwards;
  832       type = T_OBJECT;
  833       break;
  834     default:
  835       ShouldNotReachHere();
  836     }
  837 
  838     int unit = wordSize * direction;
  839     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
  840 
  841     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  842       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  843     const Register stride = r14;
  844     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  845     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  846     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  847 
  848     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  849     assert_different_registers(s, d, count, rscratch1, rscratch2);
  850 
  851     Label again, drain;
  852 
  853     __ align(CodeEntryAlignment);
  854 
  855     StubCodeMark mark(this, stub_id);
  856 
  857     address start = __ pc();
  858 
  859     Label unaligned_copy_long;
  860     if (AvoidUnalignedAccesses) {
  861       __ tbnz(d, 3, unaligned_copy_long);
  862     }
  863 
  864     if (direction == copy_forwards) {
  865       __ sub(s, s, bias);
  866       __ sub(d, d, bias);
  867     }
  868 
  869 #ifdef ASSERT
  870     // Make sure we are never given < 8 words
  871     {
  872       Label L;
  873       __ cmp(count, (u1)8);
  874       __ br(Assembler::GE, L);
  875       __ stop("genrate_copy_longs called with < 8 words");
  876       __ bind(L);
  877     }
  878 #endif
  879 
  880     // Fill 8 registers
  881     if (UseSIMDForMemoryOps) {
  882       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  883       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  884     } else {
  885       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  886       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  887       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  888       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  889     }
  890 
  891     __ subs(count, count, 16);
  892     __ br(Assembler::LO, drain);
  893 
  894     int prefetch = PrefetchCopyIntervalInBytes;
  895     bool use_stride = false;
  896     if (direction == copy_backwards) {
  897       use_stride = prefetch > 256;
  898       prefetch = -prefetch;
  899       if (use_stride) __ mov(stride, prefetch);
  900     }
  901 
  902     __ bind(again);
  903 
  904     if (PrefetchCopyIntervalInBytes > 0)
  905       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  906 
  907     if (UseSIMDForMemoryOps) {
  908       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  909       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  910       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  911       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  912     } else {
  913       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  914       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  915       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  916       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  917       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  918       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  919       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  920       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  921     }
  922 
  923     __ subs(count, count, 8);
  924     __ br(Assembler::HS, again);
  925 
  926     // Drain
  927     __ bind(drain);
  928     if (UseSIMDForMemoryOps) {
  929       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  930       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  931     } else {
  932       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  933       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  934       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936     }
  937 
  938     {
  939       Label L1, L2;
  940       __ tbz(count, exact_log2(4), L1);
  941       if (UseSIMDForMemoryOps) {
  942         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  943         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  944       } else {
  945         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  946         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  947         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  948         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  949       }
  950       __ bind(L1);
  951 
  952       if (direction == copy_forwards) {
  953         __ add(s, s, bias);
  954         __ add(d, d, bias);
  955       }
  956 
  957       __ tbz(count, 1, L2);
  958       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  959       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  960       __ bind(L2);
  961     }
  962 
  963     __ ret(lr);
  964 
  965     if (AvoidUnalignedAccesses) {
  966       Label drain, again;
  967       // Register order for storing. Order is different for backward copy.
  968 
  969       __ bind(unaligned_copy_long);
  970 
  971       // source address is even aligned, target odd aligned
  972       //
  973       // when forward copying word pairs we read long pairs at offsets
  974       // {0, 2, 4, 6} (in long words). when backwards copying we read
  975       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  976       // address by -2 in the forwards case so we can compute the
  977       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  978       // or -1.
  979       //
  980       // when forward copying we need to store 1 word, 3 pairs and
  981       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
  982       // zero offset We adjust the destination by -1 which means we
  983       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
  984       //
  985       // When backwards copyng we need to store 1 word, 3 pairs and
  986       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
  987       // offsets {1, 3, 5, 7, 8} * unit.
  988 
  989       if (direction == copy_forwards) {
  990         __ sub(s, s, 16);
  991         __ sub(d, d, 8);
  992       }
  993 
  994       // Fill 8 registers
  995       //
  996       // for forwards copy s was offset by -16 from the original input
  997       // value of s so the register contents are at these offsets
  998       // relative to the 64 bit block addressed by that original input
  999       // and so on for each successive 64 byte block when s is updated
 1000       //
 1001       // t0 at offset 0,  t1 at offset 8
 1002       // t2 at offset 16, t3 at offset 24
 1003       // t4 at offset 32, t5 at offset 40
 1004       // t6 at offset 48, t7 at offset 56
 1005 
 1006       // for backwards copy s was not offset so the register contents
 1007       // are at these offsets into the preceding 64 byte block
 1008       // relative to that original input and so on for each successive
 1009       // preceding 64 byte block when s is updated. this explains the
 1010       // slightly counter-intuitive looking pattern of register usage
 1011       // in the stp instructions for backwards copy.
 1012       //
 1013       // t0 at offset -16, t1 at offset -8
 1014       // t2 at offset -32, t3 at offset -24
 1015       // t4 at offset -48, t5 at offset -40
 1016       // t6 at offset -64, t7 at offset -56
 1017 
 1018       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1019       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1020       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1021       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1022 
 1023       __ subs(count, count, 16);
 1024       __ br(Assembler::LO, drain);
 1025 
 1026       int prefetch = PrefetchCopyIntervalInBytes;
 1027       bool use_stride = false;
 1028       if (direction == copy_backwards) {
 1029         use_stride = prefetch > 256;
 1030         prefetch = -prefetch;
 1031         if (use_stride) __ mov(stride, prefetch);
 1032       }
 1033 
 1034       __ bind(again);
 1035 
 1036       if (PrefetchCopyIntervalInBytes > 0)
 1037         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1038 
 1039       if (direction == copy_forwards) {
 1040         // allowing for the offset of -8 the store instructions place
 1041         // registers into the target 64 bit block at the following
 1042         // offsets
 1043         //
 1044         // t0 at offset 0
 1045         // t1 at offset 8,  t2 at offset 16
 1046         // t3 at offset 24, t4 at offset 32
 1047         // t5 at offset 40, t6 at offset 48
 1048         // t7 at offset 56
 1049 
 1050         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1051         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1052         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1053         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1054         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1055         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1056         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1057         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1058         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1059       } else {
 1060         // d was not offset when we started so the registers are
 1061         // written into the 64 bit block preceding d with the following
 1062         // offsets
 1063         //
 1064         // t1 at offset -8
 1065         // t3 at offset -24, t0 at offset -16
 1066         // t5 at offset -48, t2 at offset -32
 1067         // t7 at offset -56, t4 at offset -48
 1068         //                   t6 at offset -64
 1069         //
 1070         // note that this matches the offsets previously noted for the
 1071         // loads
 1072 
 1073         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1074         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1075         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1076         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1077         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1078         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1079         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1080         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1081         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1082       }
 1083 
 1084       __ subs(count, count, 8);
 1085       __ br(Assembler::HS, again);
 1086 
 1087       // Drain
 1088       //
 1089       // this uses the same pattern of offsets and register arguments
 1090       // as above
 1091       __ bind(drain);
 1092       if (direction == copy_forwards) {
 1093         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1094         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1095         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1096         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1097         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1098       } else {
 1099         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1100         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1101         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1102         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1103         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1104       }
 1105       // now we need to copy any remaining part block which may
 1106       // include a 4 word block subblock and/or a 2 word subblock.
 1107       // bits 2 and 1 in the count are the tell-tale for whether we
 1108       // have each such subblock
 1109       {
 1110         Label L1, L2;
 1111         __ tbz(count, exact_log2(4), L1);
 1112         // this is the same as above but copying only 4 longs hence
 1113         // with only one intervening stp between the str instructions
 1114         // but note that the offsets and registers still follow the
 1115         // same pattern
 1116         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1117         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1118         if (direction == copy_forwards) {
 1119           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1120           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1121           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1122         } else {
 1123           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1124           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1125           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1126         }
 1127         __ bind(L1);
 1128 
 1129         __ tbz(count, 1, L2);
 1130         // this is the same as above but copying only 2 longs hence
 1131         // there is no intervening stp between the str instructions
 1132         // but note that the offset and register patterns are still
 1133         // the same
 1134         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1135         if (direction == copy_forwards) {
 1136           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1137           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1141         }
 1142         __ bind(L2);
 1143 
 1144         // for forwards copy we need to re-adjust the offsets we
 1145         // applied so that s and d are follow the last words written
 1146 
 1147         if (direction == copy_forwards) {
 1148           __ add(s, s, 16);
 1149           __ add(d, d, 8);
 1150         }
 1151 
 1152       }
 1153 
 1154       __ ret(lr);
 1155     }
 1156 
 1157     return start;
 1158   }
 1159 
 1160   // Small copy: less than 16 bytes.
 1161   //
 1162   // NB: Ignores all of the bits of count which represent more than 15
 1163   // bytes, so a caller doesn't have to mask them.
 1164 
 1165   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1166     bool is_backwards = step < 0;
 1167     size_t granularity = g_uabs(step);
 1168     int direction = is_backwards ? -1 : 1;
 1169 
 1170     Label Lword, Lint, Lshort, Lbyte;
 1171 
 1172     assert(granularity
 1173            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1174 
 1175     const Register t0 = r3;
 1176     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1177     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1178 
 1179     // ??? I don't know if this bit-test-and-branch is the right thing
 1180     // to do.  It does a lot of jumping, resulting in several
 1181     // mispredicted branches.  It might make more sense to do this
 1182     // with something like Duff's device with a single computed branch.
 1183 
 1184     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1185     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1186     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1187     __ bind(Lword);
 1188 
 1189     if (granularity <= sizeof (jint)) {
 1190       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1191       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1192       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1193       __ bind(Lint);
 1194     }
 1195 
 1196     if (granularity <= sizeof (jshort)) {
 1197       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1198       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1199       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1200       __ bind(Lshort);
 1201     }
 1202 
 1203     if (granularity <= sizeof (jbyte)) {
 1204       __ tbz(count, 0, Lbyte);
 1205       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1206       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1207       __ bind(Lbyte);
 1208     }
 1209   }
 1210 
 1211   // All-singing all-dancing memory copy.
 1212   //
 1213   // Copy count units of memory from s to d.  The size of a unit is
 1214   // step, which can be positive or negative depending on the direction
 1215   // of copy.  If is_aligned is false, we align the source address.
 1216   //
 1217 
 1218   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1219                    Register s, Register d, Register count, int step) {
 1220     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1221     bool is_backwards = step < 0;
 1222     unsigned int granularity = g_uabs(step);
 1223     const Register t0 = r3, t1 = r4;
 1224 
 1225     // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
 1226     // load all the data before writing anything
 1227     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1228     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1229     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1230     const Register send = r17, dend = r16;
 1231     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1232     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1233     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1234 
 1235     if (PrefetchCopyIntervalInBytes > 0)
 1236       __ prfm(Address(s, 0), PLDL1KEEP);
 1237     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1238     __ br(Assembler::HI, copy_big);
 1239 
 1240     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1241     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1242 
 1243     __ cmp(count, u1(16/granularity));
 1244     __ br(Assembler::LS, copy16);
 1245 
 1246     __ cmp(count, u1(64/granularity));
 1247     __ br(Assembler::HI, copy80);
 1248 
 1249     __ cmp(count, u1(32/granularity));
 1250     __ br(Assembler::LS, copy32);
 1251 
 1252     // 33..64 bytes
 1253     if (UseSIMDForMemoryOps) {
 1254       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1255       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1256       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1257       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1258     } else {
 1259       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1260       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1261       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1262       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1263 
 1264       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1265       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1266       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1267       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1268     }
 1269     __ b(finish);
 1270 
 1271     // 17..32 bytes
 1272     __ bind(copy32);
 1273     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1274     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1275 
 1276     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1277     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1278     __ b(finish);
 1279 
 1280     // 65..80/96 bytes
 1281     // (96 bytes if SIMD because we do 32 byes per instruction)
 1282     __ bind(copy80);
 1283     if (UseSIMDForMemoryOps) {
 1284       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1285       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1286       // Unaligned pointers can be an issue for copying.
 1287       // The issue has more chances to happen when granularity of data is
 1288       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
 1289       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1290       // The most performance drop has been seen for the range 65-80 bytes.
 1291       // For such cases using the pair of ldp/stp instead of the third pair of
 1292       // ldpq/stpq fixes the performance issue.
 1293       if (granularity < sizeof (jint)) {
 1294         Label copy96;
 1295         __ cmp(count, u1(80/granularity));
 1296         __ br(Assembler::HI, copy96);
 1297         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1298 
 1299         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1300         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1301 
 1302         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1303         __ b(finish);
 1304 
 1305         __ bind(copy96);
 1306       }
 1307       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1308 
 1309       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1310       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1311 
 1312       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1313     } else {
 1314       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1315       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1316       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1317       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1318       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1319 
 1320       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1321       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1322       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1323       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1324       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1325     }
 1326     __ b(finish);
 1327 
 1328     // 0..16 bytes
 1329     __ bind(copy16);
 1330     __ cmp(count, u1(8/granularity));
 1331     __ br(Assembler::LO, copy8);
 1332 
 1333     // 8..16 bytes
 1334     bs.copy_load_at_8(t0, Address(s, 0));
 1335     bs.copy_load_at_8(t1, Address(send, -8));
 1336     bs.copy_store_at_8(Address(d, 0), t0);
 1337     bs.copy_store_at_8(Address(dend, -8), t1);
 1338     __ b(finish);
 1339 
 1340     if (granularity < 8) {
 1341       // 4..7 bytes
 1342       __ bind(copy8);
 1343       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1344       __ ldrw(t0, Address(s, 0));
 1345       __ ldrw(t1, Address(send, -4));
 1346       __ strw(t0, Address(d, 0));
 1347       __ strw(t1, Address(dend, -4));
 1348       __ b(finish);
 1349       if (granularity < 4) {
 1350         // 0..3 bytes
 1351         __ bind(copy4);
 1352         __ cbz(count, finish); // get rid of 0 case
 1353         if (granularity == 2) {
 1354           __ ldrh(t0, Address(s, 0));
 1355           __ strh(t0, Address(d, 0));
 1356         } else { // granularity == 1
 1357           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1358           // the first and last byte.
 1359           // Handle the 3 byte case by loading and storing base + count/2
 1360           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1361           // This does means in the 1 byte case we load/store the same
 1362           // byte 3 times.
 1363           __ lsr(count, count, 1);
 1364           __ ldrb(t0, Address(s, 0));
 1365           __ ldrb(t1, Address(send, -1));
 1366           __ ldrb(t2, Address(s, count));
 1367           __ strb(t0, Address(d, 0));
 1368           __ strb(t1, Address(dend, -1));
 1369           __ strb(t2, Address(d, count));
 1370         }
 1371         __ b(finish);
 1372       }
 1373     }
 1374 
 1375     __ bind(copy_big);
 1376     if (is_backwards) {
 1377       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1378       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1379     }
 1380 
 1381     // Now we've got the small case out of the way we can align the
 1382     // source address on a 2-word boundary.
 1383 
 1384     // Here we will materialize a count in r15, which is used by copy_memory_small
 1385     // and the various generate_copy_longs stubs used for the 2-word-aligned bulk copy.
 1386     // Up until here, we have used t9, which aliases r15, but from here on, that register
 1387     // cannot be used as a temp register, as it contains the count.
 1388 
 1389     Label aligned;
 1390 
 1391     if (is_aligned) {
 1392       // We may have to adjust by 1 word to get s 2-word-aligned.
 1393       __ tbz(s, exact_log2(wordSize), aligned);
 1394       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1395       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1396       __ sub(count, count, wordSize/granularity);
 1397     } else {
 1398       if (is_backwards) {
 1399         __ andr(r15, s, 2 * wordSize - 1);
 1400       } else {
 1401         __ neg(r15, s);
 1402         __ andr(r15, r15, 2 * wordSize - 1);
 1403       }
 1404       // r15 is the byte adjustment needed to align s.
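      // e.g. (illustrative) a forward jint copy with s ending in ...4:
      // r15 == (-s) & 15 == 12 bytes, i.e. 3 elements after the shift below,
      // which copy_memory_small transfers before the bulk copy, leaving s
      // 2-word (16-byte) aligned.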
 1405       __ cbz(r15, aligned);
 1406       int shift = exact_log2(granularity);
 1407       if (shift > 0) {
 1408         __ lsr(r15, r15, shift);
 1409       }
 1410       __ sub(count, count, r15);
 1411 
 1412 #if 0
 1413       // ?? This code is only correct for a disjoint copy.  It may or
 1414       // may not make sense to use it in that case.
 1415 
 1416       // Copy the first pair; s and d may not be aligned.
 1417       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1418       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1419 
 1420       // Align s and d, adjust count
 1421       if (is_backwards) {
 1422         __ sub(s, s, r15);
 1423         __ sub(d, d, r15);
 1424       } else {
 1425         __ add(s, s, r15);
 1426         __ add(d, d, r15);
 1427       }
 1428 #else
 1429       copy_memory_small(decorators, type, s, d, r15, step);
 1430 #endif
 1431     }
 1432 
 1433     __ bind(aligned);
 1434 
 1435     // s is now 2-word-aligned.
 1436 
 1437     // We have a count of units and some trailing bytes. Adjust the
 1438     // count and do a bulk copy of words. If the shift is zero
 1439     // perform a move instead to benefit from zero latency moves.
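    // e.g. (illustrative) for a jshort copy granularity == 2, so
    // shift == exact_log2(8/2) == 2 and r15 == count >> 2 is the number of
    // 8-byte words handed to the bulk copy stub; for jlong or uncompressed
    // oop copies shift == 0 and r15 is simply count.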
 1440     int shift = exact_log2(wordSize/granularity);
 1441     if (shift > 0) {
 1442       __ lsr(r15, count, shift);
 1443     } else {
 1444       __ mov(r15, count);
 1445     }
 1446     if (direction == copy_forwards) {
 1447       if (type != T_OBJECT) {
 1448         __ bl(StubRoutines::aarch64::copy_byte_f());
 1449       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1450         __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
 1451       } else {
 1452         __ bl(StubRoutines::aarch64::copy_oop_f());
 1453       }
 1454     } else {
 1455       if (type != T_OBJECT) {
 1456         __ bl(StubRoutines::aarch64::copy_byte_b());
 1457       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1458         __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
 1459       } else {
 1460         __ bl(StubRoutines::aarch64::copy_oop_b());
 1461       }
 1462     }
 1463 
 1464     // And the tail.
 1465     copy_memory_small(decorators, type, s, d, count, step);
 1466 
 1467     if (granularity >= 8) __ bind(copy8);
 1468     if (granularity >= 4) __ bind(copy4);
 1469     __ bind(finish);
 1470   }
 1471 
 1472 
 1473   void clobber_registers() {
 1474 #ifdef ASSERT
 1475     RegSet clobbered
 1476       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1477     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1478     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1479     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1480       __ mov(*it, rscratch1);
 1481     }
 1482 #endif
 1483 
 1484   }
 1485 
 1486   // Scan over array at a for count oops, verifying each one.
 1487   // Preserves a and count; clobbers temp, rscratch1 and rscratch2.
 1488   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1489     Label loop, end;
 1490     __ mov(rscratch1, a);
 1491     __ mov(rscratch2, zr);
 1492     __ bind(loop);
 1493     __ cmp(rscratch2, count);
 1494     __ br(Assembler::HS, end);
 1495     if (size == wordSize) {
 1496       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1497       __ verify_oop(temp);
 1498     } else {
 1499       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1500       __ decode_heap_oop(temp); // calls verify_oop
 1501     }
 1502     __ add(rscratch2, rscratch2, 1);
 1503     __ b(loop);
 1504     __ bind(end);
 1505   }
 1506 
 1507   // Arguments:
 1508   //   stub_id - is used to name the stub and identify all details of
 1509   //             how to perform the copy.
 1510   //
 1511   //   nopush_entry - is assigned to the stub's post-push entry point unless
 1512   //           it is null
 1513   //
 1514   // Inputs:
 1515   //   c_rarg0   - source array address
 1516   //   c_rarg1   - destination array address
 1517   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1518   //
 1519   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1520   // the hardware handle it.  The two dwords within qwords that span
 1521   // cache line boundaries will still be loaded and stored atomically.
 1522   //
 1523   // Side Effects: nopush_entry is set to the (post-push) entry point
 1524   //               so it can be used by the corresponding conjoint
 1525   //               copy method
 1526   //
 1527   address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
 1528     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1529     RegSet saved_reg = RegSet::of(s, d, count);
 1530     int size;
 1531     bool aligned;
 1532     bool is_oop;
 1533     bool dest_uninitialized;
 1534     switch (stub_id) {
 1535     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1536       size = sizeof(jbyte);
 1537       aligned = false;
 1538       is_oop = false;
 1539       dest_uninitialized = false;
 1540       break;
 1541     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1542       size = sizeof(jbyte);
 1543       aligned = true;
 1544       is_oop = false;
 1545       dest_uninitialized = false;
 1546       break;
 1547     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1548       size = sizeof(jshort);
 1549       aligned = false;
 1550       is_oop = false;
 1551       dest_uninitialized = false;
 1552       break;
 1553     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1554       size = sizeof(jshort);
 1555       aligned = true;
 1556       is_oop = false;
 1557       dest_uninitialized = false;
 1558       break;
 1559     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1560       size = sizeof(jint);
 1561       aligned = false;
 1562       is_oop = false;
 1563       dest_uninitialized = false;
 1564       break;
 1565     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1566       size = sizeof(jint);
 1567       aligned = true;
 1568       is_oop = false;
 1569       dest_uninitialized = false;
 1570       break;
 1571     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1572       // since this is always aligned we can (should!) use the same
 1573       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1574       ShouldNotReachHere();
 1575       break;
 1576     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1577       size = sizeof(jlong);
 1578       aligned = true;
 1579       is_oop = false;
 1580       dest_uninitialized = false;
 1581       break;
 1582     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1583       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1584       aligned = !UseCompressedOops;
 1585       is_oop = true;
 1586       dest_uninitialized = false;
 1587       break;
 1588     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1589       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1590       aligned = !UseCompressedOops;
 1591       is_oop = true;
 1592       dest_uninitialized = false;
 1593       break;
 1594     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1595       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1596       aligned = !UseCompressedOops;
 1597       is_oop = true;
 1598       dest_uninitialized = true;
 1599       break;
 1600     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1601       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1602       aligned = !UseCompressedOops;
 1603       is_oop = true;
 1604       dest_uninitialized = true;
 1605       break;
 1606     default:
 1607       ShouldNotReachHere();
 1608       break;
 1609     }
 1610 
 1611     __ align(CodeEntryAlignment);
 1612     StubCodeMark mark(this, stub_id);
 1613     address start = __ pc();
 1614     __ enter();
 1615 
 1616     if (nopush_entry != nullptr) {
 1617       *nopush_entry = __ pc();
 1618       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1619       BLOCK_COMMENT("Entry:");
 1620     }
 1621 
 1622     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1623     if (dest_uninitialized) {
 1624       decorators |= IS_DEST_UNINITIALIZED;
 1625     }
 1626     if (aligned) {
 1627       decorators |= ARRAYCOPY_ALIGNED;
 1628     }
 1629 
 1630     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1631     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1632 
 1633     if (is_oop) {
 1634       // save regs before copy_memory
 1635       __ push(RegSet::of(d, count), sp);
 1636     }
 1637     {
 1638       // UnsafeMemoryAccess page error: continue after unsafe access
 1639       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1640       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1641       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1642     }
 1643 
 1644     if (is_oop) {
 1645       __ pop(RegSet::of(d, count), sp);
 1646       if (VerifyOops)
 1647         verify_oop_array(size, d, count, r16);
 1648     }
 1649 
 1650     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1651 
 1652     __ leave();
 1653     __ mov(r0, zr); // return 0
 1654     __ ret(lr);
 1655     return start;
 1656   }
 1657 
 1658   // Arguments:
 1659   //   stub_id - is used to name the stub and identify all details of
 1660   //             how to perform the copy.
 1661   //
 1662   //   nooverlap_target - identifies the (post-push) entry for the
 1663   //             corresponding disjoint copy routine which can be
 1664   //             jumped to if the ranges do not actually overlap
 1665   //
 1666   //   nopush_entry - is assigned to the stub's post-push entry point unless
 1667   //           it is null
 1668   //
 1669   //
 1670   // Inputs:
 1671   //   c_rarg0   - source array address
 1672   //   c_rarg1   - destination array address
 1673   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1674   //
 1675   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1676   // the hardware handle it.  The two dwords within qwords that span
 1677   // cache line boundaries will still be loaded and stored atomically.
 1678   //
 1679   // Side Effects:
 1680   //   nopush_entry is set to the (post-push) entry point so it can be
 1681   //   used by the generic and unsafe copy stubs
 1682   //
 1683   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
 1684     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1685     RegSet saved_regs = RegSet::of(s, d, count);
 1686     int size;
 1687     bool aligned;
 1688     bool is_oop;
 1689     bool dest_uninitialized;
 1690     switch (stub_id) {
 1691     case StubId::stubgen_jbyte_arraycopy_id:
 1692       size = sizeof(jbyte);
 1693       aligned = false;
 1694       is_oop = false;
 1695       dest_uninitialized = false;
 1696       break;
 1697     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 1698       size = sizeof(jbyte);
 1699       aligned = true;
 1700       is_oop = false;
 1701       dest_uninitialized = false;
 1702       break;
 1703     case StubId::stubgen_jshort_arraycopy_id:
 1704       size = sizeof(jshort);
 1705       aligned = false;
 1706       is_oop = false;
 1707       dest_uninitialized = false;
 1708       break;
 1709     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 1710       size = sizeof(jshort);
 1711       aligned = true;
 1712       is_oop = false;
 1713       dest_uninitialized = false;
 1714       break;
 1715     case StubId::stubgen_jint_arraycopy_id:
 1716       size = sizeof(jint);
 1717       aligned = false;
 1718       is_oop = false;
 1719       dest_uninitialized = false;
 1720       break;
 1721     case StubId::stubgen_arrayof_jint_arraycopy_id:
 1722       size = sizeof(jint);
 1723       aligned = true;
 1724       is_oop = false;
 1725       dest_uninitialized = false;
 1726       break;
 1727     case StubId::stubgen_jlong_arraycopy_id:
 1728       // since this is always aligned we can (should!) use the same
 1729       // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
 1730       ShouldNotReachHere();
 1731       break;
 1732     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 1733       size = sizeof(jlong);
 1734       aligned = true;
 1735       is_oop = false;
 1736       dest_uninitialized = false;
 1737       break;
 1738     case StubId::stubgen_oop_arraycopy_id:
 1739       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1740       aligned = !UseCompressedOops;
 1741       is_oop = true;
 1742       dest_uninitialized = false;
 1743       break;
 1744     case StubId::stubgen_arrayof_oop_arraycopy_id:
 1745       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1746       aligned = !UseCompressedOops;
 1747       is_oop = true;
 1748       dest_uninitialized = false;
 1749       break;
 1750     case StubId::stubgen_oop_arraycopy_uninit_id:
 1751       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1752       aligned = !UseCompressedOops;
 1753       is_oop = true;
 1754       dest_uninitialized = true;
 1755       break;
 1756     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 1757       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1758       aligned = !UseCompressedOops;
 1759       is_oop = true;
 1760       dest_uninitialized = true;
 1761       break;
 1762     default:
 1763       ShouldNotReachHere();
 1764     }
 1765 
 1766     StubCodeMark mark(this, stub_id);
 1767     address start = __ pc();
 1768     __ enter();
 1769 
 1770     if (nopush_entry != nullptr) {
 1771       *nopush_entry = __ pc();
 1772       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1773       BLOCK_COMMENT("Entry:");
 1774     }
 1775 
 1776     // use fwd copy when (d-s) above_equal (count*size)
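    // e.g. (illustrative) with s == 0x1000, d == 0x1040 and a copy of
    // 0x20 bytes: d - s == 0x40 >= 0x20, so the regions cannot overlap and
    // we branch to the disjoint (forward) stub.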
 1777     Label L_overlapping;
 1778     __ sub(rscratch1, d, s);
 1779     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1780     __ br(Assembler::LO, L_overlapping);
 1781     __ b(RuntimeAddress(nooverlap_target));
 1782     __ bind(L_overlapping);
 1783 
 1784     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1785     if (dest_uninitialized) {
 1786       decorators |= IS_DEST_UNINITIALIZED;
 1787     }
 1788     if (aligned) {
 1789       decorators |= ARRAYCOPY_ALIGNED;
 1790     }
 1791 
 1792     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1793     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1794 
 1795     if (is_oop) {
 1796       // save regs before copy_memory
 1797       __ push(RegSet::of(d, count), sp);
 1798     }
 1799     {
 1800       // UnsafeMemoryAccess page error: continue after unsafe access
 1801       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1802       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1803       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1804     }
 1805     if (is_oop) {
 1806       __ pop(RegSet::of(d, count), sp);
 1807       if (VerifyOops)
 1808         verify_oop_array(size, d, count, r16);
 1809     }
 1810     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1811     __ leave();
 1812     __ mov(r0, zr); // return 0
 1813     __ ret(lr);
 1814     return start;
 1815   }
 1816 
 1817   // Helper for generating a dynamic type check.
 1818   // Smashes rscratch1, rscratch2.
 1819   void generate_type_check(Register sub_klass,
 1820                            Register super_check_offset,
 1821                            Register super_klass,
 1822                            Register temp1,
 1823                            Register temp2,
 1824                            Register result,
 1825                            Label& L_success) {
 1826     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1827 
 1828     BLOCK_COMMENT("type_check:");
 1829 
 1830     Label L_miss;
 1831 
 1832     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1833                                      super_check_offset);
 1834     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1835 
 1836     // Fall through on failure!
 1837     __ BIND(L_miss);
 1838   }
 1839 
 1840   //
 1841   //  Generate checkcasting array copy stub
 1842   //
 1843   //  Input:
 1844   //    c_rarg0   - source array address
 1845   //    c_rarg1   - destination array address
 1846   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1847   //    c_rarg3   - size_t ckoff (super_check_offset)
 1848   //    c_rarg4   - oop ckval (super_klass)
 1849   //
 1850   //  Output:
 1851   //    r0 ==  0  -  success
 1852   //    r0 == -1^K - failure, where K is partial transfer count
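  //    (e.g., illustrative: if the type check fails after 5 elements were
  //     stored, K == 5 and r0 == ~5 == -6; the caller recovers K as ~r0)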
 1853   //
 1854   address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
 1855     bool dest_uninitialized;
 1856     switch (stub_id) {
 1857     case StubId::stubgen_checkcast_arraycopy_id:
 1858       dest_uninitialized = false;
 1859       break;
 1860     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 1861       dest_uninitialized = true;
 1862       break;
 1863     default:
 1864       ShouldNotReachHere();
 1865     }
 1866 
 1867     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1868 
 1869     // Input registers (after setup_arg_regs)
 1870     const Register from        = c_rarg0;   // source array address
 1871     const Register to          = c_rarg1;   // destination array address
 1872     const Register count       = c_rarg2;   // elements count
 1873     const Register ckoff       = c_rarg3;   // super_check_offset
 1874     const Register ckval       = c_rarg4;   // super_klass
 1875 
 1876     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1877     RegSet wb_post_saved_regs = RegSet::of(count);
 1878 
 1879     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1880     const Register copied_oop  = r22;       // actual oop copied
 1881     const Register count_save  = r21;       // orig elements count
 1882     const Register start_to    = r20;       // destination array start address
 1883     const Register r19_klass   = r19;       // oop._klass
 1884 
 1885     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1886     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1887 
 1888     //---------------------------------------------------------------
 1889     // Assembler stub will be used for this call to arraycopy
 1890     // if the two arrays are subtypes of Object[] but the
 1891     // destination array type is not equal to or a supertype
 1892     // of the source type.  Each element must be separately
 1893     // checked.
 1894 
 1895     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1896                                copied_oop, r19_klass, count_save);
 1897 
 1898     __ align(CodeEntryAlignment);
 1899     StubCodeMark mark(this, stub_id);
 1900     address start = __ pc();
 1901 
 1902     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1903 
 1904 #ifdef ASSERT
 1905     // caller guarantees that the arrays really are different
 1906     // otherwise, we would have to make conjoint checks
 1907     { Label L;
 1908       __ b(L);                  // conjoint check not yet implemented
 1909       __ stop("checkcast_copy within a single array");
 1910       __ bind(L);
 1911     }
 1912 #endif //ASSERT
 1913 
 1914     // Caller of this entry point must set up the argument registers.
 1915     if (nopush_entry != nullptr) {
 1916       *nopush_entry = __ pc();
 1917       BLOCK_COMMENT("Entry:");
 1918     }
 1919 
 1920      // Empty array:  Nothing to do.
 1921     __ cbz(count, L_done);
 1922     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1923 
 1924 #ifdef ASSERT
 1925     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1926     // The ckoff and ckval must be mutually consistent,
 1927     // even though caller generates both.
 1928     { Label L;
 1929       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1930       __ ldrw(start_to, Address(ckval, sco_offset));
 1931       __ cmpw(ckoff, start_to);
 1932       __ br(Assembler::EQ, L);
 1933       __ stop("super_check_offset inconsistent");
 1934       __ bind(L);
 1935     }
 1936 #endif //ASSERT
 1937 
 1938     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1939     bool is_oop = true;
 1940     int element_size = UseCompressedOops ? 4 : 8;
 1941     if (dest_uninitialized) {
 1942       decorators |= IS_DEST_UNINITIALIZED;
 1943     }
 1944 
 1945     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1946     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1947 
 1948     // save the original count
 1949     __ mov(count_save, count);
 1950 
 1951     // Copy from low to high addresses
 1952     __ mov(start_to, to);              // Save destination array start address
 1953     __ b(L_load_element);
 1954 
 1955     // ======== begin loop ========
 1956     // (Loop is rotated; its entry is L_load_element.)
 1957     // Loop control:
 1958     //   for (; count != 0; count--) {
 1959     //     copied_oop = load_heap_oop(from++);
 1960     //     ... generate_type_check ...;
 1961     //     store_heap_oop(to++, copied_oop);
 1962     //   }
 1963     __ align(OptoLoopAlignment);
 1964 
 1965     __ BIND(L_store_element);
 1966     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1967                       __ post(to, element_size), copied_oop, noreg,
 1968                       gct1, gct2, gct3);
 1969     __ sub(count, count, 1);
 1970     __ cbz(count, L_do_card_marks);
 1971 
 1972     // ======== loop entry is here ========
 1973     __ BIND(L_load_element);
 1974     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1975                      copied_oop, noreg, __ post(from, element_size),
 1976                      gct1);
 1977     __ cbz(copied_oop, L_store_element);
 1978 
 1979     __ load_klass(r19_klass, copied_oop);// query the object klass
 1980 
 1981     BLOCK_COMMENT("type_check:");
 1982     generate_type_check(/*sub_klass*/r19_klass,
 1983                         /*super_check_offset*/ckoff,
 1984                         /*super_klass*/ckval,
 1985                         /*r_array_base*/gct1,
 1986                         /*temp2*/gct2,
 1987                         /*result*/r10, L_store_element);
 1988 
 1989     // Fall through on failure!
 1990 
 1991     // ======== end loop ========
 1992 
 1993     // It was a real error; we must depend on the caller to finish the job.
 1994     // Register count = remaining oops, count_save = total oops.
 1995     // Emit GC store barriers for the oops we have copied and report
 1996     // their number to the caller.
 1997 
 1998     __ subs(count, count_save, count);     // K = partially copied oop count
 1999     __ eon(count, count, zr);              // report (-1^K) to caller
 2000     __ br(Assembler::EQ, L_done_pop);
 2001 
 2002     __ BIND(L_do_card_marks);
 2003     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2004 
 2005     __ bind(L_done_pop);
 2006     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2007     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2008 
 2009     __ bind(L_done);
 2010     __ mov(r0, count);
 2011     __ leave();
 2012     __ ret(lr);
 2013 
 2014     return start;
 2015   }
 2016 
 2017   // Perform range checks on the proposed arraycopy.
 2018   // Kills temp, but nothing else.
 2019   // Also, clean the sign bits of src_pos and dst_pos.
 2020   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2021                               Register src_pos, // source position (c_rarg1)
 2022                               Register dst,     // destination array oop (c_rarg2)
 2023                               Register dst_pos, // destination position (c_rarg3)
 2024                               Register length,
 2025                               Register temp,
 2026                               Label& L_failed) {
 2027     BLOCK_COMMENT("arraycopy_range_checks:");
 2028 
 2029     assert_different_registers(rscratch1, temp);
 2030 
 2031     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2032     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2033     __ addw(temp, length, src_pos);
 2034     __ cmpw(temp, rscratch1);
 2035     __ br(Assembler::HI, L_failed);
 2036 
 2037     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2038     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2039     __ addw(temp, length, dst_pos);
 2040     __ cmpw(temp, rscratch1);
 2041     __ br(Assembler::HI, L_failed);
 2042 
 2043     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2044     __ movw(src_pos, src_pos);
 2045     __ movw(dst_pos, dst_pos);
 2046 
 2047     BLOCK_COMMENT("arraycopy_range_checks done");
 2048   }
 2049 
 2050   // These stubs get called from some dumb test routine.
 2051   // I'll write them properly when they're called from
 2052   // something that's actually doing something.
 2053   static void fake_arraycopy_stub(address src, address dst, int count) {
 2054     assert(count == 0, "huh?");
 2055   }
 2056 
 2057 
 2058   //
 2059   //  Generate 'unsafe' array copy stub
 2060   //  Though just as safe as the other stubs, it takes an unscaled
 2061   //  size_t argument instead of an element count.
 2062   //
 2063   //  Input:
 2064   //    c_rarg0   - source array address
 2065   //    c_rarg1   - destination array address
 2066   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2067   //
 2068   // Examines the alignment of the operands and dispatches
 2069   // to a long, int, short, or byte copy loop.
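  // For example (illustrative): if the addresses and byte count are all
  // multiples of 4 but not all multiples of 8, (s|d|count) & 7 is non-zero
  // while (s|d|count) & 3 is zero, so the int copy entry is taken with the
  // byte count scaled down to an int count.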
 2070   //
 2071   address generate_unsafe_copy(address byte_copy_entry,
 2072                                address short_copy_entry,
 2073                                address int_copy_entry,
 2074                                address long_copy_entry) {
 2075     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2076 
 2077     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2078     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2079 
 2080     __ align(CodeEntryAlignment);
 2081     StubCodeMark mark(this, stub_id);
 2082     address start = __ pc();
 2083     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2084 
 2085     // bump this on entry, not on exit:
 2086     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2087 
 2088     __ orr(rscratch1, s, d);
 2089     __ orr(rscratch1, rscratch1, count);
 2090 
 2091     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2092     __ cbz(rscratch1, L_long_aligned);
 2093     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2094     __ cbz(rscratch1, L_int_aligned);
 2095     __ tbz(rscratch1, 0, L_short_aligned);
 2096     __ b(RuntimeAddress(byte_copy_entry));
 2097 
 2098     __ BIND(L_short_aligned);
 2099     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2100     __ b(RuntimeAddress(short_copy_entry));
 2101     __ BIND(L_int_aligned);
 2102     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2103     __ b(RuntimeAddress(int_copy_entry));
 2104     __ BIND(L_long_aligned);
 2105     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2106     __ b(RuntimeAddress(long_copy_entry));
 2107 
 2108     return start;
 2109   }
 2110 
 2111   //
 2112   //  Generate generic array copy stubs
 2113   //
 2114   //  Input:
 2115   //    c_rarg0    -  src oop
 2116   //    c_rarg1    -  src_pos (32-bits)
 2117   //    c_rarg2    -  dst oop
 2118   //    c_rarg3    -  dst_pos (32-bits)
 2119   //    c_rarg4    -  element count (32-bits)
 2120   //
 2121   //  Output:
 2122   //    r0 ==  0  -  success
 2123   //    r0 == -1^K - failure, where K is partial transfer count
 2124   //
 2125   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2126                                 address int_copy_entry, address oop_copy_entry,
 2127                                 address long_copy_entry, address checkcast_copy_entry) {
 2128     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2129 
 2130     Label L_failed, L_objArray;
 2131     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2132 
 2133     // Input registers
 2134     const Register src        = c_rarg0;  // source array oop
 2135     const Register src_pos    = c_rarg1;  // source position
 2136     const Register dst        = c_rarg2;  // destination array oop
 2137     const Register dst_pos    = c_rarg3;  // destination position
 2138     const Register length     = c_rarg4;
 2139 
 2140 
 2141     // Registers used as temps
 2142     const Register dst_klass  = c_rarg5;
 2143 
 2144     __ align(CodeEntryAlignment);
 2145 
 2146     StubCodeMark mark(this, stub_id);
 2147 
 2148     address start = __ pc();
 2149 
 2150     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2151 
 2152     // bump this on entry, not on exit:
 2153     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2154 
 2155     //-----------------------------------------------------------------------
 2156     // Assembler stub will be used for this call to arraycopy
 2157     // if the following conditions are met:
 2158     //
 2159     // (1) src and dst must not be null.
 2160     // (2) src_pos must not be negative.
 2161     // (3) dst_pos must not be negative.
 2162     // (4) length  must not be negative.
 2163     // (5) src klass and dst klass should be the same and not null.
 2164     // (6) src and dst should be arrays.
 2165     // (7) src_pos + length must not exceed length of src.
 2166     // (8) dst_pos + length must not exceed length of dst.
 2167     //
 2168 
 2169     //  if (src == nullptr) return -1;
 2170     __ cbz(src, L_failed);
 2171 
 2172     //  if (src_pos < 0) return -1;
 2173     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2174 
 2175     //  if (dst == nullptr) return -1;
 2176     __ cbz(dst, L_failed);
 2177 
 2178     //  if (dst_pos < 0) return -1;
 2179     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2180 
 2181     // registers used as temp
 2182     const Register scratch_length    = r16; // elements count to copy
 2183     const Register scratch_src_klass = r17; // array klass
 2184     const Register lh                = r15; // layout helper
 2185 
 2186     //  if (length < 0) return -1;
 2187     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2188     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2189 
 2190     __ load_klass(scratch_src_klass, src);
 2191 #ifdef ASSERT
 2192     //  assert(src->klass() != nullptr);
 2193     {
 2194       BLOCK_COMMENT("assert klasses not null {");
 2195       Label L1, L2;
 2196       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2197       __ bind(L1);
 2198       __ stop("broken null klass");
 2199       __ bind(L2);
 2200       __ load_klass(rscratch1, dst);
 2201       __ cbz(rscratch1, L1);     // this would be broken also
 2202       BLOCK_COMMENT("} assert klasses not null done");
 2203     }
 2204 #endif
 2205 
 2206     // Load layout helper (32-bits)
 2207     //
 2208     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2209     // 32        30    24            16              8     2                 0
 2210     //
 2211     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2212     //
 2213 
 2214     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2215 
 2216     // Handle objArrays completely differently...
 2217     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2218     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2219     __ movw(rscratch1, objArray_lh);
 2220     __ eorw(rscratch2, lh, rscratch1);
 2221     __ cbzw(rscratch2, L_objArray);
 2222 
 2223     //  if (src->klass() != dst->klass()) return -1;
 2224     __ load_klass(rscratch2, dst);
 2225     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2226     __ cbnz(rscratch2, L_failed);
 2227 
 2228     //  if (!src->is_Array()) return -1;
 2229     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2230 
 2231     // At this point, it is known to be a typeArray (array_tag 0x3).
 2232 #ifdef ASSERT
 2233     {
 2234       BLOCK_COMMENT("assert primitive array {");
 2235       Label L;
 2236       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2237       __ cmpw(lh, rscratch2);
 2238       __ br(Assembler::GE, L);
 2239       __ stop("must be a primitive array");
 2240       __ bind(L);
 2241       BLOCK_COMMENT("} assert primitive array done");
 2242     }
 2243 #endif
 2244 
 2245     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2246                            rscratch2, L_failed);
 2247 
 2248     // TypeArrayKlass
 2249     //
 2250     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2251     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2252     //
 2253 
 2254     const Register rscratch1_offset = rscratch1;    // array offset
 2255     const Register r15_elsize = lh; // element size
 2256 
 2257     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2258            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2259     __ add(src, src, rscratch1_offset);           // src array offset
 2260     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2261     BLOCK_COMMENT("choose copy loop based on element size");
 2262 
 2263     // next registers should be set before the jump to corresponding stub
 2264     const Register from     = c_rarg0;  // source array address
 2265     const Register to       = c_rarg1;  // destination array address
 2266     const Register count    = c_rarg2;  // elements count
 2267 
 2268     // 'from', 'to', 'count' registers should be set in such order
 2269     // since they are the same as 'src', 'src_pos', 'dst'.
 2270 
 2271     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2272 
 2273     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2274     // size in bytes).  We do a simple bitwise binary search.
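    // (Illustrative mapping: elsize 0 = byte, 1 = short, 2 = int, 3 = long;
    // bit 1 selects the int/long pair, bit 0 then picks within the pair.)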
 2275   __ BIND(L_copy_bytes);
 2276     __ tbnz(r15_elsize, 1, L_copy_ints);
 2277     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2278     __ lea(from, Address(src, src_pos));// src_addr
 2279     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2280     __ movw(count, scratch_length); // length
 2281     __ b(RuntimeAddress(byte_copy_entry));
 2282 
 2283   __ BIND(L_copy_shorts);
 2284     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2285     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2286     __ movw(count, scratch_length); // length
 2287     __ b(RuntimeAddress(short_copy_entry));
 2288 
 2289   __ BIND(L_copy_ints);
 2290     __ tbnz(r15_elsize, 0, L_copy_longs);
 2291     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2292     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2293     __ movw(count, scratch_length); // length
 2294     __ b(RuntimeAddress(int_copy_entry));
 2295 
 2296   __ BIND(L_copy_longs);
 2297 #ifdef ASSERT
 2298     {
 2299       BLOCK_COMMENT("assert long copy {");
 2300       Label L;
 2301       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2302       __ cmpw(r15_elsize, LogBytesPerLong);
 2303       __ br(Assembler::EQ, L);
 2304       __ stop("must be long copy, but elsize is wrong");
 2305       __ bind(L);
 2306       BLOCK_COMMENT("} assert long copy done");
 2307     }
 2308 #endif
 2309     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2310     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2311     __ movw(count, scratch_length); // length
 2312     __ b(RuntimeAddress(long_copy_entry));
 2313 
 2314     // ObjArrayKlass
 2315   __ BIND(L_objArray);
 2316     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2317 
 2318     Label L_plain_copy, L_checkcast_copy;
 2319     //  test array classes for subtyping
 2320     __ load_klass(r15, dst);
 2321     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2322     __ br(Assembler::NE, L_checkcast_copy);
 2323 
 2324     // Identically typed arrays can be copied without element-wise checks.
 2325     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2326                            rscratch2, L_failed);
 2327 
 2328     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2329     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2330     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2331     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2332     __ movw(count, scratch_length); // length
 2333   __ BIND(L_plain_copy);
 2334     __ b(RuntimeAddress(oop_copy_entry));
 2335 
 2336   __ BIND(L_checkcast_copy);
 2337     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2338     {
 2339       // Before looking at dst.length, make sure dst is also an objArray.
 2340       __ ldrw(rscratch1, Address(r15, lh_offset));
 2341       __ movw(rscratch2, objArray_lh);
 2342       __ eorw(rscratch1, rscratch1, rscratch2);
 2343       __ cbnzw(rscratch1, L_failed);
 2344 
 2345       // It is safe to examine both src.length and dst.length.
 2346       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2347                              r15, L_failed);
 2348 
 2349       __ load_klass(dst_klass, dst); // reload
 2350 
 2351       // Marshal the base address arguments now, freeing registers.
 2352       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2353       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2354       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2355       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2356       __ movw(count, length);           // length (reloaded)
 2357       Register sco_temp = c_rarg3;      // this register is free now
 2358       assert_different_registers(from, to, count, sco_temp,
 2359                                  dst_klass, scratch_src_klass);
 2360       // assert_clean_int(count, sco_temp);
 2361 
 2362       // Generate the type check.
 2363       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2364       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2365 
 2366       // Smashes rscratch1, rscratch2
 2367       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2368                           L_plain_copy);
 2369 
 2370       // Fetch destination element klass from the ObjArrayKlass header.
 2371       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2372       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2373       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2374 
 2375       // the checkcast_copy loop needs two extra arguments:
 2376       assert(c_rarg3 == sco_temp, "#3 already in place");
 2377       // Set up arguments for checkcast_copy_entry.
 2378       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2379       __ b(RuntimeAddress(checkcast_copy_entry));
 2380     }
 2381 
 2382   __ BIND(L_failed);
 2383     __ mov(r0, -1);
 2384     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2385     __ ret(lr);
 2386 
 2387     return start;
 2388   }
 2389 
 2390   //
 2391   // Generate stub for array fill. If "aligned" is true, the
 2392   // "to" address is assumed to be heapword aligned.
 2393   //
 2394   // Arguments for generated stub:
 2395   //   to:    c_rarg0
 2396   //   value: c_rarg1
 2397   //   count: c_rarg2 treated as signed
 2398   //
 2399   address generate_fill(StubId stub_id) {
 2400     BasicType t;
 2401     bool aligned;
 2402 
 2403     switch (stub_id) {
 2404     case StubId::stubgen_jbyte_fill_id:
 2405       t = T_BYTE;
 2406       aligned = false;
 2407       break;
 2408     case StubId::stubgen_jshort_fill_id:
 2409       t = T_SHORT;
 2410       aligned = false;
 2411       break;
 2412     case StubId::stubgen_jint_fill_id:
 2413       t = T_INT;
 2414       aligned = false;
 2415       break;
 2416     case StubId::stubgen_arrayof_jbyte_fill_id:
 2417       t = T_BYTE;
 2418       aligned = true;
 2419       break;
 2420     case StubId::stubgen_arrayof_jshort_fill_id:
 2421       t = T_SHORT;
 2422       aligned = true;
 2423       break;
 2424     case StubId::stubgen_arrayof_jint_fill_id:
 2425       t = T_INT;
 2426       aligned = true;
 2427       break;
 2428     default:
 2429       ShouldNotReachHere();
 2430     };
 2431 
 2432     __ align(CodeEntryAlignment);
 2433     StubCodeMark mark(this, stub_id);
 2434     address start = __ pc();
 2435 
 2436     BLOCK_COMMENT("Entry:");
 2437 
 2438     const Register to        = c_rarg0;  // destination array address
 2439     const Register value     = c_rarg1;  // value
 2440     const Register count     = c_rarg2;  // elements count
 2441 
 2442     const Register bz_base = r10;        // base for block_zero routine
 2443     const Register cnt_words = r11;      // temp register
 2444 
 2445     __ enter();
 2446 
 2447     Label L_fill_elements, L_exit1;
 2448 
 2449     int shift = -1;
 2450     switch (t) {
 2451       case T_BYTE:
 2452         shift = 0;
 2453         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2454         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2455         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
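        // e.g. (illustrative) value == 0x5A widens to 0x5A5A, then 0x5A5A5A5A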
 2456         __ br(Assembler::LO, L_fill_elements);
 2457         break;
 2458       case T_SHORT:
 2459         shift = 1;
 2460         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2461         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2462         __ br(Assembler::LO, L_fill_elements);
 2463         break;
 2464       case T_INT:
 2465         shift = 2;
 2466         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2467         __ br(Assembler::LO, L_fill_elements);
 2468         break;
 2469       default: ShouldNotReachHere();
 2470     }
 2471 
 2472     // Align the destination address to an 8-byte boundary.
 2473     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2474     if (!aligned) {
 2475       switch (t) {
 2476         case T_BYTE:
 2477           // One byte misalignment happens only for byte arrays.
 2478           __ tbz(to, 0, L_skip_align1);
 2479           __ strb(value, Address(__ post(to, 1)));
 2480           __ subw(count, count, 1);
 2481           __ bind(L_skip_align1);
 2482           // Fallthrough
 2483         case T_SHORT:
 2484           // Two bytes misalignment happens only for byte and short (char) arrays.
 2485           __ tbz(to, 1, L_skip_align2);
 2486           __ strh(value, Address(__ post(to, 2)));
 2487           __ subw(count, count, 2 >> shift);
 2488           __ bind(L_skip_align2);
 2489           // Fallthrough
 2490         case T_INT:
 2491           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2492           __ tbz(to, 2, L_skip_align4);
 2493           __ strw(value, Address(__ post(to, 4)));
 2494           __ subw(count, count, 4 >> shift);
 2495           __ bind(L_skip_align4);
 2496           break;
 2497         default: ShouldNotReachHere();
 2498       }
 2499     }
 2500 
 2501     //
 2502     //  Fill large chunks
 2503     //
 2504     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2505     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2506     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
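    // e.g. (illustrative) a jshort fill with count == 37 at this point:
    // shift == 1, so cnt_words == 37 >> 2 == 9 eight-byte words (72 bytes)
    // and count is reduced to the single trailing element.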
 2507     if (UseBlockZeroing) {
 2508       Label non_block_zeroing, rest;
 2509       // If the fill value is zero we can use the fast zero_words().
 2510       __ cbnz(value, non_block_zeroing);
 2511       __ mov(bz_base, to);
 2512       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2513       address tpc = __ zero_words(bz_base, cnt_words);
 2514       if (tpc == nullptr) {
 2515         fatal("CodeCache is full at generate_fill");
 2516       }
 2517       __ b(rest);
 2518       __ bind(non_block_zeroing);
 2519       __ fill_words(to, cnt_words, value);
 2520       __ bind(rest);
 2521     } else {
 2522       __ fill_words(to, cnt_words, value);
 2523     }
 2524 
 2525     // Remaining count is less than 8 bytes. Fill it by a single store.
 2526     // Note that the total length is no less than 8 bytes.
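    // e.g. (illustrative) a byte fill with 3 bytes remaining: 'to' is moved
    // to the end of the array and the 8-byte store rewrites the last 8 bytes,
    // 5 of which already hold the fill value.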
 2527     if (t == T_BYTE || t == T_SHORT) {
 2528       Label L_exit1;
 2529       __ cbzw(count, L_exit1);
 2530       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2531       __ str(value, Address(to, -8));    // overwrite some elements
 2532       __ bind(L_exit1);
 2533       __ leave();
 2534       __ ret(lr);
 2535     }
 2536 
 2537     // Handle fills of less than 8 bytes.
 2538     Label L_fill_2, L_fill_4, L_exit2;
 2539     __ bind(L_fill_elements);
 2540     switch (t) {
 2541       case T_BYTE:
 2542         __ tbz(count, 0, L_fill_2);
 2543         __ strb(value, Address(__ post(to, 1)));
 2544         __ bind(L_fill_2);
 2545         __ tbz(count, 1, L_fill_4);
 2546         __ strh(value, Address(__ post(to, 2)));
 2547         __ bind(L_fill_4);
 2548         __ tbz(count, 2, L_exit2);
 2549         __ strw(value, Address(to));
 2550         break;
 2551       case T_SHORT:
 2552         __ tbz(count, 0, L_fill_4);
 2553         __ strh(value, Address(__ post(to, 2)));
 2554         __ bind(L_fill_4);
 2555         __ tbz(count, 1, L_exit2);
 2556         __ strw(value, Address(to));
 2557         break;
 2558       case T_INT:
 2559         __ cbzw(count, L_exit2);
 2560         __ strw(value, Address(to));
 2561         break;
 2562       default: ShouldNotReachHere();
 2563     }
 2564     __ bind(L_exit2);
 2565     __ leave();
 2566     __ ret(lr);
 2567     return start;
 2568   }
 2569 
 2570   address generate_unsafecopy_common_error_exit() {
 2571     address start_pc = __ pc();
 2572     __ leave();
 2573     __ mov(r0, 0);
 2574     __ ret(lr);
 2575     return start_pc;
 2576   }
 2577 
 2578   //
 2579   //  Generate 'unsafe' set memory stub
 2580   //  Though just as safe as the other stubs, it takes an unscaled
 2581   //  size_t (# bytes) argument instead of an element count.
 2582   //
 2583   //  This fill operation is atomicity preserving: as long as the
 2584   //  address supplied is sufficiently aligned, all writes of up to 64
 2585   //  bits in size are single-copy atomic.
 2586   //
 2587   //  Input:
 2588   //    c_rarg0   - destination array address
 2589   //    c_rarg1   - byte count (size_t)
 2590   //    c_rarg2   - byte value
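  //  For example (illustrative): a request to set 8 bytes at an 8-byte
  //  aligned address is handled by the single 8-byte strd in the tail
  //  sequence, i.e. by one store.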
 2591   //
 2592   address generate_unsafe_setmemory() {
 2593     __ align(CodeEntryAlignment);
 2594     StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
 2595     address start = __ pc();
 2596 
 2597     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 2598     Label tail;
 2599 
 2600     UnsafeMemoryAccessMark umam(this, true, false);
 2601 
 2602     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2603 
 2604     __ dup(v0, __ T16B, value);
 2605 
 2606     if (AvoidUnalignedAccesses) {
 2607       __ cmp(count, (u1)16);
 2608       __ br(__ LO, tail);
 2609 
 2610       __ mov(rscratch1, 16);
 2611       __ andr(rscratch2, dest, 15);
 2612       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
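      // e.g. (illustrative) if dest & 15 == 9, then rscratch1 == 7: the
      // unaligned 16-byte store below covers those 7 bytes (and more), and
      // dest is then advanced by 7 onto a 16-byte boundary.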
 2613       __ strq(v0, Address(dest));
 2614       __ sub(count, count, rscratch1);
 2615       __ add(dest, dest, rscratch1);
 2616     }
 2617 
 2618     __ subs(count, count, (u1)64);
 2619     __ br(__ LO, tail);
 2620     {
 2621       Label again;
 2622       __ bind(again);
 2623       __ stpq(v0, v0, Address(dest));
 2624       __ stpq(v0, v0, Address(dest, 32));
 2625 
 2626       __ subs(count, count, 64);
 2627       __ add(dest, dest, 64);
 2628       __ br(__ HS, again);
 2629     }
 2630 
 2631     __ bind(tail);
 2632     // The count of bytes is off by 64, but we don't need to correct
 2633     // it because we're only going to use the least-significant few
 2634     // count bits from here on.
 2635     // __ add(count, count, 64);
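    // e.g. (illustrative) if 37 bytes remain, count == 37 - 64, which is
    // congruent to 37 mod 64, so the bit tests below still see 32 + 4 + 1
    // and emit a 32-byte, a 4-byte and a 1-byte store.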
 2636 
 2637     {
 2638       Label dont;
 2639       __ tbz(count, exact_log2(32), dont);
 2640       __ stpq(v0, v0, __ post(dest, 32));
 2641       __ bind(dont);
 2642     }
 2643     {
 2644       Label dont;
 2645       __ tbz(count, exact_log2(16), dont);
 2646       __ strq(v0, __ post(dest, 16));
 2647       __ bind(dont);
 2648     }
 2649     {
 2650       Label dont;
 2651       __ tbz(count, exact_log2(8), dont);
 2652       __ strd(v0, __ post(dest, 8));
 2653       __ bind(dont);
 2654     }
 2655 
 2656     Label finished;
 2657     __ tst(count, 7);
 2658     __ br(__ EQ, finished);
 2659 
 2660     {
 2661       Label dont;
 2662       __ tbz(count, exact_log2(4), dont);
 2663       __ strs(v0, __ post(dest, 4));
 2664       __ bind(dont);
 2665     }
 2666     {
 2667       Label dont;
 2668       __ tbz(count, exact_log2(2), dont);
 2669       __ bfi(value, value, 8, 8);
 2670       __ strh(value, __ post(dest, 2));
 2671       __ bind(dont);
 2672     }
 2673     {
 2674       Label dont;
 2675       __ tbz(count, exact_log2(1), dont);
 2676       __ strb(value, Address(dest));
 2677       __ bind(dont);
 2678     }
 2679 
 2680     __ bind(finished);
 2681     __ leave();
 2682     __ ret(lr);
 2683 
 2684     return start;
 2685   }
 2686 
 2687   address generate_data_cache_writeback() {
 2688     const Register line        = c_rarg0;  // address of line to write back
 2689 
 2690     __ align(CodeEntryAlignment);
 2691 
 2692     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 2693     StubCodeMark mark(this, stub_id);
 2694 
 2695     address start = __ pc();
 2696     __ enter();
 2697     __ cache_wb(Address(line, 0));
 2698     __ leave();
 2699     __ ret(lr);
 2700 
 2701     return start;
 2702   }
 2703 
 2704   address generate_data_cache_writeback_sync() {
 2705     const Register is_pre     = c_rarg0;  // pre or post sync
 2706 
 2707     __ align(CodeEntryAlignment);
 2708 
 2709     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 2710     StubCodeMark mark(this, stub_id);
 2711 
 2712     // pre wbsync is a no-op
 2713     // post wbsync translates to an sfence
 2714 
 2715     Label skip;
 2716     address start = __ pc();
 2717     __ enter();
 2718     __ cbnz(is_pre, skip);
 2719     __ cache_wbsync(false);
 2720     __ bind(skip);
 2721     __ leave();
 2722     __ ret(lr);
 2723 
 2724     return start;
 2725   }
 2726 
 2727   void generate_arraycopy_stubs() {
 2728     // Some copy stubs publish a normal entry and then a 2nd 'fallback'
 2729     // entry immediately following their stack push. This can be used
 2730     // as a post-push branch target for compatible stubs when they
 2731     // identify a special case that can be handled by the fallback
 2732     // stub, e.g. a disjoint copy stub may be used as a special-case
 2733     // fallback for its compatible conjoint copy stub.
 2734     //
 2735     // A nopush entry is always returned in the following local and
 2736     // then published by assigning to the appropriate entry field in
 2737     // class StubRoutines. The entry value is then passed to the
 2738     // generator for the compatible stub. That means the entry must be
 2739     // listed when saving to/restoring from the AOT cache, ensuring
 2740     // that the inter-stub jumps are noted at AOT-cache save and
 2741     // relocated at AOT cache load.
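    // For example, the jbyte disjoint nopush entry published below is the
    // nooverlap_target passed when generating the conjoint jbyte stub.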
 2742     address nopush_entry;
 2743 
 2744     // generate the common exit first so later stubs can rely on it if
 2745     // they want an UnsafeMemoryAccess exit non-local to the stub
 2746     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 2747     // register the stub as the default exit with class UnsafeMemoryAccess
 2748     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 2749 
 2750     // generate and publish aarch64-specific bulk copy routines first
 2751     // so we can call them from other copy stubs
 2752     StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2753     StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2754 
 2755     StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2756     StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2757 
 2758     StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2759     StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2760 
 2761     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2762 
 2763     //*** jbyte
 2764     // Always need aligned and unaligned versions
 2765     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2766     // disjoint nopush entry is needed by conjoint copy
 2767     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2768     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
 2769     // conjoint nopush entry is needed by generic/unsafe copy
 2770     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
 2771     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2772     // disjoint arrayof nopush entry is needed by conjoint copy
 2773     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2774     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
 2775 
 2776     //*** jshort
 2777     // Always need aligned and unaligned versions
 2778     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
 2779     // disjoint nopush entry is needed by conjoint copy
 2780     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
 2781     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
 2782     // conjoint nopush entry is used by generic/unsafe copy
 2783     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
 2784     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
 2785     // disjoint arrayof nopush entry is needed by conjoint copy
 2786     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
 2787     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
 2788 
 2789     //*** jint
 2790     // Aligned versions
 2791     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
 2792     // disjoint arrayof nopush entry is needed by conjoint copy
 2793     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
 2794     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
 2795     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2796     // jint_arraycopy_nopush always points to the unaligned version
 2797     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
 2798     // disjoint nopush entry is needed by conjoint copy
 2799     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
 2800     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
 2801     // conjoint nopush entry is needed by generic/unsafe copy
 2802     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
 2803 
 2804     //*** jlong
 2805     // It is always aligned
 2806     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
 2807     // disjoint arrayof nopush entry is needed by conjoint copy
 2808     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
 2809     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
 2810     // conjoint nopush entry is needed by generic/unsafe copy
 2811     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
 2812     // disjoint normal/nopush and conjoint normal entries are not
 2813     // generated since the arrayof versions are the same
 2814     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2815     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
 2816     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2817 
 2818     //*** oops
 2819     {
 2820       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2821         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
 2822       // disjoint arrayof nopush entry is needed by conjoint copy
 2823       StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
 2824       StubRoutines::_arrayof_oop_arraycopy
 2825         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
 2826       // conjoint arrayof nopush entry is needed by generic/unsafe copy
 2827       StubRoutines::_oop_arraycopy_nopush = nopush_entry;
 2828       // Aligned versions without pre-barriers
 2829       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2830         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
 2831       // disjoint arrayof+uninit nopush entry is needed by conjoint copy
 2832       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
 2833       // note that we don't need a returned nopush entry because the
 2834       // generic/unsafe copy does not cater for uninit arrays.
 2835       StubRoutines::_arrayof_oop_arraycopy_uninit
 2836         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
 2837     }
 2838 
 2839     // for oop copies reuse arrayof entries for non-arrayof cases
 2840     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2841     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
 2842     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2843     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2844     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
 2845     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2846 
 2847     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
 2848     // checkcast nopush entry is needed by generic copy
 2849     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
 2850     // note that we don't need a returned nopush entry because the
 2851     // generic copy does not cater for uninit arrays.
 2852     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 2853 
 2854     // unsafe arraycopy may fall back on conjoint stubs
 2855     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2856                                                               StubRoutines::_jshort_arraycopy_nopush,
 2857                                                               StubRoutines::_jint_arraycopy_nopush,
 2858                                                               StubRoutines::_jlong_arraycopy_nopush);
 2859 
 2860     // generic arraycopy may fall back on conjoint stubs
 2861     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2862                                                                StubRoutines::_jshort_arraycopy_nopush,
 2863                                                                StubRoutines::_jint_arraycopy_nopush,
 2864                                                                StubRoutines::_oop_arraycopy_nopush,
 2865                                                                StubRoutines::_jlong_arraycopy_nopush,
 2866                                                                StubRoutines::_checkcast_arraycopy_nopush);
 2867 
 2868     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 2869     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 2870     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 2871     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 2872     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 2873     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 2874   }
 2875 
 2876   void generate_math_stubs() { Unimplemented(); }
 2877 
 2878   // Arguments:
 2879   //
 2880   // Inputs:
 2881   //   c_rarg0   - source byte array address
 2882   //   c_rarg1   - destination byte array address
 2883   //   c_rarg2   - K (key) in little endian int array
 2884   //
 2885   address generate_aescrypt_encryptBlock() {
 2886     __ align(CodeEntryAlignment);
 2887     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 2888     StubCodeMark mark(this, stub_id);
 2889 
 2890     const Register from        = c_rarg0;  // source array address
 2891     const Register to          = c_rarg1;  // destination array address
 2892     const Register key         = c_rarg2;  // key array address
 2893     const Register keylen      = rscratch1;
 2894 
 2895     address start = __ pc();
 2896     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2897 
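          // compute #rounds for AES based on the length of the key array (44, 52 or 60 ints)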
 2898     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2899 
 2900     __ aesenc_loadkeys(key, keylen);
 2901     __ aesecb_encrypt(from, to, keylen);
 2902 
 2903     __ mov(r0, 0);
 2904 
 2905     __ leave();
 2906     __ ret(lr);
 2907 
 2908     return start;
 2909   }
 2910 
 2911   // Arguments:
 2912   //
 2913   // Inputs:
 2914   //   c_rarg0   - source byte array address
 2915   //   c_rarg1   - destination byte array address
 2916   //   c_rarg2   - K (key) in little endian int array
 2917   //
 2918   address generate_aescrypt_decryptBlock() {
 2919     assert(UseAES, "need AES cryptographic extension support");
 2920     __ align(CodeEntryAlignment);
 2921     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 2922     StubCodeMark mark(this, stub_id);
 2923     Label L_doLast;
 2924 
 2925     const Register from        = c_rarg0;  // source array address
 2926     const Register to          = c_rarg1;  // destination array address
 2927     const Register key         = c_rarg2;  // key array address
 2928     const Register keylen      = rscratch1;
 2929 
 2930     address start = __ pc();
 2931     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2932 
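          // compute #rounds for AES based on the length of the key array (44, 52 or 60 ints)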
 2933     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2934 
 2935     __ aesecb_decrypt(from, to, key, keylen);
 2936 
 2937     __ mov(r0, 0);
 2938 
 2939     __ leave();
 2940     __ ret(lr);
 2941 
 2942     return start;
 2943   }
 2944 
 2945   // Arguments:
 2946   //
 2947   // Inputs:
 2948   //   c_rarg0   - source byte array address
 2949   //   c_rarg1   - destination byte array address
 2950   //   c_rarg2   - K (key) in little endian int array
 2951   //   c_rarg3   - r vector byte array address
 2952   //   c_rarg4   - input length
 2953   //
 2954   // Output:
 2955   //   x0        - input length
 2956   //
 2957   address generate_cipherBlockChaining_encryptAESCrypt() {
 2958     assert(UseAES, "need AES cryptographic extension support");
 2959     __ align(CodeEntryAlignment);
 2960     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 2961     StubCodeMark mark(this, stub_id);
 2962 
 2963     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2964 
 2965     const Register from        = c_rarg0;  // source array address
 2966     const Register to          = c_rarg1;  // destination array address
 2967     const Register key         = c_rarg2;  // key array address
 2968     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector array address
 2969                                            // and left holding the last encrypted block
 2970     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2971     const Register keylen      = rscratch1;
 2972 
 2973     address start = __ pc();
 2974 
 2975       __ enter();
 2976 
 2977       __ movw(rscratch2, len_reg); // save the input length so it can be returned in r0
 2978 
 2979       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2980 
 2981       __ ld1(v0, __ T16B, rvec);
 2982 
 2983       __ cmpw(keylen, 52);
 2984       __ br(Assembler::CC, L_loadkeys_44);
 2985       __ br(Assembler::EQ, L_loadkeys_52);
 2986 
 2987       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2988       __ rev32(v17, __ T16B, v17);
 2989       __ rev32(v18, __ T16B, v18);
 2990     __ BIND(L_loadkeys_52);
 2991       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2992       __ rev32(v19, __ T16B, v19);
 2993       __ rev32(v20, __ T16B, v20);
 2994     __ BIND(L_loadkeys_44);
 2995       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2996       __ rev32(v21, __ T16B, v21);
 2997       __ rev32(v22, __ T16B, v22);
 2998       __ rev32(v23, __ T16B, v23);
 2999       __ rev32(v24, __ T16B, v24);
 3000       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3001       __ rev32(v25, __ T16B, v25);
 3002       __ rev32(v26, __ T16B, v26);
 3003       __ rev32(v27, __ T16B, v27);
 3004       __ rev32(v28, __ T16B, v28);
 3005       __ ld1(v29, v30, v31, __ T16B, key);
 3006       __ rev32(v29, __ T16B, v29);
 3007       __ rev32(v30, __ T16B, v30);
 3008       __ rev32(v31, __ T16B, v31);
 3009 
 3010     __ BIND(L_aes_loop);
 3011       __ ld1(v1, __ T16B, __ post(from, 16));
 3012       __ eor(v0, __ T16B, v0, v1);
 3013 
 3014       __ br(Assembler::CC, L_rounds_44);
 3015       __ br(Assembler::EQ, L_rounds_52);
 3016 
 3017       __ aese(v0, v17); __ aesmc(v0, v0);
 3018       __ aese(v0, v18); __ aesmc(v0, v0);
 3019     __ BIND(L_rounds_52);
 3020       __ aese(v0, v19); __ aesmc(v0, v0);
 3021       __ aese(v0, v20); __ aesmc(v0, v0);
 3022     __ BIND(L_rounds_44);
 3023       __ aese(v0, v21); __ aesmc(v0, v0);
 3024       __ aese(v0, v22); __ aesmc(v0, v0);
 3025       __ aese(v0, v23); __ aesmc(v0, v0);
 3026       __ aese(v0, v24); __ aesmc(v0, v0);
 3027       __ aese(v0, v25); __ aesmc(v0, v0);
 3028       __ aese(v0, v26); __ aesmc(v0, v0);
 3029       __ aese(v0, v27); __ aesmc(v0, v0);
 3030       __ aese(v0, v28); __ aesmc(v0, v0);
 3031       __ aese(v0, v29); __ aesmc(v0, v0);
 3032       __ aese(v0, v30);
 3033       __ eor(v0, __ T16B, v0, v31);
 3034 
 3035       __ st1(v0, __ T16B, __ post(to, 16));
 3036 
 3037       __ subw(len_reg, len_reg, 16);
 3038       __ cbnzw(len_reg, L_aes_loop);
 3039 
 3040       __ st1(v0, __ T16B, rvec);
 3041 
 3042       __ mov(r0, rscratch2);
 3043 
 3044       __ leave();
 3045       __ ret(lr);
 3046 
 3047     return start;
 3048   }
 3049 
 3050   // Arguments:
 3051   //
 3052   // Inputs:
 3053   //   c_rarg0   - source byte array address
 3054   //   c_rarg1   - destination byte array address
 3055   //   c_rarg2   - K (key) in little endian int array
 3056   //   c_rarg3   - r vector byte array address
 3057   //   c_rarg4   - input length
 3058   //
 3059   // Output:
 3060   //   r0        - input length
 3061   //
 3062   address generate_cipherBlockChaining_decryptAESCrypt() {
 3063     assert(UseAES, "need AES cryptographic extension support");
 3064     __ align(CodeEntryAlignment);
 3065     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3066     StubCodeMark mark(this, stub_id);
 3067 
 3068     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3069 
 3070     const Register from        = c_rarg0;  // source array address
 3071     const Register to          = c_rarg1;  // destination array address
 3072     const Register key         = c_rarg2;  // key array address
 3073     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector array address
 3074                                            // and left holding the last input cipher block
 3075     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3076     const Register keylen      = rscratch1;
 3077 
 3078     address start = __ pc();
 3079 
 3080       __ enter();
 3081 
 3082       __ movw(rscratch2, len_reg); // save the input length so it can be returned in r0
 3083 
 3084       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3085 
 3086       __ ld1(v2, __ T16B, rvec);
 3087 
 3088       __ ld1(v31, __ T16B, __ post(key, 16));
 3089       __ rev32(v31, __ T16B, v31);
 3090 
 3091       __ cmpw(keylen, 52);
 3092       __ br(Assembler::CC, L_loadkeys_44);
 3093       __ br(Assembler::EQ, L_loadkeys_52);
 3094 
 3095       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3096       __ rev32(v17, __ T16B, v17);
 3097       __ rev32(v18, __ T16B, v18);
 3098     __ BIND(L_loadkeys_52);
 3099       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3100       __ rev32(v19, __ T16B, v19);
 3101       __ rev32(v20, __ T16B, v20);
 3102     __ BIND(L_loadkeys_44);
 3103       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3104       __ rev32(v21, __ T16B, v21);
 3105       __ rev32(v22, __ T16B, v22);
 3106       __ rev32(v23, __ T16B, v23);
 3107       __ rev32(v24, __ T16B, v24);
 3108       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3109       __ rev32(v25, __ T16B, v25);
 3110       __ rev32(v26, __ T16B, v26);
 3111       __ rev32(v27, __ T16B, v27);
 3112       __ rev32(v28, __ T16B, v28);
 3113       __ ld1(v29, v30, __ T16B, key);
 3114       __ rev32(v29, __ T16B, v29);
 3115       __ rev32(v30, __ T16B, v30);
 3116 
 3117     __ BIND(L_aes_loop);
 3118       __ ld1(v0, __ T16B, __ post(from, 16));
 3119       __ orr(v1, __ T16B, v0, v0);
 3120 
 3121       __ br(Assembler::CC, L_rounds_44);
 3122       __ br(Assembler::EQ, L_rounds_52);
 3123 
 3124       __ aesd(v0, v17); __ aesimc(v0, v0);
 3125       __ aesd(v0, v18); __ aesimc(v0, v0);
 3126     __ BIND(L_rounds_52);
 3127       __ aesd(v0, v19); __ aesimc(v0, v0);
 3128       __ aesd(v0, v20); __ aesimc(v0, v0);
 3129     __ BIND(L_rounds_44);
 3130       __ aesd(v0, v21); __ aesimc(v0, v0);
 3131       __ aesd(v0, v22); __ aesimc(v0, v0);
 3132       __ aesd(v0, v23); __ aesimc(v0, v0);
 3133       __ aesd(v0, v24); __ aesimc(v0, v0);
 3134       __ aesd(v0, v25); __ aesimc(v0, v0);
 3135       __ aesd(v0, v26); __ aesimc(v0, v0);
 3136       __ aesd(v0, v27); __ aesimc(v0, v0);
 3137       __ aesd(v0, v28); __ aesimc(v0, v0);
 3138       __ aesd(v0, v29); __ aesimc(v0, v0);
 3139       __ aesd(v0, v30);
 3140       __ eor(v0, __ T16B, v0, v31);
 3141       __ eor(v0, __ T16B, v0, v2);
 3142 
 3143       __ st1(v0, __ T16B, __ post(to, 16));
 3144       __ orr(v2, __ T16B, v1, v1);
 3145 
 3146       __ subw(len_reg, len_reg, 16);
 3147       __ cbnzw(len_reg, L_aes_loop);
 3148 
 3149       __ st1(v2, __ T16B, rvec);
 3150 
 3151       __ mov(r0, rscratch2);
 3152 
 3153       __ leave();
 3154       __ ret(lr);
 3155 
 3156     return start;
 3157   }
 3158 
 3159   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3160   // Inputs: in holds the 128-bit value and is preserved.
 3161   // The least-significant 64-bit word is in the upper dword of each vector.
 3162   // inc holds the 64-bit increment and is preserved; its lower dword must be zero.
 3163   // Output: result
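        // Carry-trick sketch: if the least-significant dword of in is
        // 0xFFFFFFFFFFFFFFFF and inc is 1, the addv wraps that dword to 0; the
        // unsigned HI compare then sets that lane of tmp to -1, ext moves the
        // -1 into the most-significant lane, and the subv adds the carry there.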
 3164   void be_add_128_64(FloatRegister result, FloatRegister in,
 3165                      FloatRegister inc, FloatRegister tmp) {
 3166     assert_different_registers(result, tmp, inc);
 3167 
 3168     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3169                                            // input
 3170     __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
 3171     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3172                                            // MSD == 0 (must be!) to LSD
 3173     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3174   }
 3175 
 3176   // CTR AES crypt.
 3177   // Arguments:
 3178   //
 3179   // Inputs:
 3180   //   c_rarg0   - source byte array address
 3181   //   c_rarg1   - destination byte array address
 3182   //   c_rarg2   - K (key) in little endian int array
 3183   //   c_rarg3   - counter vector byte array address
 3184   //   c_rarg4   - input length
 3185   //   c_rarg5   - saved encryptedCounter start
 3186   //   c_rarg6   - saved used length
 3187   //
 3188   // Output:
 3189   //   r0       - input length
 3190   //
 3191   address generate_counterMode_AESCrypt() {
 3192     const Register in = c_rarg0;
 3193     const Register out = c_rarg1;
 3194     const Register key = c_rarg2;
 3195     const Register counter = c_rarg3;
 3196     const Register saved_len = c_rarg4, len = r10;
 3197     const Register saved_encrypted_ctr = c_rarg5;
 3198     const Register used_ptr = c_rarg6, used = r12;
 3199 
 3200     const Register offset = r7;
 3201     const Register keylen = r11;
 3202 
 3203     const unsigned char block_size = 16;
 3204     const int bulk_width = 4;
 3205     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3206     // performance with larger data sizes, but it also means that the
 3207     // fast path isn't used until you have at least 8 blocks, and up
 3208     // to 127 bytes of data will be executed on the slow path. For
 3209     // to 127 bytes of data will be processed on the slow path. For
 3210     // blocks seems like a sensible compromise.
 3211 
 3212     // Algorithm:
 3213     //
 3214     //    if (len == 0) {
 3215     //        goto DONE;
 3216     //    }
 3217     //    int result = len;
 3218     //    do {
 3219     //        if (used >= blockSize) {
 3220     //            if (len >= bulk_width * blockSize) {
 3221     //                CTR_large_block();
 3222     //                if (len == 0)
 3223     //                    goto DONE;
 3224     //            }
 3225     //            for (;;) {
 3226     //                16ByteVector v0 = counter;
 3227     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3228     //                used = 0;
 3229     //                if (len < blockSize)
 3230     //                    break;    /* goto NEXT */
 3231     //                16ByteVector v1 = load16Bytes(in, offset);
 3232     //                v1 = v1 ^ encryptedCounter;
 3233     //                store16Bytes(v1, out, offset);
 3234     //                used = blockSize;
 3235     //                offset += blockSize;
 3236     //                len -= blockSize;
 3237     //                if (len == 0)
 3238     //                    goto DONE;
 3239     //            }
 3240     //        }
 3241     //      NEXT:
 3242     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3243     //        len--;
 3244     //    } while (len != 0);
 3245     //  DONE:
 3246     //    return result;
 3247     //
 3248     // CTR_large_block()
 3249     //    Wide bulk encryption of whole blocks.
 3250 
 3251     __ align(CodeEntryAlignment);
 3252     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3253     StubCodeMark mark(this, stub_id);
 3254     const address start = __ pc();
 3255     __ enter();
 3256 
 3257     Label DONE, CTR_large_block, large_block_return;
 3258     __ ldrw(used, Address(used_ptr));
 3259     __ cbzw(saved_len, DONE);
 3260 
 3261     __ mov(len, saved_len);
 3262     __ mov(offset, 0);
 3263 
 3264     // Compute #rounds for AES based on the length of the key array
 3265     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3266 
 3267     __ aesenc_loadkeys(key, keylen);
 3268 
 3269     {
 3270       Label L_CTR_loop, NEXT;
 3271 
 3272       __ bind(L_CTR_loop);
 3273 
 3274       __ cmp(used, block_size);
 3275       __ br(__ LO, NEXT);
 3276 
 3277       // Maybe we have a lot of data
 3278       __ subsw(rscratch1, len, bulk_width * block_size);
 3279       __ br(__ HS, CTR_large_block);
 3280       __ BIND(large_block_return);
 3281       __ cbzw(len, DONE);
 3282 
 3283       // Setup the counter
 3284       __ movi(v4, __ T4S, 0);
 3285       __ movi(v5, __ T4S, 1);
 3286       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3287 
 3288       // 128-bit big-endian increment
 3289       __ ld1(v0, __ T16B, counter);
 3290       __ rev64(v16, __ T16B, v0);
 3291       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3292       __ rev64(v16, __ T16B, v16);
 3293       __ st1(v16, __ T16B, counter);
 3294       // Previous counter value is in v0
 3295       // v4 contains { 0, 1 }
 3296 
 3297       {
 3298         // We have fewer than bulk_width blocks of data left. Encrypt
 3299         // them one by one until there is less than a full block
 3300         // remaining, being careful to save both the encrypted counter
 3301         // and the counter.
 3302 
 3303         Label inner_loop;
 3304         __ bind(inner_loop);
 3305         // Counter to encrypt is in v0
 3306         __ aesecb_encrypt(noreg, noreg, keylen);
 3307         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3308 
 3309         // Do we have a remaining full block?
 3310 
 3311         __ mov(used, 0);
 3312         __ cmp(len, block_size);
 3313         __ br(__ LO, NEXT);
 3314 
 3315         // Yes, we have a full block
 3316         __ ldrq(v1, Address(in, offset));
 3317         __ eor(v1, __ T16B, v1, v0);
 3318         __ strq(v1, Address(out, offset));
 3319         __ mov(used, block_size);
 3320         __ add(offset, offset, block_size);
 3321 
 3322         __ subw(len, len, block_size);
 3323         __ cbzw(len, DONE);
 3324 
 3325         // Increment the counter, store it back
 3326         __ orr(v0, __ T16B, v16, v16);
 3327         __ rev64(v16, __ T16B, v16);
 3328         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3329         __ rev64(v16, __ T16B, v16);
 3330         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3331 
 3332         __ b(inner_loop);
 3333       }
 3334 
 3335       __ BIND(NEXT);
 3336 
 3337       // Encrypt a single byte, and loop.
 3338       // We expect this to be a rare event.
 3339       __ ldrb(rscratch1, Address(in, offset));
 3340       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3341       __ eor(rscratch1, rscratch1, rscratch2);
 3342       __ strb(rscratch1, Address(out, offset));
 3343       __ add(offset, offset, 1);
 3344       __ add(used, used, 1);
 3345       __ subw(len, len, 1);
 3346       __ cbnzw(len, L_CTR_loop);
 3347     }
 3348 
 3349     __ bind(DONE);
 3350     __ strw(used, Address(used_ptr));
 3351     __ mov(r0, saved_len);
 3352 
 3353     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3354     __ ret(lr);
 3355 
 3356     // Bulk encryption
 3357 
 3358     __ BIND (CTR_large_block);
 3359     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3360 
 3361     if (bulk_width == 8) {
 3362       __ sub(sp, sp, 4 * 16);
 3363       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3364     }
 3365     __ sub(sp, sp, 4 * 16);
 3366     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3367     RegSet saved_regs = (RegSet::of(in, out, offset)
 3368                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3369     __ push(saved_regs, sp);
 3370     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3371     __ add(in, in, offset);
 3372     __ add(out, out, offset);
 3373 
 3374     // Keys should already be loaded into the correct registers
 3375 
 3376     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3377     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3378 
 3379     // AES/CTR loop
 3380     {
 3381       Label L_CTR_loop;
 3382       __ BIND(L_CTR_loop);
 3383 
 3384       // Setup the counters
 3385       __ movi(v8, __ T4S, 0);
 3386       __ movi(v9, __ T4S, 1);
 3387       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3388 
 3389       for (int i = 0; i < bulk_width; i++) {
 3390         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3391         __ rev64(v0_ofs, __ T16B, v16);
 3392         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3393       }
 3394 
 3395       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3396 
 3397       // Encrypt the counters
 3398       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3399 
 3400       if (bulk_width == 8) {
 3401         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3402       }
 3403 
 3404       // XOR the encrypted counters with the inputs
 3405       for (int i = 0; i < bulk_width; i++) {
 3406         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3407         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3408         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3409       }
 3410 
 3411       // Write the encrypted data
 3412       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3413       if (bulk_width == 8) {
 3414         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3415       }
 3416 
 3417       __ subw(len, len, 16 * bulk_width);
 3418       __ cbnzw(len, L_CTR_loop);
 3419     }
 3420 
 3421     // Save the counter back where it goes
 3422     __ rev64(v16, __ T16B, v16);
 3423     __ st1(v16, __ T16B, counter);
 3424 
 3425     __ pop(saved_regs, sp);
 3426 
 3427     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3428     if (bulk_width == 8) {
 3429       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3430     }
 3431 
 3432     __ andr(rscratch1, len, -16 * bulk_width);
 3433     __ sub(len, len, rscratch1);
 3434     __ add(offset, offset, rscratch1);
 3435     __ mov(used, 16);
 3436     __ strw(used, Address(used_ptr));
 3437     __ b(large_block_return);
 3438 
 3439     return start;
 3440   }
 3441 
 3442   // Vector AES Galois Counter Mode implementation. Parameters:
 3443   //
 3444   // in = c_rarg0
 3445   // len = c_rarg1
 3446   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3447   // out = c_rarg3
 3448   // key = c_rarg4
 3449   // state = c_rarg5 - GHASH.state
 3450   // subkeyHtbl = c_rarg6 - powers of H
 3451   // counter = c_rarg7 - 16 bytes of CTR
 3452   // return - number of processed bytes
 3453   address generate_galoisCounterMode_AESCrypt() {
 3454     Label ghash_polynomial; // local data generated after code
 3455 
 3456     __ align(CodeEntryAlignment);
 3457     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3458     StubCodeMark mark(this, stub_id);
 3459     address start = __ pc();
 3460     __ enter();
 3461 
 3462     const Register in = c_rarg0;
 3463     const Register len = c_rarg1;
 3464     const Register ct = c_rarg2;
 3465     const Register out = c_rarg3;
 3467 
 3468     const Register key = c_rarg4;
 3469     const Register state = c_rarg5;
 3470 
 3471     const Register subkeyHtbl = c_rarg6;
 3472 
 3473     const Register counter = c_rarg7;  // updated with the incremented counter at the end
 3474 
 3475     const Register keylen = r10;
 3476     // Save state before entering routine
 3477     __ sub(sp, sp, 4 * 16);
 3478     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3479     __ sub(sp, sp, 4 * 16);
 3480     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3481 
 3483     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3484     __ str(len, __ pre(sp, -2 * wordSize));
 3485 
 3486     Label DONE;
 3487     __ cbz(len, DONE);
 3488 
 3489     // Compute #rounds for AES based on the length of the key array
 3490     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3491 
 3492     __ aesenc_loadkeys(key, keylen);
 3493     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3494     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
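          // GCM increments only the rightmost 32 bits of the counter block, so each
          // 32-bit word is byte-reversed here and the loop below bumps lane 3 by one.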
 3495 
 3496     // AES/CTR loop
 3497     {
 3498       Label L_CTR_loop;
 3499       __ BIND(L_CTR_loop);
 3500 
 3501       // Setup the counters
 3502       __ movi(v8, __ T4S, 0);
 3503       __ movi(v9, __ T4S, 1);
 3504       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3505 
 3506       assert(v0->encoding() < v8->encoding(), "");
 3507       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3508         FloatRegister f = as_FloatRegister(i);
 3509         __ rev32(f, __ T16B, v16);
 3510         __ addv(v16, __ T4S, v16, v8);
 3511       }
 3512 
 3513       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3514 
 3515       // Encrypt the counters
 3516       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3517 
 3518       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3519 
 3520       // XOR the encrypted counters with the inputs
 3521       for (int i = 0; i < 8; i++) {
 3522         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3523         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3524         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3525       }
 3526       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3527       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3528 
 3529       __ subw(len, len, 16 * 8);
 3530       __ cbnzw(len, L_CTR_loop);
 3531     }
 3532 
 3533     __ rev32(v16, __ T16B, v16);
 3534     __ st1(v16, __ T16B, counter);
 3535 
 3536     __ ldr(len, Address(sp));
 3537     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3538 
 3539     // GHASH/CTR loop
 3540     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3541                                 len, /*unrolls*/4);
 3542 
 3543 #ifdef ASSERT
 3544     { Label L;
 3545       __ cmp(len, (unsigned char)0);
 3546       __ br(Assembler::EQ, L);
 3547       __ stop("stubGenerator: abort");
 3548       __ bind(L);
 3549     }
 3550 #endif
 3551 
 3552     __ bind(DONE);
 3553     // Return the number of bytes processed
 3554     __ ldr(r0, __ post(sp, 2 * wordSize));
 3555 
 3556     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3557     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3558 
 3559     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3560     __ ret(lr);
 3561 
 3562     // bind label and generate polynomial data
 3563     __ align(wordSize * 2);
 3564     __ bind(ghash_polynomial);
 3565     __ emit_int64(0x87);  // The low-order bits of the field
 3566                           // polynomial (i.e. p = z^7+z^2+z+1)
 3567                           // repeated in the low and high parts of a
 3568                           // 128-bit vector
 3569     __ emit_int64(0x87);
 3570 
 3571     return start;
 3572   }
 3573 
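        // Caches one 64-byte input block in eight 64-bit general-purpose
        // registers so the MD5 round helpers below can extract the sixteen
        // 32-bit message words without reloading them from memory.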
 3574   class Cached64Bytes {
 3575   private:
 3576     MacroAssembler *_masm;
 3577     Register _regs[8];
 3578 
 3579   public:
 3580     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 3581       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3582       auto it = rs.begin();
 3583       for (auto &r: _regs) {
 3584         r = *it;
 3585         ++it;
 3586       }
 3587     }
 3588 
 3589     void gen_loads(Register base) {
 3590       for (int i = 0; i < 8; i += 2) {
 3591         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3592       }
 3593     }
 3594 
 3595     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3596     void extract_u32(Register dest, int i) {
 3597       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3598     }
 3599   };
 3600 
 3601   // Utility routines for md5.
 3602   // Clobbers r10 and r11.
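        // Each helper computes one MD5 step:
        //   r1 = r2 + rotl32(r1 + f(r2, r3, r4) + X[k] + t, s)
        // where f is the round-specific boolean function and X[k] is the k-th
        // cached message word. md5_FF uses F(x, y, z) = (x & y) | (~x & z),
        // computed here as ((y ^ z) & x) ^ z.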
 3603   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3604               int k, int s, int t) {
 3605     Register rscratch3 = r10;
 3606     Register rscratch4 = r11;
 3607 
 3608     __ eorw(rscratch3, r3, r4);
 3609     __ movw(rscratch2, t);
 3610     __ andw(rscratch3, rscratch3, r2);
 3611     __ addw(rscratch4, r1, rscratch2);
 3612     reg_cache.extract_u32(rscratch1, k);
 3613     __ eorw(rscratch3, rscratch3, r4);
 3614     __ addw(rscratch4, rscratch4, rscratch1);
 3615     __ addw(rscratch3, rscratch3, rscratch4);
 3616     __ rorw(rscratch2, rscratch3, 32 - s);
 3617     __ addw(r1, rscratch2, r2);
 3618   }
 3619 
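        // md5_GG uses G(x, y, z) = (x & z) | (y & ~z); the two terms select
        // disjoint bits, so they are combined with adds instead of an orr.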
 3620   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3621               int k, int s, int t) {
 3622     Register rscratch3 = r10;
 3623     Register rscratch4 = r11;
 3624 
 3625     reg_cache.extract_u32(rscratch1, k);
 3626     __ movw(rscratch2, t);
 3627     __ addw(rscratch4, r1, rscratch2);
 3628     __ addw(rscratch4, rscratch4, rscratch1);
 3629     __ bicw(rscratch2, r3, r4);
 3630     __ andw(rscratch3, r2, r4);
 3631     __ addw(rscratch2, rscratch2, rscratch4);
 3632     __ addw(rscratch2, rscratch2, rscratch3);
 3633     __ rorw(rscratch2, rscratch2, 32 - s);
 3634     __ addw(r1, rscratch2, r2);
 3635   }
 3636 
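        // md5_HH uses H(x, y, z) = x ^ y ^ z.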
 3637   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3638               int k, int s, int t) {
 3639     Register rscratch3 = r10;
 3640     Register rscratch4 = r11;
 3641 
 3642     __ eorw(rscratch3, r3, r4);
 3643     __ movw(rscratch2, t);
 3644     __ addw(rscratch4, r1, rscratch2);
 3645     reg_cache.extract_u32(rscratch1, k);
 3646     __ eorw(rscratch3, rscratch3, r2);
 3647     __ addw(rscratch4, rscratch4, rscratch1);
 3648     __ addw(rscratch3, rscratch3, rscratch4);
 3649     __ rorw(rscratch2, rscratch3, 32 - s);
 3650     __ addw(r1, rscratch2, r2);
 3651   }
 3652 
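        // md5_II uses I(x, y, z) = y ^ (x | ~z), computed with orn and eor.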
 3653   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3654               int k, int s, int t) {
 3655     Register rscratch3 = r10;
 3656     Register rscratch4 = r11;
 3657 
 3658     __ movw(rscratch3, t);
 3659     __ ornw(rscratch2, r2, r4);
 3660     __ addw(rscratch4, r1, rscratch3);
 3661     reg_cache.extract_u32(rscratch1, k);
 3662     __ eorw(rscratch3, rscratch2, r3);
 3663     __ addw(rscratch4, rscratch4, rscratch1);
 3664     __ addw(rscratch3, rscratch3, rscratch4);
 3665     __ rorw(rscratch2, rscratch3, 32 - s);
 3666     __ addw(r1, rscratch2, r2);
 3667   }
 3668 
 3669   // Arguments:
 3670   //
 3671   // Inputs:
 3672   //   c_rarg0   - byte[]  source+offset
 3673   //   c_rarg1   - int[]   SHA.state
 3674   //   c_rarg2   - int     offset
 3675   //   c_rarg3   - int     limit
 3676   //
 3677   address generate_md5_implCompress(StubId stub_id) {
 3678     bool multi_block;
 3679     switch (stub_id) {
 3680     case StubId::stubgen_md5_implCompress_id:
 3681       multi_block = false;
 3682       break;
 3683     case StubId::stubgen_md5_implCompressMB_id:
 3684       multi_block = true;
 3685       break;
 3686     default:
 3687       ShouldNotReachHere();
 3688     }
 3689     __ align(CodeEntryAlignment);
 3690 
 3691     StubCodeMark mark(this, stub_id);
 3692     address start = __ pc();
 3693 
 3694     Register buf       = c_rarg0;
 3695     Register state     = c_rarg1;
 3696     Register ofs       = c_rarg2;
 3697     Register limit     = c_rarg3;
 3698     Register a         = r4;
 3699     Register b         = r5;
 3700     Register c         = r6;
 3701     Register d         = r7;
 3702     Register rscratch3 = r10;
 3703     Register rscratch4 = r11;
 3704 
 3705     Register state_regs[2] = { r12, r13 };
 3706     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3707     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3708 
 3709     __ push(saved_regs, sp);
 3710 
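          // The four 32-bit state words are kept packed in two 64-bit registers:
          // state_regs[0] = a | (b << 32) and state_regs[1] = c | (d << 32).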
 3711     __ ldp(state_regs[0], state_regs[1], Address(state));
 3712     __ ubfx(a, state_regs[0],  0, 32);
 3713     __ ubfx(b, state_regs[0], 32, 32);
 3714     __ ubfx(c, state_regs[1],  0, 32);
 3715     __ ubfx(d, state_regs[1], 32, 32);
 3716 
 3717     Label md5_loop;
 3718     __ BIND(md5_loop);
 3719 
 3720     reg_cache.gen_loads(buf);
 3721 
 3722     // Round 1
 3723     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3724     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3725     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3726     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3727     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3728     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3729     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3730     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3731     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3732     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3733     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3734     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3735     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3736     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3737     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3738     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3739 
 3740     // Round 2
 3741     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3742     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3743     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3744     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3745     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3746     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3747     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3748     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3749     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3750     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3751     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3752     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3753     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3754     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3755     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3756     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3757 
 3758     // Round 3
 3759     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3760     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3761     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3762     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3763     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3764     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3765     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3766     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3767     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3768     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3769     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3770     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3771     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3772     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3773     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3774     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3775 
 3776     // Round 4
 3777     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3778     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3779     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3780     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3781     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3782     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3783     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3784     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3785     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3786     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3787     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3788     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3789     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3790     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3791     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3792     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3793 
 3794     __ addw(a, state_regs[0], a);
 3795     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3796     __ addw(b, rscratch2, b);
 3797     __ addw(c, state_regs[1], c);
 3798     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3799     __ addw(d, rscratch4, d);
 3800 
 3801     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3802     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3803 
 3804     if (multi_block) {
 3805       __ add(buf, buf, 64);
 3806       __ add(ofs, ofs, 64);
 3807       __ cmp(ofs, limit);
 3808       __ br(Assembler::LE, md5_loop);
 3809       __ mov(c_rarg0, ofs); // return ofs
 3810     }
 3811 
 3812     // write hash values back in the correct order
 3813     __ stp(state_regs[0], state_regs[1], Address(state));
 3814 
 3815     __ pop(saved_regs, sp);
 3816 
 3817     __ ret(lr);
 3818 
 3819     return start;
 3820   }
 3821 
 3822   // Arguments:
 3823   //
 3824   // Inputs:
 3825   //   c_rarg0   - byte[]  source+offset
 3826   //   c_rarg1   - int[]   SHA.state
 3827   //   c_rarg2   - int     offset
 3828   //   c_rarg3   - int     limit
 3829   //
 3830   address generate_sha1_implCompress(StubId stub_id) {
 3831     bool multi_block;
 3832     switch (stub_id) {
 3833     case StubId::stubgen_sha1_implCompress_id:
 3834       multi_block = false;
 3835       break;
 3836     case StubId::stubgen_sha1_implCompressMB_id:
 3837       multi_block = true;
 3838       break;
 3839     default:
 3840       ShouldNotReachHere();
 3841     }
 3842 
 3843     __ align(CodeEntryAlignment);
 3844 
 3845     StubCodeMark mark(this, stub_id);
 3846     address start = __ pc();
 3847 
 3848     Register buf   = c_rarg0;
 3849     Register state = c_rarg1;
 3850     Register ofs   = c_rarg2;
 3851     Register limit = c_rarg3;
 3852 
 3853     Label keys;
 3854     Label sha1_loop;
 3855 
 3856     // load the keys into v0..v3
 3857     __ adr(rscratch1, keys);
 3858     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3859     // load the 5-word SHA-1 state into v6, v7
 3860     __ ldrq(v6, Address(state, 0));
 3861     __ ldrs(v7, Address(state, 16));
 3862 
 3863 
 3864     __ BIND(sha1_loop);
 3865     // load 64 bytes of data into v16..v19
 3866     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3867     __ rev32(v16, __ T16B, v16);
 3868     __ rev32(v17, __ T16B, v17);
 3869     __ rev32(v18, __ T16B, v18);
 3870     __ rev32(v19, __ T16B, v19);
 3871 
 3872     // do the sha1
 3873     __ addv(v4, __ T4S, v16, v0);
 3874     __ orr(v20, __ T16B, v6, v6);
 3875 
 3876     FloatRegister d0 = v16;
 3877     FloatRegister d1 = v17;
 3878     FloatRegister d2 = v18;
 3879     FloatRegister d3 = v19;
 3880 
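          // Each iteration below covers four of the 80 SHA-1 rounds: sha1c,
          // sha1p and sha1m apply the Ch, Parity and Maj functions for rounds
          // 0-19, 20-39/60-79 and 40-59 respectively, while sha1su0/sha1su1
          // extend the message schedule during the first 16 iterations.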
 3881     for (int round = 0; round < 20; round++) {
 3882       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3883       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3884       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3885       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3886       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3887 
 3888       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3889       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3890       __ sha1h(tmp2, __ T4S, v20);
 3891       if (round < 5)
 3892         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3893       else if (round < 10 || round >= 15)
 3894         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3895       else
 3896         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3897       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3898 
 3899       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3900     }
 3901 
 3902     __ addv(v7, __ T2S, v7, v21);
 3903     __ addv(v6, __ T4S, v6, v20);
 3904 
 3905     if (multi_block) {
 3906       __ add(ofs, ofs, 64);
 3907       __ cmp(ofs, limit);
 3908       __ br(Assembler::LE, sha1_loop);
 3909       __ mov(c_rarg0, ofs); // return ofs
 3910     }
 3911 
 3912     __ strq(v6, Address(state, 0));
 3913     __ strs(v7, Address(state, 16));
 3914 
 3915     __ ret(lr);
 3916 
 3917     __ bind(keys);
 3918     __ emit_int32(0x5a827999);
 3919     __ emit_int32(0x6ed9eba1);
 3920     __ emit_int32(0x8f1bbcdc);
 3921     __ emit_int32(0xca62c1d6);
 3922 
 3923     return start;
 3924   }
 3925 
 3926 
 3927   // Arguments:
 3928   //
 3929   // Inputs:
 3930   //   c_rarg0   - byte[]  source+offset
 3931   //   c_rarg1   - int[]   SHA.state
 3932   //   c_rarg2   - int     offset
 3933   //   c_rarg3   - int     limit
 3934   //
 3935   address generate_sha256_implCompress(StubId stub_id) {
 3936     bool multi_block;
 3937     switch (stub_id) {
 3938     case StubId::stubgen_sha256_implCompress_id:
 3939       multi_block = false;
 3940       break;
 3941     case StubId::stubgen_sha256_implCompressMB_id:
 3942       multi_block = true;
 3943       break;
 3944     default:
 3945       ShouldNotReachHere();
 3946     }
 3947 
 3948     static const uint32_t round_consts[64] = {
 3949       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3950       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3951       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3952       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3953       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3954       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3955       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3956       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3957       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3958       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3959       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3960       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3961       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3962       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3963       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3964       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3965     };
 3966 
 3967     __ align(CodeEntryAlignment);
 3968 
 3969     StubCodeMark mark(this, stub_id);
 3970     address start = __ pc();
 3971 
 3972     Register buf   = c_rarg0;
 3973     Register state = c_rarg1;
 3974     Register ofs   = c_rarg2;
 3975     Register limit = c_rarg3;
 3976 
 3977     Label sha256_loop;
 3978 
 3979     __ stpd(v8, v9, __ pre(sp, -32));
 3980     __ stpd(v10, v11, Address(sp, 16));
 3981 
 3982     // dga == v0
 3983     // dgb == v1
 3984     // dg0 == v2
 3985     // dg1 == v3
 3986     // dg2 == v4
 3987     // t0 == v6
 3988     // t1 == v7
 3989 
 3990     // load the 64 round constants into v16..v31
 3991     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3992     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3993     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3994     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3995     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3996 
 3997     // load 8 words (256 bits) state
 3998     __ ldpq(v0, v1, state);
 3999 
 4000     __ BIND(sha256_loop);
 4001     // load 64 bytes of data into v8..v11
 4002     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 4003     __ rev32(v8, __ T16B, v8);
 4004     __ rev32(v9, __ T16B, v9);
 4005     __ rev32(v10, __ T16B, v10);
 4006     __ rev32(v11, __ T16B, v11);
 4007 
 4008     __ addv(v6, __ T4S, v8, v16);
 4009     __ orr(v2, __ T16B, v0, v0);
 4010     __ orr(v3, __ T16B, v1, v1);
 4011 
 4012     FloatRegister d0 = v8;
 4013     FloatRegister d1 = v9;
 4014     FloatRegister d2 = v10;
 4015     FloatRegister d3 = v11;
 4016 
 4017 
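          // Each iteration below covers four of the 64 SHA-256 rounds: sha256h
          // and sha256h2 update the two halves of the working state using the
          // round constants preloaded in v16..v31, while sha256su0/sha256su1
          // extend the message schedule during the first 12 iterations.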
 4018     for (int round = 0; round < 16; round++) {
 4019       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 4020       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 4021       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 4022       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 4023 
 4024       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 4025        __ orr(v4, __ T16B, v2, v2);
 4026       if (round < 15)
 4027         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 4028       __ sha256h(v2, __ T4S, v3, tmp2);
 4029       __ sha256h2(v3, __ T4S, v4, tmp2);
 4030       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 4031 
 4032       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4033     }
 4034 
 4035     __ addv(v0, __ T4S, v0, v2);
 4036     __ addv(v1, __ T4S, v1, v3);
 4037 
 4038     if (multi_block) {
 4039       __ add(ofs, ofs, 64);
 4040       __ cmp(ofs, limit);
 4041       __ br(Assembler::LE, sha256_loop);
 4042       __ mov(c_rarg0, ofs); // return ofs
 4043     }
 4044 
 4045     __ ldpd(v10, v11, Address(sp, 16));
 4046     __ ldpd(v8, v9, __ post(sp, 32));
 4047 
 4048     __ stpq(v0, v1, state);
 4049 
 4050     __ ret(lr);
 4051 
 4052     return start;
 4053   }
 4054 
 4055   // Double rounds for sha512.
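        // Each call processes two of the 80 SHA-512 rounds with sha512h/sha512h2,
        // rotating the working state through vi0..vi4. vrc0 holds the pre-loaded
        // round-constant pair for this call and vrc1 receives the pair for a
        // subsequent call (no longer needed once dr >= 36); for dr < 32,
        // sha512su0/sha512su1 extend the message schedule in vin0.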
 4056   void sha512_dround(int dr,
 4057                      FloatRegister vi0, FloatRegister vi1,
 4058                      FloatRegister vi2, FloatRegister vi3,
 4059                      FloatRegister vi4, FloatRegister vrc0,
 4060                      FloatRegister vrc1, FloatRegister vin0,
 4061                      FloatRegister vin1, FloatRegister vin2,
 4062                      FloatRegister vin3, FloatRegister vin4) {
 4063       if (dr < 36) {
 4064         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4065       }
 4066       __ addv(v5, __ T2D, vrc0, vin0);
 4067       __ ext(v6, __ T16B, vi2, vi3, 8);
 4068       __ ext(v5, __ T16B, v5, v5, 8);
 4069       __ ext(v7, __ T16B, vi1, vi2, 8);
 4070       __ addv(vi3, __ T2D, vi3, v5);
 4071       if (dr < 32) {
 4072         __ ext(v5, __ T16B, vin3, vin4, 8);
 4073         __ sha512su0(vin0, __ T2D, vin1);
 4074       }
 4075       __ sha512h(vi3, __ T2D, v6, v7);
 4076       if (dr < 32) {
 4077         __ sha512su1(vin0, __ T2D, vin2, v5);
 4078       }
 4079       __ addv(vi4, __ T2D, vi1, vi3);
 4080       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4081   }
 4082 
 4083   // Arguments:
 4084   //
 4085   // Inputs:
 4086   //   c_rarg0   - byte[]  source+offset
 4087   //   c_rarg1   - int[]   SHA.state
 4088   //   c_rarg2   - int     offset
 4089   //   c_rarg3   - int     limit
 4090   //
 4091   address generate_sha512_implCompress(StubId stub_id) {
 4092     bool multi_block;
 4093     switch (stub_id) {
 4094     case StubId::stubgen_sha512_implCompress_id:
 4095       multi_block = false;
 4096       break;
 4097     case StubId::stubgen_sha512_implCompressMB_id:
 4098       multi_block = true;
 4099       break;
 4100     default:
 4101       ShouldNotReachHere();
 4102     }
 4103 
 4104     static const uint64_t round_consts[80] = {
 4105       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 4106       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 4107       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 4108       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 4109       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 4110       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 4111       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 4112       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 4113       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 4114       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 4115       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 4116       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 4117       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 4118       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 4119       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 4120       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 4121       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 4122       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 4123       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 4124       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 4125       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 4126       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 4127       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 4128       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 4129       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 4130       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 4131       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 4132     };
 4133 
 4134     __ align(CodeEntryAlignment);
 4135 
 4136     StubCodeMark mark(this, stub_id);
 4137     address start = __ pc();
 4138 
 4139     Register buf   = c_rarg0;
 4140     Register state = c_rarg1;
 4141     Register ofs   = c_rarg2;
 4142     Register limit = c_rarg3;
 4143 
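    // Save callee-saved SIMD registers: AAPCS64 requires the low 64 bits of
    // v8..v15 to be preserved, matching the ldpd restores before the return.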
 4144     __ stpd(v8, v9, __ pre(sp, -64));
 4145     __ stpd(v10, v11, Address(sp, 16));
 4146     __ stpd(v12, v13, Address(sp, 32));
 4147     __ stpd(v14, v15, Address(sp, 48));
 4148 
 4149     Label sha512_loop;
 4150 
 4151     // load state
 4152     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4153 
 4154     // load first 4 round constants
 4155     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4156     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4157 
 4158     __ BIND(sha512_loop);
 4159     // load 128B of data into v12..v19
 4160     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4161     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4162     __ rev64(v12, __ T16B, v12);
 4163     __ rev64(v13, __ T16B, v13);
 4164     __ rev64(v14, __ T16B, v14);
 4165     __ rev64(v15, __ T16B, v15);
 4166     __ rev64(v16, __ T16B, v16);
 4167     __ rev64(v17, __ T16B, v17);
 4168     __ rev64(v18, __ T16B, v18);
 4169     __ rev64(v19, __ T16B, v19);
 4170 
 4171     __ mov(rscratch2, rscratch1);
 4172 
 4173     __ mov(v0, __ T16B, v8);
 4174     __ mov(v1, __ T16B, v9);
 4175     __ mov(v2, __ T16B, v10);
 4176     __ mov(v3, __ T16B, v11);
 4177 
 4178     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4179     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4180     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4181     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4182     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4183     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4184     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4185     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4186     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4187     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4188     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4189     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4190     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4191     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4192     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4193     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4194     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4195     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4196     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4197     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4198     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4199     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4200     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4201     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4202     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4203     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4204     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4205     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4206     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4207     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4208     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4209     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4210     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4211     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4212     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4213     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4214     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4215     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4216     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4217     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4218 
 4219     __ addv(v8, __ T2D, v8, v0);
 4220     __ addv(v9, __ T2D, v9, v1);
 4221     __ addv(v10, __ T2D, v10, v2);
 4222     __ addv(v11, __ T2D, v11, v3);
 4223 
 4224     if (multi_block) {
 4225       __ add(ofs, ofs, 128);
 4226       __ cmp(ofs, limit);
 4227       __ br(Assembler::LE, sha512_loop);
 4228       __ mov(c_rarg0, ofs); // return ofs
 4229     }
 4230 
 4231     __ st1(v8, v9, v10, v11, __ T2D, state);
 4232 
 4233     __ ldpd(v14, v15, Address(sp, 48));
 4234     __ ldpd(v12, v13, Address(sp, 32));
 4235     __ ldpd(v10, v11, Address(sp, 16));
 4236     __ ldpd(v8, v9, __ post(sp, 64));
 4237 
 4238     __ ret(lr);
 4239 
 4240     return start;
 4241   }
 4242 
 4243   // Execute one round of keccak of two computations in parallel.
 4244   // One of the states should be loaded into the lower halves of
 4245   // the vector registers v0-v24, the other should be loaded into
 4246   // the upper halves of those registers. The ld1r instruction loads
 4247   // the round constant into both halves of register v31.
  4248   // Intermediate results c0...c4 and d0...d4 are computed
 4249   // in registers v25...v30.
 4250   // All vector instructions that are used operate on both register
 4251   // halves in parallel.
  4252   // If only a single computation is needed, one can load only the lower halves.
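  // For reference, one round of Keccak-f[1600] on the 5x5 lane array a[x][y]
  // (illustrative scalar sketch; the register comments below use the flat
  // index a0..a24 with a_i = a[x][y], i = x + 5*y):
  //   theta: c[x] = a[x][0] ^ a[x][1] ^ a[x][2] ^ a[x][3] ^ a[x][4]
  //          d[x] = c[x-1] ^ rol(c[x+1], 1);  a[x][y] ^= d[x]
  //   rho/pi: lanes are rotated by fixed per-lane amounts and permuted
  //   chi:   a[x][y] ^= ~a[x+1][y] & a[x+2][y]
  //   iota:  a[0][0] ^= round_constants[i]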
 4253   void keccak_round(Register rscratch1) {
 4254   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  4255   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  4256   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4257   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4258   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4259   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4260   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4261   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4262   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4263   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4264 
 4265   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4266   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4267   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4268   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4269   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4270 
 4271   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4272   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  4273   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4274   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4275   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4276   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4277   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4278   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4279   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4280   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4281   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4282   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4283   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4284   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4285   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4286   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4287   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4288   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4289   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4290   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4291   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4292   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4293   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4294   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4295   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4296 
  4297   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21' & a22)
 4298   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4299   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4300   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4301   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4302 
 4303   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4304 
 4305   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4306   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4307   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4308   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4309   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4310 
 4311   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4312   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4313   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4314   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4315   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4316 
 4317   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4318   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4319   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4320   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4321   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4322 
 4323   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4324   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4325   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4326   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4327   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4328 
 4329   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4330   }
 4331 
 4332   // Arguments:
 4333   //
 4334   // Inputs:
 4335   //   c_rarg0   - byte[]  source+offset
 4336   //   c_rarg1   - byte[]  SHA.state
 4337   //   c_rarg2   - int     block_size
 4338   //   c_rarg3   - int     offset
 4339   //   c_rarg4   - int     limit
 4340   //
 4341   address generate_sha3_implCompress(StubId stub_id) {
 4342     bool multi_block;
 4343     switch (stub_id) {
 4344     case StubId::stubgen_sha3_implCompress_id:
 4345       multi_block = false;
 4346       break;
 4347     case StubId::stubgen_sha3_implCompressMB_id:
 4348       multi_block = true;
 4349       break;
 4350     default:
 4351       ShouldNotReachHere();
 4352     }
 4353 
 4354     static const uint64_t round_consts[24] = {
 4355       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4356       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4357       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4358       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4359       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4360       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4361       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4362       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4363     };
 4364 
 4365     __ align(CodeEntryAlignment);
 4366 
 4367     StubCodeMark mark(this, stub_id);
 4368     address start = __ pc();
 4369 
 4370     Register buf           = c_rarg0;
 4371     Register state         = c_rarg1;
 4372     Register block_size    = c_rarg2;
 4373     Register ofs           = c_rarg3;
 4374     Register limit         = c_rarg4;
 4375 
 4376     Label sha3_loop, rounds24_loop;
 4377     Label sha3_512_or_sha3_384, shake128;
 4378 
 4379     __ stpd(v8, v9, __ pre(sp, -64));
 4380     __ stpd(v10, v11, Address(sp, 16));
 4381     __ stpd(v12, v13, Address(sp, 32));
 4382     __ stpd(v14, v15, Address(sp, 48));
 4383 
 4384     // load state
 4385     __ add(rscratch1, state, 32);
 4386     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4387     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4388     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4389     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4390     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4391     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4392     __ ld1(v24, __ T1D, rscratch1);
 4393 
 4394     __ BIND(sha3_loop);
 4395 
 4396     // 24 keccak rounds
 4397     __ movw(rscratch2, 24);
 4398 
 4399     // load round_constants base
 4400     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4401 
 4402     // load input
 4403     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4404     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4405     __ eor(v0, __ T8B, v0, v25);
 4406     __ eor(v1, __ T8B, v1, v26);
 4407     __ eor(v2, __ T8B, v2, v27);
 4408     __ eor(v3, __ T8B, v3, v28);
 4409     __ eor(v4, __ T8B, v4, v29);
 4410     __ eor(v5, __ T8B, v5, v30);
 4411     __ eor(v6, __ T8B, v6, v31);
 4412 
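    // Dispatch on the rate (block_size); the values and the bits tested below:
    //   72  = 0b0100_1000  SHA3-512            (bit7 == 0, bit5 == 0)
    //   104 = 0b0110_1000  SHA3-384            (bit7 == 0, bit5 == 1)
    //   136 = 0b1000_1000  SHA3-256/SHAKE256   (bit7 == 1, bits 5:4 == 0)
    //   144 = 0b1001_0000  SHA3-224            (bit7 == 1, bit5 == 0, bit4 == 1)
    //   168 = 0b1010_1000  SHAKE128            (bit7 == 1, bit5 == 1)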
 4413     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4414     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4415 
 4416     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4417     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4418     __ eor(v7, __ T8B, v7, v25);
 4419     __ eor(v8, __ T8B, v8, v26);
 4420     __ eor(v9, __ T8B, v9, v27);
 4421     __ eor(v10, __ T8B, v10, v28);
 4422     __ eor(v11, __ T8B, v11, v29);
 4423     __ eor(v12, __ T8B, v12, v30);
 4424     __ eor(v13, __ T8B, v13, v31);
 4425 
 4426     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4427     __ eor(v14, __ T8B, v14, v25);
 4428     __ eor(v15, __ T8B, v15, v26);
 4429     __ eor(v16, __ T8B, v16, v27);
 4430 
 4431     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4432     __ andw(c_rarg5, block_size, 48);
 4433     __ cbzw(c_rarg5, rounds24_loop);
 4434 
 4435     __ tbnz(block_size, 5, shake128);
 4436     // block_size == 144, bit5 == 0, SHA3-224
 4437     __ ldrd(v28, __ post(buf, 8));
 4438     __ eor(v17, __ T8B, v17, v28);
 4439     __ b(rounds24_loop);
 4440 
 4441     __ BIND(shake128);
 4442     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4443     __ eor(v17, __ T8B, v17, v28);
 4444     __ eor(v18, __ T8B, v18, v29);
 4445     __ eor(v19, __ T8B, v19, v30);
 4446     __ eor(v20, __ T8B, v20, v31);
 4447     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4448 
 4449     __ BIND(sha3_512_or_sha3_384);
 4450     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4451     __ eor(v7, __ T8B, v7, v25);
 4452     __ eor(v8, __ T8B, v8, v26);
 4453     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4454 
 4455     // SHA3-384
 4456     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4457     __ eor(v9,  __ T8B, v9,  v27);
 4458     __ eor(v10, __ T8B, v10, v28);
 4459     __ eor(v11, __ T8B, v11, v29);
 4460     __ eor(v12, __ T8B, v12, v30);
 4461 
 4462     __ BIND(rounds24_loop);
 4463     __ subw(rscratch2, rscratch2, 1);
 4464 
 4465     keccak_round(rscratch1);
 4466 
 4467     __ cbnzw(rscratch2, rounds24_loop);
 4468 
 4469     if (multi_block) {
 4470       __ add(ofs, ofs, block_size);
 4471       __ cmp(ofs, limit);
 4472       __ br(Assembler::LE, sha3_loop);
 4473       __ mov(c_rarg0, ofs); // return ofs
 4474     }
 4475 
 4476     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4477     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4478     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4479     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4480     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4481     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4482     __ st1(v24, __ T1D, state);
 4483 
 4484     // restore callee-saved registers
 4485     __ ldpd(v14, v15, Address(sp, 48));
 4486     __ ldpd(v12, v13, Address(sp, 32));
 4487     __ ldpd(v10, v11, Address(sp, 16));
 4488     __ ldpd(v8, v9, __ post(sp, 64));
 4489 
 4490     __ ret(lr);
 4491 
 4492     return start;
 4493   }
 4494 
 4495   // Inputs:
 4496   //   c_rarg0   - long[]  state0
 4497   //   c_rarg1   - long[]  state1
 4498   address generate_double_keccak() {
 4499     static const uint64_t round_consts[24] = {
 4500       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4501       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4502       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4503       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4504       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4505       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4506       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4507       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4508     };
 4509 
 4510     // Implements the double_keccak() method of the
  4511     // sun.security.provider.SHA3Parallel class
 4512     __ align(CodeEntryAlignment);
 4513     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4514     address start = __ pc();
 4515     __ enter();
 4516 
 4517     Register state0        = c_rarg0;
 4518     Register state1        = c_rarg1;
 4519 
 4520     Label rounds24_loop;
 4521 
 4522     // save callee-saved registers
 4523     __ stpd(v8, v9, __ pre(sp, -64));
 4524     __ stpd(v10, v11, Address(sp, 16));
 4525     __ stpd(v12, v13, Address(sp, 32));
 4526     __ stpd(v14, v15, Address(sp, 48));
 4527 
 4528     // load states
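    // The D-lane loads with index 0 fill the lower 64-bit halves of v0..v24
    // from state0; the index-1 loads below fill the upper halves from state1,
    // so keccak_round() advances both states in parallel.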
 4529     __ add(rscratch1, state0, 32);
 4530     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4531     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4532     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4533     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4534     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4535     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4536     __ ld1(v24, __ D, 0, rscratch1);
 4537     __ add(rscratch1, state1, 32);
 4538     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4539     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4540     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4541     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4542     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4543     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4544     __ ld1(v24, __ D, 1, rscratch1);
 4545 
 4546     // 24 keccak rounds
 4547     __ movw(rscratch2, 24);
 4548 
 4549     // load round_constants base
 4550     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4551 
 4552     __ BIND(rounds24_loop);
 4553     __ subw(rscratch2, rscratch2, 1);
 4554     keccak_round(rscratch1);
 4555     __ cbnzw(rscratch2, rounds24_loop);
 4556 
 4557     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4558     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4559     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4560     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4561     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4562     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4563     __ st1(v24, __ D, 0, state0);
 4564     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4565     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4566     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4567     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4568     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4569     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4570     __ st1(v24, __ D, 1, state1);
 4571 
 4572     // restore callee-saved vector registers
 4573     __ ldpd(v14, v15, Address(sp, 48));
 4574     __ ldpd(v12, v13, Address(sp, 32));
 4575     __ ldpd(v10, v11, Address(sp, 16));
 4576     __ ldpd(v8, v9, __ post(sp, 64));
 4577 
 4578     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4579     __ mov(r0, zr); // return 0
 4580     __ ret(lr);
 4581 
 4582     return start;
 4583   }
 4584 
 4585   // ChaCha20 block function.  This version parallelizes the 32-bit
 4586   // state elements on each of 16 vectors, producing 4 blocks of
 4587   // keystream at a time.
 4588   //
 4589   // state (int[16]) = c_rarg0
 4590   // keystream (byte[256]) = c_rarg1
 4591   // return - number of bytes of produced keystream (always 256)
 4592   //
 4593   // This implementation takes each 32-bit integer from the state
 4594   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4595   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4596   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4597   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4598   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4599   // operations represented by one QUARTERROUND function, we instead stack all
 4600   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4601   // and then do the same for the second set of 4 quarter rounds.  This removes
 4602   // some latency that would otherwise be incurred by waiting for an add to
 4603   // complete before performing an xor (which depends on the result of the
 4604   // add), etc. An adjustment happens between the first and second groups of 4
 4605   // quarter rounds, but this is done only in the inputs to the macro functions
 4606   // that generate the assembly instructions - these adjustments themselves are
 4607   // not part of the resulting assembly.
 4608   // The 4 registers v0-v3 are used during the quarter round operations as
 4609   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4610   // registers become the vectors involved in adding the start state back onto
 4611   // the post-QR working state.  After the adds are complete, each of the 16
 4612   // vectors write their first lane back to the keystream buffer, followed
 4613   // by the second lane from all vectors and so on.
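  //
  // For reference, one scalar quarter round from RFC 7539 on 32-bit words
  // a, b, c, d is:
  //   a += b; d ^= a; d <<<= 16;
  //   c += d; b ^= c; b <<<= 12;
  //   a += b; d ^= a; d <<<= 8;
  //   c += d; b ^= c; b <<<= 7;
  // The cc20_qr_* bundles below apply each of these steps to 4 quarter
  // rounds at once, one quarter round per register in the a/b/c/d sets.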
 4614   address generate_chacha20Block_blockpar() {
 4615     Label L_twoRounds, L_cc20_const;
 4616     __ align(CodeEntryAlignment);
 4617     StubId stub_id = StubId::stubgen_chacha20Block_id;
 4618     StubCodeMark mark(this, stub_id);
 4619     address start = __ pc();
 4620     __ enter();
 4621 
 4622     int i, j;
 4623     const Register state = c_rarg0;
 4624     const Register keystream = c_rarg1;
 4625     const Register loopCtr = r10;
 4626     const Register tmpAddr = r11;
 4627     const FloatRegister ctrAddOverlay = v28;
 4628     const FloatRegister lrot8Tbl = v29;
 4629 
 4630     // Organize SIMD registers in an array that facilitates
 4631     // putting repetitive opcodes into loop structures.  It is
 4632     // important that each grouping of 4 registers is monotonically
 4633     // increasing to support the requirements of multi-register
 4634     // instructions (e.g. ld4r, st4, etc.)
 4635     const FloatRegister workSt[16] = {
 4636          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4637         v20, v21, v22, v23, v24, v25, v26, v27
 4638     };
 4639 
 4640     // Pull in constant data.  The first 16 bytes are the add overlay
 4641     // which is applied to the vector holding the counter (state[12]).
 4642     // The second 16 bytes is the index register for the 8-bit left
 4643     // rotation tbl instruction.
 4644     __ adr(tmpAddr, L_cc20_const);
 4645     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4646 
 4647     // Load from memory and interlace across 16 SIMD registers,
  4648     // with each word from memory being broadcast to all lanes of
 4649     // each successive SIMD register.
 4650     //      Addr(0) -> All lanes in workSt[i]
 4651     //      Addr(4) -> All lanes workSt[i + 1], etc.
 4652     __ mov(tmpAddr, state);
 4653     for (i = 0; i < 16; i += 4) {
 4654       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4655           __ post(tmpAddr, 16));
 4656     }
 4657     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4658 
 4659     // Before entering the loop, create 5 4-register arrays.  These
 4660     // will hold the 4 registers that represent the a/b/c/d fields
 4661     // in the quarter round operation.  For instance the "b" field
 4662     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4663     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4664     // since it is part of a diagonal organization.  The aSet and scratch
 4665     // register sets are defined at declaration time because they do not change
 4666     // organization at any point during the 20-round processing.
 4667     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4668     FloatRegister bSet[4];
 4669     FloatRegister cSet[4];
 4670     FloatRegister dSet[4];
 4671     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4672 
 4673     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4674     __ mov(loopCtr, 10);
 4675     __ BIND(L_twoRounds);
 4676 
 4677     // Set to columnar organization and do the following 4 quarter-rounds:
 4678     // QUARTERROUND(0, 4, 8, 12)
 4679     // QUARTERROUND(1, 5, 9, 13)
 4680     // QUARTERROUND(2, 6, 10, 14)
 4681     // QUARTERROUND(3, 7, 11, 15)
 4682     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4683     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4684     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4685 
 4686     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4687     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4688     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4689 
 4690     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4691     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4692     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4693 
 4694     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4695     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4696     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4697 
 4698     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4699     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
  4700     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4701 
 4702     // Set to diagonal organization and do the next 4 quarter-rounds:
 4703     // QUARTERROUND(0, 5, 10, 15)
 4704     // QUARTERROUND(1, 6, 11, 12)
 4705     // QUARTERROUND(2, 7, 8, 13)
 4706     // QUARTERROUND(3, 4, 9, 14)
 4707     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4708     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4709     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4710 
 4711     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4712     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4713     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4714 
 4715     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4716     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4717     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4718 
 4719     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4720     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4721     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4722 
 4723     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4724     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
  4725     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4726 
 4727     // Decrement and iterate
 4728     __ sub(loopCtr, loopCtr, 1);
 4729     __ cbnz(loopCtr, L_twoRounds);
 4730 
 4731     __ mov(tmpAddr, state);
 4732 
 4733     // Add the starting state back to the post-loop keystream
 4734     // state.  We read/interlace the state array from memory into
 4735     // 4 registers similar to what we did in the beginning.  Then
 4736     // add the counter overlay onto workSt[12] at the end.
 4737     for (i = 0; i < 16; i += 4) {
 4738       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4739       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4740       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4741       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4742       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4743     }
 4744     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4745 
 4746     // Write working state into the keystream buffer.  This is accomplished
 4747     // by taking the lane "i" from each of the four vectors and writing
 4748     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4749     // repeating with the next 4 vectors until all 16 vectors have been used.
 4750     // Then move to the next lane and repeat the process until all lanes have
 4751     // been written.
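    // Lane i of the 16 working vectors forms keystream block i, so bytes
    // [64*i, 64*i + 64) of the output hold the block generated with
    // counter + i.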
 4752     for (i = 0; i < 4; i++) {
 4753       for (j = 0; j < 16; j += 4) {
 4754         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4755             __ post(keystream, 16));
 4756       }
 4757     }
 4758 
 4759     __ mov(r0, 256);             // Return length of output keystream
 4760     __ leave();
 4761     __ ret(lr);
 4762 
 4763     // bind label and generate local constant data used by this stub
 4764     // The constant data is broken into two 128-bit segments to be loaded
 4765     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4766     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 4767     // The second 128-bits is a table constant used for 8-bit left rotations.
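    // The rotation table is used as the index operand of a tbl instruction:
    // for each 32-bit lane with little-endian bytes [b0 b1 b2 b3] it selects
    // [b3 b0 b1 b2], i.e. a left rotation of the lane by 8 bits.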
 4768     __ BIND(L_cc20_const);
 4769     __ emit_int64(0x0000000100000000UL);
 4770     __ emit_int64(0x0000000300000002UL);
 4771     __ emit_int64(0x0605040702010003UL);
 4772     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4773 
 4774     return start;
 4775   }
 4776 
 4777   // Helpers to schedule parallel operation bundles across vector
 4778   // register sequences of size 2, 4 or 8.
 4779 
 4780   // Implement various primitive computations across vector sequences
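  //
  // For example, with VSeq<4> va(0), vb(4), vc(8) (i.e. v0..v3, v4..v7 and
  // v8..v11) a call such as
  //   vs_addv(va, __ T8H, vb, vc);
  // expands to four independent instructions
  //   addv(v0, T8H, v4, v8) ... addv(v3, T8H, v7, v11)
  // giving the hardware up to 4-way instruction-level parallelism.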
 4781 
 4782   template<int N>
 4783   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4784                const VSeq<N>& v1, const VSeq<N>& v2) {
 4785     // output must not be constant
 4786     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4787     // output cannot overwrite pending inputs
 4788     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4789     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4790     for (int i = 0; i < N; i++) {
 4791       __ addv(v[i], T, v1[i], v2[i]);
 4792     }
 4793   }
 4794 
 4795   template<int N>
 4796   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4797                const VSeq<N>& v1, const VSeq<N>& v2) {
 4798     // output must not be constant
 4799     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4800     // output cannot overwrite pending inputs
 4801     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4802     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4803     for (int i = 0; i < N; i++) {
 4804       __ subv(v[i], T, v1[i], v2[i]);
 4805     }
 4806   }
 4807 
 4808   template<int N>
 4809   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4810                const VSeq<N>& v1, const VSeq<N>& v2) {
 4811     // output must not be constant
 4812     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4813     // output cannot overwrite pending inputs
 4814     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4815     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4816     for (int i = 0; i < N; i++) {
 4817       __ mulv(v[i], T, v1[i], v2[i]);
 4818     }
 4819   }
 4820 
 4821   template<int N>
 4822   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4823     // output must not be constant
 4824     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4825     // output cannot overwrite pending inputs
 4826     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4827     for (int i = 0; i < N; i++) {
 4828       __ negr(v[i], T, v1[i]);
 4829     }
 4830   }
 4831 
 4832   template<int N>
 4833   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4834                const VSeq<N>& v1, int shift) {
 4835     // output must not be constant
 4836     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4837     // output cannot overwrite pending inputs
 4838     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4839     for (int i = 0; i < N; i++) {
 4840       __ sshr(v[i], T, v1[i], shift);
 4841     }
 4842   }
 4843 
 4844   template<int N>
 4845   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4846     // output must not be constant
 4847     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4848     // output cannot overwrite pending inputs
 4849     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4850     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4851     for (int i = 0; i < N; i++) {
 4852       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4853     }
 4854   }
 4855 
 4856   template<int N>
 4857   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4858     // output must not be constant
 4859     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4860     // output cannot overwrite pending inputs
 4861     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4862     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4863     for (int i = 0; i < N; i++) {
 4864       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4865     }
 4866   }
 4867 
 4868   template<int N>
 4869   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4870     // output must not be constant
 4871     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4872     // output cannot overwrite pending inputs
 4873     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4874     for (int i = 0; i < N; i++) {
 4875       __ notr(v[i], __ T16B, v1[i]);
 4876     }
 4877   }
 4878 
 4879   template<int N>
 4880   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4881     // output must not be constant
 4882     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4883     // output cannot overwrite pending inputs
 4884     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4885     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4886     for (int i = 0; i < N; i++) {
 4887       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4888     }
 4889   }
 4890 
 4891   template<int N>
 4892   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4893     // output must not be constant
 4894     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4895     // output cannot overwrite pending inputs
 4896     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4897     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4898     for (int i = 0; i < N; i++) {
 4899       __ mlsv(v[i], T, v1[i], v2[i]);
 4900     }
 4901   }
 4902 
 4903   // load N/2 successive pairs of quadword values from memory in order
 4904   // into N successive vector registers of the sequence via the
 4905   // address supplied in base.
 4906   template<int N>
 4907   void vs_ldpq(const VSeq<N>& v, Register base) {
 4908     for (int i = 0; i < N; i += 2) {
 4909       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4910     }
 4911   }
 4912 
 4913   // load N/2 successive pairs of quadword values from memory in order
 4914   // into N vector registers of the sequence via the address supplied
 4915   // in base using post-increment addressing
 4916   template<int N>
 4917   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4918     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4919     for (int i = 0; i < N; i += 2) {
 4920       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4921     }
 4922   }
 4923 
 4924   // store N successive vector registers of the sequence into N/2
 4925   // successive pairs of quadword memory locations via the address
 4926   // supplied in base using post-increment addressing
 4927   template<int N>
 4928   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4929     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4930     for (int i = 0; i < N; i += 2) {
 4931       __ stpq(v[i], v[i+1], __ post(base, 32));
 4932     }
 4933   }
 4934 
 4935   // load N/2 pairs of quadword values from memory de-interleaved into
 4936   // N vector registers 2 at a time via the address supplied in base
 4937   // using post-increment addressing.
 4938   template<int N>
 4939   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4940     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4941     for (int i = 0; i < N; i += 2) {
 4942       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4943     }
 4944   }
 4945 
 4946   // store N vector registers interleaved into N/2 pairs of quadword
 4947   // memory locations via the address supplied in base using
 4948   // post-increment addressing.
 4949   template<int N>
 4950   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4951     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4952     for (int i = 0; i < N; i += 2) {
 4953       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4954     }
 4955   }
 4956 
 4957   // load N quadword values from memory de-interleaved into N vector
 4958   // registers 3 elements at a time via the address supplied in base.
 4959   template<int N>
 4960   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4961     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4962     for (int i = 0; i < N; i += 3) {
 4963       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4964     }
 4965   }
 4966 
 4967   // load N quadword values from memory de-interleaved into N vector
 4968   // registers 3 elements at a time via the address supplied in base
 4969   // using post-increment addressing.
 4970   template<int N>
 4971   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4972     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4973     for (int i = 0; i < N; i += 3) {
 4974       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4975     }
 4976   }
 4977 
 4978   // load N/2 pairs of quadword values from memory into N vector
 4979   // registers via the address supplied in base with each pair indexed
  4980   // using the start offset plus the corresponding entry in the
 4981   // offsets array
 4982   template<int N>
 4983   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4984     for (int i = 0; i < N/2; i++) {
 4985       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4986     }
 4987   }
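  // For example, the Kyber NTT below calls this with start == 64 and
  // offsets == { 0, 32, 128, 160 }, which loads register pairs from
  // base+64, base+96, base+192 and base+224.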
 4988 
 4989   // store N vector registers into N/2 pairs of quadword memory
 4990   // locations via the address supplied in base with each pair indexed
  4991   // using the start offset plus the corresponding entry in the
 4992   // offsets array
 4993   template<int N>
 4994   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 4995     for (int i = 0; i < N/2; i++) {
 4996       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4997     }
 4998   }
 4999 
 5000   // load N single quadword values from memory into N vector registers
 5001   // via the address supplied in base with each value indexed using
  5002   // the start offset plus the corresponding entry in the offsets
 5003   // array
 5004   template<int N>
 5005   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5006                       int start, int (&offsets)[N]) {
 5007     for (int i = 0; i < N; i++) {
 5008       __ ldr(v[i], T, Address(base, start + offsets[i]));
 5009     }
 5010   }
 5011 
 5012   // store N vector registers into N single quadword memory locations
 5013   // via the address supplied in base with each value indexed using
  5014   // the start offset plus the corresponding entry in the offsets
 5015   // array
 5016   template<int N>
 5017   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5018                       int start, int (&offsets)[N]) {
 5019     for (int i = 0; i < N; i++) {
 5020       __ str(v[i], T, Address(base, start + offsets[i]));
 5021     }
 5022   }
 5023 
 5024   // load N/2 pairs of quadword values from memory de-interleaved into
 5025   // N vector registers 2 at a time via the address supplied in base
  5026   // with each pair indexed using the start offset plus the
 5027   // corresponding entry in the offsets array
 5028   template<int N>
 5029   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5030                       Register tmp, int start, int (&offsets)[N/2]) {
 5031     for (int i = 0; i < N/2; i++) {
 5032       __ add(tmp, base, start + offsets[i]);
 5033       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5034     }
 5035   }
 5036 
 5037   // store N vector registers 2 at a time interleaved into N/2 pairs
 5038   // of quadword memory locations via the address supplied in base
  5039   // with each pair indexed using the start offset plus the
 5040   // corresponding entry in the offsets array
 5041   template<int N>
 5042   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5043                       Register tmp, int start, int (&offsets)[N/2]) {
 5044     for (int i = 0; i < N/2; i++) {
 5045       __ add(tmp, base, start + offsets[i]);
 5046       __ st2(v[2*i], v[2*i+1], T, tmp);
 5047     }
 5048   }
 5049 
 5050   // Helper routines for various flavours of Montgomery multiply
 5051 
 5052   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 5053   // multiplications in parallel
 5054   //
 5055 
 5056   // See the montMul() method of the sun.security.provider.ML_DSA
 5057   // class.
 5058   //
  5059   // Computes 4x4S results or 4x8H results
 5060   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5061   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5062   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5063   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5064   // Outputs: va - 4x4S or 4x8H vector register sequences
 5065   // vb, vc, vtmp and vq must all be disjoint
 5066   // va must be disjoint from all other inputs/temps or must equal vc
 5067   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5068   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
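  //
  // A roughly equivalent scalar model of one 16-bit lane (illustrative only,
  // ignoring the exact rounding behaviour of sqdmulh/shsubv), with q = MONT_Q
  // and qinv = MONT_Q_INV_MOD_R so that q * qinv == 1 (mod 2^16):
  //   int16_t montmul(int16_t b, int16_t c) {
  //     int32_t prod = (int32_t)b * c;
  //     int16_t m    = (int16_t)prod * (int16_t)qinv;      // m = lo16(b*c) * qinv
  //     return (int16_t)((prod - (int32_t)m * q) >> 16);   // (b*c - m*q) / 2^16
  //   }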
 5069   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5070                    Assembler::SIMD_Arrangement T,
 5071                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5072     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5073     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5074     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5075     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5076 
 5077     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5078     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5079 
 5080     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5081 
 5082     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5083     assert(vs_disjoint(va, vb), "va and vb overlap");
 5084     assert(vs_disjoint(va, vq), "va and vq overlap");
 5085     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5086     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5087 
 5088     // schedule 4 streams of instructions across the vector sequences
 5089     for (int i = 0; i < 4; i++) {
 5090       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5091       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5092     }
 5093 
 5094     for (int i = 0; i < 4; i++) {
 5095       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5096     }
 5097 
 5098     for (int i = 0; i < 4; i++) {
 5099       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5100     }
 5101 
 5102     for (int i = 0; i < 4; i++) {
 5103       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5104     }
 5105   }
 5106 
  5107   // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
 5108   // multiplications in parallel
 5109   //
 5110 
 5111   // See the montMul() method of the sun.security.provider.ML_DSA
 5112   // class.
 5113   //
  5114   // Computes 2x4S results or 2x8H results
  5115   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  5116   // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  5117   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  5118   // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  5119   // Outputs: va - 2x4S or 2x8H vector register sequences
 5120   // vb, vc, vtmp and vq must all be disjoint
 5121   // va must be disjoint from all other inputs/temps or must equal vc
 5122   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5123   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5124   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5125                    Assembler::SIMD_Arrangement T,
 5126                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5127     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5128     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5129     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5130     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5131 
 5132     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5133     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5134 
 5135     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5136 
 5137     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5138     assert(vs_disjoint(va, vb), "va and vb overlap");
 5139     assert(vs_disjoint(va, vq), "va and vq overlap");
 5140     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5141     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5142 
 5143     // schedule 2 streams of instructions across the vector sequences
 5144     for (int i = 0; i < 2; i++) {
 5145       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5146       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5147     }
 5148 
 5149     for (int i = 0; i < 2; i++) {
 5150       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5151     }
 5152 
 5153     for (int i = 0; i < 2; i++) {
 5154       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5155     }
 5156 
 5157     for (int i = 0; i < 2; i++) {
 5158       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5159     }
 5160   }
 5161 
 5162   // Perform 16 16-bit Montgomery multiplications in parallel.
 5163   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5164                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5165     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5166     // It will assert that the register use is valid
 5167     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5168   }
 5169 
 5170   // Perform 32 16-bit Montgomery multiplications in parallel.
 5171   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5172                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5173     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5174     // It will assert that the register use is valid
 5175     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5176   }
 5177 
 5178   // Perform 64 16-bit Montgomery multiplications in parallel.
 5179   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5180                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5181     // Schedule two successive 4x8H multiplies via the montmul helper
 5182     // on the front and back halves of va, vb and vc. The helper will
 5183     // assert that the register use has no overlap conflicts on each
 5184     // individual call but we also need to ensure that the necessary
 5185     // disjoint/equality constraints are met across both calls.
 5186 
 5187     // vb, vc, vtmp and vq must be disjoint. va must either be
 5188     // disjoint from all other registers or equal vc
 5189 
 5190     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5191     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5192     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5193 
 5194     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5195     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5196 
 5197     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5198 
 5199     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5200     assert(vs_disjoint(va, vb), "va and vb overlap");
 5201     assert(vs_disjoint(va, vq), "va and vq overlap");
 5202     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5203 
 5204     // we multiply the front and back halves of each sequence 4 at a
 5205     // time because
 5206     //
 5207     // 1) we are currently only able to get 4-way instruction
 5208     // parallelism at best
 5209     //
 5210     // 2) we need registers for the constants in vq and temporary
 5211     // scratch registers to hold intermediate results so vtmp can only
 5212     // be a VSeq<4> which means we only have 4 scratch slots
 5213 
 5214     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5215     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5216   }
 5217 
 5218   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5219                                const VSeq<4>& vc,
 5220                                const VSeq<4>& vtmp,
 5221                                const VSeq<2>& vq) {
 5222     // compute a = montmul(a1, c)
 5223     kyber_montmul32(vc, va1, vc, vtmp, vq);
  5224     // output a1 = a0 - a
 5225     vs_subv(va1, __ T8H, va0, vc);
 5226     //    and a0 = a0 + a
 5227     vs_addv(va0, __ T8H, va0, vc);
 5228   }
 5229 
 5230   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5231                                const VSeq<4>& vb,
 5232                                const VSeq<4>& vtmp1,
 5233                                const VSeq<4>& vtmp2,
 5234                                const VSeq<2>& vq) {
 5235     // compute c = a0 - a1
 5236     vs_subv(vtmp1, __ T8H, va0, va1);
 5237     // output a0 = a0 + a1
 5238     vs_addv(va0, __ T8H, va0, va1);
 5239     // output a1 = b montmul c
 5240     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5241   }
 5242 
 5243   void load64shorts(const VSeq<8>& v, Register shorts) {
 5244     vs_ldpq_post(v, shorts);
 5245   }
 5246 
 5247   void load32shorts(const VSeq<4>& v, Register shorts) {
 5248     vs_ldpq_post(v, shorts);
 5249   }
 5250 
 5251   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5252     vs_stpq_post(v, tmpAddr);
 5253   }
 5254 
 5255   // Kyber NTT function.
 5256   // Implements
 5257   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5258   //
 5259   // coeffs (short[256]) = c_rarg0
 5260   // ntt_zetas (short[256]) = c_rarg1
 5261   address generate_kyberNtt() {
 5262 
 5263     __ align(CodeEntryAlignment);
 5264     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5265     StubCodeMark mark(this, stub_id);
 5266     address start = __ pc();
 5267     __ enter();
 5268 
 5269     const Register coeffs = c_rarg0;
 5270     const Register zetas = c_rarg1;
 5271 
 5272     const Register kyberConsts = r10;
 5273     const Register tmpAddr = r11;
 5274 
 5275     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5276     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5277     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5278 
 5279     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5280     // load the montmul constants
 5281     vs_ldpq(vq, kyberConsts);
 5282 
 5283     // Each level corresponds to an iteration of the outermost loop of the
 5284     // Java method seilerNTT(int[] coeffs). There are some differences
 5285     // from what is done in the seilerNTT() method, though:
  5286     // 1. The computation uses 16-bit signed values; we do not convert them
  5287     // to ints here.
  5288     // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
  5289     // this array for each level, which makes it easier to fill up the vector
  5290     // registers.
 5291     // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
 5292     // multiplications (this is because that way there should not be any
  5293     // overflow during the inverse NTT computation), here we use R = 2^16 so
 5294     // that we can use the 16-bit arithmetic in the vector unit.
 5295     //
 5296     // On each level, we fill up the vector registers in such a way that the
 5297     // array elements that need to be multiplied by the zetas go into one
 5298     // set of vector registers while the corresponding ones that don't need to
 5299     // be multiplied, go into another set.
 5300     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5301     // registers interleaving the steps of 4 identical computations,
 5302     // each done on 8 16-bit values per register.
 5303 
 5304     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5305     // to the zetas occur in discrete blocks whose size is some multiple
 5306     // of 32.
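    //
    // Expressed per coefficient pair this is the usual Cooley-Tukey butterfly
    // (illustrative scalar form):
    //   t               = montmul(zeta, coeffs[j + len]);
    //   coeffs[j + len] = coeffs[j] - t;
    //   coeffs[j]       = coeffs[j] + t;
    // The vector code below performs it 64 coefficients at a time.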
 5307 
 5308     // level 0
 5309     __ add(tmpAddr, coeffs, 256);
 5310     load64shorts(vs1, tmpAddr);
 5311     load64shorts(vs2, zetas);
 5312     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5313     __ add(tmpAddr, coeffs, 0);
 5314     load64shorts(vs1, tmpAddr);
 5315     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5316     vs_addv(vs1, __ T8H, vs1, vs2);
 5317     __ add(tmpAddr, coeffs, 0);
 5318     vs_stpq_post(vs1, tmpAddr);
 5319     __ add(tmpAddr, coeffs, 256);
 5320     vs_stpq_post(vs3, tmpAddr);
 5321     // restore montmul constants
 5322     vs_ldpq(vq, kyberConsts);
 5323     load64shorts(vs1, tmpAddr);
 5324     load64shorts(vs2, zetas);
 5325     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5326     __ add(tmpAddr, coeffs, 128);
 5327     load64shorts(vs1, tmpAddr);
 5328     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5329     vs_addv(vs1, __ T8H, vs1, vs2);
 5330     __ add(tmpAddr, coeffs, 128);
 5331     store64shorts(vs1, tmpAddr);
 5332     __ add(tmpAddr, coeffs, 384);
 5333     store64shorts(vs3, tmpAddr);
 5334 
 5335     // level 1
 5336     // restore montmul constants
 5337     vs_ldpq(vq, kyberConsts);
 5338     __ add(tmpAddr, coeffs, 128);
 5339     load64shorts(vs1, tmpAddr);
 5340     load64shorts(vs2, zetas);
 5341     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5342     __ add(tmpAddr, coeffs, 0);
 5343     load64shorts(vs1, tmpAddr);
 5344     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5345     vs_addv(vs1, __ T8H, vs1, vs2);
 5346     __ add(tmpAddr, coeffs, 0);
 5347     store64shorts(vs1, tmpAddr);
 5348     store64shorts(vs3, tmpAddr);
 5349     vs_ldpq(vq, kyberConsts);
 5350     __ add(tmpAddr, coeffs, 384);
 5351     load64shorts(vs1, tmpAddr);
 5352     load64shorts(vs2, zetas);
 5353     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5354     __ add(tmpAddr, coeffs, 256);
 5355     load64shorts(vs1, tmpAddr);
 5356     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5357     vs_addv(vs1, __ T8H, vs1, vs2);
 5358     __ add(tmpAddr, coeffs, 256);
 5359     store64shorts(vs1, tmpAddr);
 5360     store64shorts(vs3, tmpAddr);
 5361 
 5362     // level 2
 5363     vs_ldpq(vq, kyberConsts);
 5364     int offsets1[4] = { 0, 32, 128, 160 };
 5365     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5366     load64shorts(vs2, zetas);
 5367     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5368     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5369     // kyber_subv_addv64();
 5370     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5371     vs_addv(vs1, __ T8H, vs1, vs2);
 5372     __ add(tmpAddr, coeffs, 0);
 5373     vs_stpq_post(vs_front(vs1), tmpAddr);
 5374     vs_stpq_post(vs_front(vs3), tmpAddr);
 5375     vs_stpq_post(vs_back(vs1), tmpAddr);
 5376     vs_stpq_post(vs_back(vs3), tmpAddr);
 5377     vs_ldpq(vq, kyberConsts);
 5378     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5379     load64shorts(vs2, zetas);
 5380     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5381     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5382     // kyber_subv_addv64();
 5383     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5384     vs_addv(vs1, __ T8H, vs1, vs2);
 5385     __ add(tmpAddr, coeffs, 256);
 5386     vs_stpq_post(vs_front(vs1), tmpAddr);
 5387     vs_stpq_post(vs_front(vs3), tmpAddr);
 5388     vs_stpq_post(vs_back(vs1), tmpAddr);
 5389     vs_stpq_post(vs_back(vs3), tmpAddr);
 5390 
 5391     // level 3
 5392     vs_ldpq(vq, kyberConsts);
 5393     int offsets2[4] = { 0, 64, 128, 192 };
 5394     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5395     load64shorts(vs2, zetas);
 5396     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5397     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5398     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5399     vs_addv(vs1, __ T8H, vs1, vs2);
 5400     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5401     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5402 
 5403     vs_ldpq(vq, kyberConsts);
 5404     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5405     load64shorts(vs2, zetas);
 5406     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5407     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5408     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5409     vs_addv(vs1, __ T8H, vs1, vs2);
 5410     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5411     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5412 
 5413     // level 4
 5414     // At level 4 coefficients occur in 8 discrete blocks of size 16
 5415     // so they are loaded using an ldr at 8 distinct offsets.
 5416 
 5417     vs_ldpq(vq, kyberConsts);
 5418     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5419     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5420     load64shorts(vs2, zetas);
 5421     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5422     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5423     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5424     vs_addv(vs1, __ T8H, vs1, vs2);
 5425     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5426     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5427 
 5428     vs_ldpq(vq, kyberConsts);
 5429     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5430     load64shorts(vs2, zetas);
 5431     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5432     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5433     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5434     vs_addv(vs1, __ T8H, vs1, vs2);
 5435     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5436     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5437 
 5438     // level 5
 5439     // At level 5 related coefficients occur in discrete blocks of size 8, so
 5440     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5441 
 5442     vs_ldpq(vq, kyberConsts);
 5443     int offsets4[4] = { 0, 32, 64, 96 };
 5444     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5445     load32shorts(vs_front(vs2), zetas);
 5446     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5447     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5448     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5449     load32shorts(vs_front(vs2), zetas);
 5450     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5451     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5452     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5453     load32shorts(vs_front(vs2), zetas);
 5454     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5455     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5456 
 5457     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5458     load32shorts(vs_front(vs2), zetas);
 5459     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5460     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5461 
 5462     // level 6
 5463     // At level 6 related coefficients occur in discrete blocks of size 4, so
 5464     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5465 
 5466     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5467     load32shorts(vs_front(vs2), zetas);
 5468     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5469     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5470     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5471     // __ ldpq(v18, v19, __ post(zetas, 32));
 5472     load32shorts(vs_front(vs2), zetas);
 5473     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5474     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5475 
 5476     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5477     load32shorts(vs_front(vs2), zetas);
 5478     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5479     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5480 
 5481     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5482     load32shorts(vs_front(vs2), zetas);
 5483     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5484     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5485 
 5486     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5487     __ mov(r0, zr); // return 0
 5488     __ ret(lr);
 5489 
 5490     return start;
 5491   }
 5492 
 5493   // Kyber Inverse NTT function
 5494   // Implements
 5495   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5496   //
 5497   // coeffs (short[256]) = c_rarg0
 5498   // ntt_zetas (short[256]) = c_rarg1
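        //
        // As an illustrative scalar sketch (exposition-only names t, c, j, l
        // and zeta; montmul as sketched for the forward NTT), each inverse
        // (GS) butterfly below computes, per 16-bit lane,
        //
        //   t        = c[j] - c[j + l];
        //   c[j]     = c[j] + c[j + l];
        //   c[j + l] = montmul(zeta[k], t);
        //
        // followed by a final pass that montmuls every coefficient by
        // toMont(2^-n mod q), with Barrett reductions inserted where values
        // might otherwise overflow 16 bits.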
 5499   address generate_kyberInverseNtt() {
 5500 
 5501     __ align(CodeEntryAlignment);
 5502     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 5503     StubCodeMark mark(this, stub_id);
 5504     address start = __ pc();
 5505     __ enter();
 5506 
 5507     const Register coeffs = c_rarg0;
 5508     const Register zetas = c_rarg1;
 5509 
 5510     const Register kyberConsts = r10;
 5511     const Register tmpAddr = r11;
 5512     const Register tmpAddr2 = c_rarg2;
 5513 
 5514     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5515     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5516     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5517 
 5518     __ lea(kyberConsts,
 5519              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5520 
 5521     // level 0
 5522     // At level 0 related coefficients occur in discrete blocks of size 4, so
 5523     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5524 
 5525     vs_ldpq(vq, kyberConsts);
 5526     int offsets4[4] = { 0, 32, 64, 96 };
 5527     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5528     load32shorts(vs_front(vs2), zetas);
 5529     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5530                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5531     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5532     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5533     load32shorts(vs_front(vs2), zetas);
 5534     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5535                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5536     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5537     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5538     load32shorts(vs_front(vs2), zetas);
 5539     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5540                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5541     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5542     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5543     load32shorts(vs_front(vs2), zetas);
 5544     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5545                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5546     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5547 
 5548     // level 1
 5549     // At level 1 related coefficients occur in discrete blocks of size 8, so
 5550     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5551 
 5552     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5553     load32shorts(vs_front(vs2), zetas);
 5554     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5555                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5556     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5557     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5558     load32shorts(vs_front(vs2), zetas);
 5559     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5560                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5561     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5562 
 5563     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5564     load32shorts(vs_front(vs2), zetas);
 5565     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5566                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5567     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5568     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5569     load32shorts(vs_front(vs2), zetas);
 5570     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5571                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5572     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5573 
 5574     // level 2
 5575     // At level 2 coefficients occur in 8 discrete blocks of size 16
 5576     // so they are loaded using an ldr at 8 distinct offsets.
 5577 
 5578     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5579     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5580     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5581     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5582     vs_subv(vs1, __ T8H, vs1, vs2);
 5583     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5584     load64shorts(vs2, zetas);
 5585     vs_ldpq(vq, kyberConsts);
 5586     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5587     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5588 
 5589     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5590     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5591     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5592     vs_subv(vs1, __ T8H, vs1, vs2);
 5593     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5594     load64shorts(vs2, zetas);
 5595     vs_ldpq(vq, kyberConsts);
 5596     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5597     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5598 
 5599     // Barrett reduction at indexes where overflow may happen
 5600 
 5601     // load q and the multiplier for the Barrett reduction
 5602     __ add(tmpAddr, kyberConsts, 16);
 5603     vs_ldpq(vq, tmpAddr);
 5604 
 5605     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5606     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5607     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
 5608     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5609     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5610     vs_sshr(vs2, __ T8H, vs2, 11);
 5611     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5612     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5613     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5614     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5615     vs_sshr(vs2, __ T8H, vs2, 11);
 5616     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5617     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5618 
 5619     // level 3
 5620     // From level 3 upwards coefficients occur in discrete blocks whose size is
 5621     // some multiple of 32, so they can be loaded using ldpq and suitable indexes.
 5622 
 5623     int offsets2[4] = { 0, 64, 128, 192 };
 5624     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5625     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5626     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5627     vs_subv(vs1, __ T8H, vs1, vs2);
 5628     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5629     load64shorts(vs2, zetas);
 5630     vs_ldpq(vq, kyberConsts);
 5631     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5632     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5633 
 5634     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5635     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5636     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5637     vs_subv(vs1, __ T8H, vs1, vs2);
 5638     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5639     load64shorts(vs2, zetas);
 5640     vs_ldpq(vq, kyberConsts);
 5641     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5642     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5643 
 5644     // level 4
 5645 
 5646     int offsets1[4] = { 0, 32, 128, 160 };
 5647     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5648     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5649     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5650     vs_subv(vs1, __ T8H, vs1, vs2);
 5651     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5652     load64shorts(vs2, zetas);
 5653     vs_ldpq(vq, kyberConsts);
 5654     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5655     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5656 
 5657     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5658     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5659     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5660     vs_subv(vs1, __ T8H, vs1, vs2);
 5661     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5662     load64shorts(vs2, zetas);
 5663     vs_ldpq(vq, kyberConsts);
 5664     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5665     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5666 
 5667     // level 5
 5668 
 5669     __ add(tmpAddr, coeffs, 0);
 5670     load64shorts(vs1, tmpAddr);
 5671     __ add(tmpAddr, coeffs, 128);
 5672     load64shorts(vs2, tmpAddr);
 5673     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5674     vs_subv(vs1, __ T8H, vs1, vs2);
 5675     __ add(tmpAddr, coeffs, 0);
 5676     store64shorts(vs3, tmpAddr);
 5677     load64shorts(vs2, zetas);
 5678     vs_ldpq(vq, kyberConsts);
 5679     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5680     __ add(tmpAddr, coeffs, 128);
 5681     store64shorts(vs2, tmpAddr);
 5682 
 5683     load64shorts(vs1, tmpAddr);
 5684     __ add(tmpAddr, coeffs, 384);
 5685     load64shorts(vs2, tmpAddr);
 5686     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5687     vs_subv(vs1, __ T8H, vs1, vs2);
 5688     __ add(tmpAddr, coeffs, 256);
 5689     store64shorts(vs3, tmpAddr);
 5690     load64shorts(vs2, zetas);
 5691     vs_ldpq(vq, kyberConsts);
 5692     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5693     __ add(tmpAddr, coeffs, 384);
 5694     store64shorts(vs2, tmpAddr);
 5695 
 5696     // Barrett reduction at indexes where overflow may happen
 5697 
 5698     // load q and the multiplier for the Barrett reduction
 5699     __ add(tmpAddr, kyberConsts, 16);
 5700     vs_ldpq(vq, tmpAddr);
 5701 
 5702     int offsets0[2] = { 0, 256 };
 5703     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5704     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5705     vs_sshr(vs2, __ T8H, vs2, 11);
 5706     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5707     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5708 
 5709     // level 6
 5710 
 5711     __ add(tmpAddr, coeffs, 0);
 5712     load64shorts(vs1, tmpAddr);
 5713     __ add(tmpAddr, coeffs, 256);
 5714     load64shorts(vs2, tmpAddr);
 5715     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5716     vs_subv(vs1, __ T8H, vs1, vs2);
 5717     __ add(tmpAddr, coeffs, 0);
 5718     store64shorts(vs3, tmpAddr);
 5719     load64shorts(vs2, zetas);
 5720     vs_ldpq(vq, kyberConsts);
 5721     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5722     __ add(tmpAddr, coeffs, 256);
 5723     store64shorts(vs2, tmpAddr);
 5724 
 5725     __ add(tmpAddr, coeffs, 128);
 5726     load64shorts(vs1, tmpAddr);
 5727     __ add(tmpAddr, coeffs, 384);
 5728     load64shorts(vs2, tmpAddr);
 5729     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5730     vs_subv(vs1, __ T8H, vs1, vs2);
 5731     __ add(tmpAddr, coeffs, 128);
 5732     store64shorts(vs3, tmpAddr);
 5733     load64shorts(vs2, zetas);
 5734     vs_ldpq(vq, kyberConsts);
 5735     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5736     __ add(tmpAddr, coeffs, 384);
 5737     store64shorts(vs2, tmpAddr);
 5738 
 5739     // multiply by 2^-n
 5740 
 5741     // load toMont(2^-n mod q)
 5742     __ add(tmpAddr, kyberConsts, 48);
 5743     __ ldr(v29, __ Q, tmpAddr);
 5744 
 5745     vs_ldpq(vq, kyberConsts);
 5746     __ add(tmpAddr, coeffs, 0);
 5747     load64shorts(vs1, tmpAddr);
 5748     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5749     __ add(tmpAddr, coeffs, 0);
 5750     store64shorts(vs2, tmpAddr);
 5751 
 5752     // now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
 5753     load64shorts(vs1, tmpAddr);
 5754     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5755     __ add(tmpAddr, coeffs, 128);
 5756     store64shorts(vs2, tmpAddr);
 5757 
 5758     // now tmpAddr contains coeffs + 256
 5759     load64shorts(vs1, tmpAddr);
 5760     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5761     __ add(tmpAddr, coeffs, 256);
 5762     store64shorts(vs2, tmpAddr);
 5763 
 5764     // now tmpAddr contains coeffs + 384
 5765     load64shorts(vs1, tmpAddr);
 5766     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5767     __ add(tmpAddr, coeffs, 384);
 5768     store64shorts(vs2, tmpAddr);
 5769 
 5770     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5771     __ mov(r0, zr); // return 0
 5772     __ ret(lr);
 5773 
 5774     return start;
 5775   }
 5776 
 5777   // Kyber multiply polynomials in the NTT domain.
 5778   // Implements
 5779   // static int implKyberNttMult(
 5780   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5781   //
 5782   // result (short[256]) = c_rarg0
 5783   // ntta (short[256]) = c_rarg1
 5784   // nttb (short[256]) = c_rarg2
 5785   // zetas (short[128]) = c_rarg3
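        //
        // As an illustrative scalar sketch (exposition-only names a0, a1, b0,
        // b1, r0, r1, zeta; montmul as sketched for the forward NTT), each
        // pair of result coefficients produced by the loop below is
        //
        //   r0 = montmul(a0, b0) + montmul(montmul(a1, b1), zeta);
        //   r1 = montmul(a0, b1) + montmul(a1, b0);
        //   r0 = montmul(r0, montRSquareModQ);
        //   r1 = montmul(r1, montRSquareModQ);
        //
        // where the final multiplications by montRSquareModQ (R^2 mod q)
        // convert the results back from Montgomery representation as noted
        // where that constant is loaded below.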
 5786   address generate_kyberNttMult() {
 5787 
 5788     __ align(CodeEntryAlignment);
 5789     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 5790     StubCodeMark mark(this, stub_id);
 5791     address start = __ pc();
 5792     __ enter();
 5793 
 5794     const Register result = c_rarg0;
 5795     const Register ntta = c_rarg1;
 5796     const Register nttb = c_rarg2;
 5797     const Register zetas = c_rarg3;
 5798 
 5799     const Register kyberConsts = r10;
 5800     const Register limit = r11;
 5801 
 5802     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 5803     VSeq<4> vs3(16), vs4(20);
 5804     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5805     VSeq<2> vz(28);          // pair of zetas
 5806     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5807 
 5808     __ lea(kyberConsts,
 5809              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5810 
 5811     Label kyberNttMult_loop;
 5812 
 5813     __ add(limit, result, 512);
 5814 
 5815     // load q and qinv
 5816     vs_ldpq(vq, kyberConsts);
 5817 
 5818     // load R^2 mod q (to convert back from Montgomery representation)
 5819     __ add(kyberConsts, kyberConsts, 64);
 5820     __ ldr(v27, __ Q, kyberConsts);
 5821 
 5822     __ BIND(kyberNttMult_loop);
 5823 
 5824     // load 16 zetas
 5825     vs_ldpq_post(vz, zetas);
 5826 
 5827     // load 2 sets of 32 coefficients from the two input arrays
 5828     // interleaved as shorts, i.e. pairs of shorts adjacent in memory
 5829     // are striped across pairs of vector registers
 5830     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5831     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5832     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5833     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 5834 
 5835     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5836     // i.e. montmul the first and second halves of vs1 in order and
 5837     // then with one sequence reversed storing the two results in vs3
 5838     //
 5839     // vs3[0] <- montmul(a0, b0)
 5840     // vs3[1] <- montmul(a1, b1)
 5841     // vs3[2] <- montmul(a0, b1)
 5842     // vs3[3] <- montmul(a1, b0)
 5843     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5844     kyber_montmul16(vs_back(vs3),
 5845                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5846 
 5847     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5848     // i.e. montmul the first and second halves of vs4 in order and
 5849     // then with one sequence reversed storing the two results in vs1
 5850     //
 5851     // vs1[0] <- montmul(a2, b2)
 5852     // vs1[1] <- montmul(a3, b3)
 5853     // vs1[2] <- montmul(a2, b3)
 5854     // vs1[3] <- montmul(a3, b2)
 5855     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5856     kyber_montmul16(vs_back(vs1),
 5857                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5858 
 5859     // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 5860     // We can schedule two montmuls at a time if we use a suitable vector
 5861     // sequence <vs3[1], vs1[1]>.
 5862     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5863     VSeq<2> vs5(vs3[1], delta);
 5864 
 5865     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5866     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5867     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5868 
 5869     // add results in pairs storing in vs3
 5870     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5871     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5872     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5873 
 5874     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5875     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5876     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 5877 
 5878     // vs1 <- montmul(vs3, montRSquareModQ)
 5879     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5880 
 5881     // store back the two pairs of result vectors de-interleaved as 8H elements
 5882     // i.e. storing each pair of shorts striped across a register pair adjacent
 5883     // in memory
 5884     vs_st2_post(vs1, __ T8H, result);
 5885 
 5886     __ cmp(result, limit);
 5887     __ br(Assembler::NE, kyberNttMult_loop);
 5888 
 5889     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5890     __ mov(r0, zr); // return 0
 5891     __ ret(lr);
 5892 
 5893     return start;
 5894   }
 5895 
 5896   // Kyber add 2 polynomials.
 5897   // Implements
 5898   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5899   //
 5900   // result (short[256]) = c_rarg0
 5901   // a (short[256]) = c_rarg1
 5902   // b (short[256]) = c_rarg2
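        //
        // Illustrative scalar equivalent of the loop below (r, a, b and i are
        // exposition-only names; q is the constant loaded from _kyberConsts
        // into v31 below):
        //
        //   for (int i = 0; i < 256; i++) {
        //     r[i] = (short)(a[i] + b[i] + q);
        //   }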
 5903   address generate_kyberAddPoly_2() {
 5904 
 5905     __ align(CodeEntryAlignment);
 5906     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 5907     StubCodeMark mark(this, stub_id);
 5908     address start = __ pc();
 5909     __ enter();
 5910 
 5911     const Register result = c_rarg0;
 5912     const Register a = c_rarg1;
 5913     const Register b = c_rarg2;
 5914 
 5915     const Register kyberConsts = r11;
 5916 
 5917     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5918     // So, we can load, add and store the data in 3 groups of 11,
 5919     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5920     // registers. A further constraint is that the mapping needs
 5921     // to skip callee saves. So, we allocate the register
 5922     // sequences using two 8 sequences, two 2 sequences and two
 5923     // single registers.
 5924     VSeq<8> vs1_1(0);
 5925     VSeq<2> vs1_2(16);
 5926     FloatRegister vs1_3 = v28;
 5927     VSeq<8> vs2_1(18);
 5928     VSeq<2> vs2_2(26);
 5929     FloatRegister vs2_3 = v29;
 5930 
 5931     // two constant vector sequences
 5932     VSeq<8> vc_1(31, 0);
 5933     VSeq<2> vc_2(31, 0);
 5934 
 5935     FloatRegister vc_3 = v31;
 5936     __ lea(kyberConsts,
 5937              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5938 
 5939     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5940     for (int i = 0; i < 3; i++) {
 5941       // load 80 or 88 values from a into vs1_1/2/3
 5942       vs_ldpq_post(vs1_1, a);
 5943       vs_ldpq_post(vs1_2, a);
 5944       if (i < 2) {
 5945         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5946       }
 5947       // load 80 or 88 values from b into vs2_1/2/3
 5948       vs_ldpq_post(vs2_1, b);
 5949       vs_ldpq_post(vs2_2, b);
 5950       if (i < 2) {
 5951         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5952       }
 5953       // sum 80 or 88 values across vs1 and vs2 into vs1
 5954       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5955       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5956       if (i < 2) {
 5957         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5958       }
 5959       // add constant to all 80 or 88 results
 5960       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5961       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5962       if (i < 2) {
 5963         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5964       }
 5965       // store 80 or 88 values
 5966       vs_stpq_post(vs1_1, result);
 5967       vs_stpq_post(vs1_2, result);
 5968       if (i < 2) {
 5969         __ str(vs1_3, __ Q, __ post(result, 16));
 5970       }
 5971     }
 5972 
 5973     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5974     __ mov(r0, zr); // return 0
 5975     __ ret(lr);
 5976 
 5977     return start;
 5978   }
 5979 
 5980   // Kyber add 3 polynomials.
 5981   // Implements
 5982   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 5983   //
 5984   // result (short[256]) = c_rarg0
 5985   // a (short[256]) = c_rarg1
 5986   // b (short[256]) = c_rarg2
 5987   // c (short[256]) = c_rarg3
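        //
        // As for the 2-argument case above, an illustrative scalar view of the
        // loop below is (exposition-only names)
        //
        //   r[i] = (short)(a[i] + b[i] + c[i] + constant);
        //
        // where the constant is the 8H value loaded from _kyberConsts into
        // v31 below.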
 5988   address generate_kyberAddPoly_3() {
 5989 
 5990     __ align(CodeEntryAlignment);
 5991     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 5992     StubCodeMark mark(this, stub_id);
 5993     address start = __ pc();
 5994     __ enter();
 5995 
 5996     const Register result = c_rarg0;
 5997     const Register a = c_rarg1;
 5998     const Register b = c_rarg2;
 5999     const Register c = c_rarg3;
 6000 
 6001     const Register kyberConsts = r11;
 6002 
 6003     // As above we sum 256 sets of values in total i.e. 32 x 8H
 6004     // quadwords.  So, we can load, add and store the data in 3
 6005     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6006     // of 10 or 11 registers. A further constraint is that the
 6007     // mapping needs to skip callee saves. So, we allocate the
 6008     // register sequences using two 8 sequences, two 2 sequences
 6009     // and two single registers.
 6010     VSeq<8> vs1_1(0);
 6011     VSeq<2> vs1_2(16);
 6012     FloatRegister vs1_3 = v28;
 6013     VSeq<8> vs2_1(18);
 6014     VSeq<2> vs2_2(26);
 6015     FloatRegister vs2_3 = v29;
 6016 
 6017     // two constant vector sequences
 6018     VSeq<8> vc_1(31, 0);
 6019     VSeq<2> vc_2(31, 0);
 6020 
 6021     FloatRegister vc_3 = v31;
 6022 
 6023     __ lea(kyberConsts,
 6024              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6025 
 6026     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6027     for (int i = 0; i < 3; i++) {
 6028       // load 80 or 88 values from a into vs1_1/2/3
 6029       vs_ldpq_post(vs1_1, a);
 6030       vs_ldpq_post(vs1_2, a);
 6031       if (i < 2) {
 6032         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6033       }
 6034       // load 80 or 88 values from b into vs2_1/2/3
 6035       vs_ldpq_post(vs2_1, b);
 6036       vs_ldpq_post(vs2_2, b);
 6037       if (i < 2) {
 6038         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6039       }
 6040       // sum 80 or 88 values across vs1 and vs2 into vs1
 6041       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6042       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6043       if (i < 2) {
 6044         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6045       }
 6046       // load 80 or 88 values from c into vs2_1/2/3
 6047       vs_ldpq_post(vs2_1, c);
 6048       vs_ldpq_post(vs2_2, c);
 6049       if (i < 2) {
 6050         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6051       }
 6052       // sum 80 or 88 values across vs1 and vs2 into vs1
 6053       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6054       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6055       if (i < 2) {
 6056         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6057       }
 6058       // add constant to all 80 or 88 results
 6059       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6060       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6061       if (i < 2) {
 6062         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6063       }
 6064       // store 80 or 88 values
 6065       vs_stpq_post(vs1_1, result);
 6066       vs_stpq_post(vs1_2, result);
 6067       if (i < 2) {
 6068         __ str(vs1_3, __ Q, __ post(result, 16));
 6069       }
 6070     }
 6071 
 6072     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6073     __ mov(r0, zr); // return 0
 6074     __ ret(lr);
 6075 
 6076     return start;
 6077   }
 6078 
 6079   // Kyber parse XOF output to polynomial coefficient candidates
 6080   // or decodePoly(12, ...).
 6081   // Implements
 6082   // static int implKyber12To16(
 6083   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6084   //
 6085   // (parsedLength or (parsedLength - 48) must be divisible by 64.)
 6086   //
 6087   // condensed (byte[]) = c_rarg0
 6088   // condensedIndex = c_rarg1
 6089   // parsed (short[112 or 256]) = c_rarg2
 6090   // parsedLength (112 or 256) = c_rarg3
 6091   address generate_kyber12To16() {
 6092     Label L_F00, L_loop, L_end;
 6093 
 6094     __ align(CodeEntryAlignment);
 6095     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6096     StubCodeMark mark(this, stub_id);
 6097     address start = __ pc();
 6098     __ enter();
 6099 
 6100     const Register condensed = c_rarg0;
 6101     const Register condensedOffs = c_rarg1;
 6102     const Register parsed = c_rarg2;
 6103     const Register parsedLength = c_rarg3;
 6104 
 6105     const Register tmpAddr = r11;
 6106 
 6107     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6108     // quadwords so we need a 6 vector sequence for the inputs.
 6109     // Parsing produces 64 shorts, employing two 8 vector
 6110     // sequences to store and combine the intermediate data.
 6111     VSeq<6> vin(24);
 6112     VSeq<8> va(0), vb(16);
 6113 
 6114     __ adr(tmpAddr, L_F00);
 6115     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6116     __ add(condensed, condensed, condensedOffs);
 6117 
 6118     __ BIND(L_loop);
 6119     // load 96 (6 x 16B) byte values
 6120     vs_ld3_post(vin, __ T16B, condensed);
 6121 
 6122     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6123     // holds 48 (16x3) contiguous bytes from memory striped
 6124     // horizontally across each of the 16 byte lanes. Equivalently,
 6125     // that is 16 pairs of 12-bit integers. Likewise the back half
 6126     // holds the next 48 bytes in the same arrangement.
 6127 
 6128     // Each vector in the front half can also be viewed as a vertical
 6129     // strip across the 16 pairs of 12 bit integers. Each byte in
 6130     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6131     // byte in vin[1] stores the high 4 bits of the first int and the
 6132     // low 4 bits of the second int. Each byte in vin[2] stores the
 6133     // high 8 bits of the second int. Likewise for the vectors in the
 6134     // second half.
 6135 
 6136     // Converting the data to 16-bit shorts requires first of all
 6137     // expanding each of the 6 x 16B vectors into 6 corresponding
 6138     // pairs of 8H vectors. Mask, shift and add operations on the
 6139     // resulting vector pairs can be used to combine 4 and 8 bit
 6140     // parts of related 8H vector elements.
 6141     //
 6142     // The middle vectors (vin[2] and vin[5]) are actually expanded
 6143     // twice, one copy manipulated to provide the lower 4 bits
 6144     // belonging to the first short in a pair and another copy
 6145     // manipulated to provide the higher 4 bits belonging to the
 6146     // second short in a pair. This is why the vector sequences va
 6147     // and vb used to hold the expanded 8H elements are of length 8.
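          //
          // As a scalar sketch, each group of three input bytes b0, b1, b2
          // (exposition-only names) expands into two 16-bit outputs
          //
          //   s0 = (short)( b0       | ((b1 & 0x0f) << 8));  // 1st 12-bit int
          //   s1 = (short)((b1 >> 4) |  (b2 << 4));          // 2nd 12-bit int
          //
          // and the ushll/shl/ushr/andr/addv sequence below computes the same
          // thing 64 lanes at a time.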
 6148 
 6149     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6150     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6151     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6152     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6153     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6154     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6155     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6156     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6157 
 6158     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6159     // and vb[4:5]
 6160     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6161     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6162     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6163     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6164     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6165     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6166 
 6167     // shift lo byte of copy 1 of the middle stripe into the high byte
 6168     __ shl(va[2], __ T8H, va[2], 8);
 6169     __ shl(va[3], __ T8H, va[3], 8);
 6170     __ shl(vb[2], __ T8H, vb[2], 8);
 6171     __ shl(vb[3], __ T8H, vb[3], 8);
 6172 
 6173     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6174     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6175     // are in bit positions [4..11].
 6176     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6177     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6178     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6179     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6180 
 6181     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6182     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6183     // copy2
 6184     __ andr(va[2], __ T16B, va[2], v31);
 6185     __ andr(va[3], __ T16B, va[3], v31);
 6186     __ ushr(va[4], __ T8H, va[4], 4);
 6187     __ ushr(va[5], __ T8H, va[5], 4);
 6188     __ andr(vb[2], __ T16B, vb[2], v31);
 6189     __ andr(vb[3], __ T16B, vb[3], v31);
 6190     __ ushr(vb[4], __ T8H, vb[4], 4);
 6191     __ ushr(vb[5], __ T8H, vb[5], 4);
 6192 
 6193     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6194     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6195     // n.b. the ordering ensures: i) inputs are consumed before they
 6196     // are overwritten ii) the order of 16-bit results across successive
 6197     // pairs of vectors in va and then vb reflects the order of the
 6198     // corresponding 12-bit inputs
 6199     __ addv(va[0], __ T8H, va[0], va[2]);
 6200     __ addv(va[2], __ T8H, va[1], va[3]);
 6201     __ addv(va[1], __ T8H, va[4], va[6]);
 6202     __ addv(va[3], __ T8H, va[5], va[7]);
 6203     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6204     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6205     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6206     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6207 
 6208     // store 64 results interleaved as shorts
 6209     vs_st2_post(vs_front(va), __ T8H, parsed);
 6210     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6211 
 6212     __ sub(parsedLength, parsedLength, 64);
 6213     __ cmp(parsedLength, (u1)64);
 6214     __ br(Assembler::GE, L_loop);
 6215     __ cbz(parsedLength, L_end);
 6216 
 6217     // if anything is left it should be a final 72 bytes of input,
 6218     // i.e. the final 48 12-bit values. So we handle this by loading
 6219     // 48 bytes into all 16B lanes of front(vin) and only 24
 6220     // bytes into the lower 8B lane of back(vin)
 6221     vs_ld3_post(vs_front(vin), __ T16B, condensed);
 6222     vs_ld3(vs_back(vin), __ T8B, condensed);
 6223 
 6224     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6225     // n.b. target elements 2 and 3 of va duplicate elements 4 and
 6226     // 5 and target element 2 of vb duplicates element 4.
 6227     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6228     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6229     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6230     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6231     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6232     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6233 
 6234     // This time expand just the lower 8 lanes
 6235     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6236     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6237     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6238 
 6239     // shift lo byte of copy 1 of the middle stripe into the high byte
 6240     __ shl(va[2], __ T8H, va[2], 8);
 6241     __ shl(va[3], __ T8H, va[3], 8);
 6242     __ shl(vb[2], __ T8H, vb[2], 8);
 6243 
 6244     // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
 6245     // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
 6246     // int are in bit positions [4..11].
 6247     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6248     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6249     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6250 
 6251     // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
 6252     // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
 6253     // copy2
 6254     __ andr(va[2], __ T16B, va[2], v31);
 6255     __ andr(va[3], __ T16B, va[3], v31);
 6256     __ ushr(va[4], __ T8H, va[4], 4);
 6257     __ ushr(va[5], __ T8H, va[5], 4);
 6258     __ andr(vb[2], __ T16B, vb[2], v31);
 6259     __ ushr(vb[4], __ T8H, vb[4], 4);
 6260
 6263     // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
 6264     // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
 6265 
 6266     // n.b. ordering ensures: i) inputs are consumed before they are
 6267     // overwritten ii) order of 16-bit results across successive
 6268     // pairs of vectors in va and then lower half of vb reflects order
 6269     // of corresponding 12-bit inputs
 6270     __ addv(va[0], __ T8H, va[0], va[2]);
 6271     __ addv(va[2], __ T8H, va[1], va[3]);
 6272     __ addv(va[1], __ T8H, va[4], va[6]);
 6273     __ addv(va[3], __ T8H, va[5], va[7]);
 6274     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6275     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6276 
 6277     // store 48 results interleaved as shorts
 6278     vs_st2_post(vs_front(va), __ T8H, parsed);
 6279     vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
 6280 
 6281     __ BIND(L_end);
 6282 
 6283     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6284     __ mov(r0, zr); // return 0
 6285     __ ret(lr);
 6286 
 6287     // bind label and generate constant data used by this stub
 6288     __ BIND(L_F00);
 6289     __ emit_int64(0x0f000f000f000f00);
 6290     __ emit_int64(0x0f000f000f000f00);
 6291 
 6292     return start;
 6293   }
 6294 
 6295   // Kyber Barrett reduce function.
 6296   // Implements
 6297   // static int implKyberBarrettReduce(short[] coeffs) {}
 6298   //
 6299   // coeffs (short[256]) = c_rarg0
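        //
        // Illustrative scalar sketch of the reduction applied to each 16-bit
        // lane below (c and t are exposition-only names; kyber_q and
        // kyberBarrettMultiplier are the constants loaded from _kyberConsts):
        //
        //   int16_t t = (int16_t)(((int32_t)c * kyberBarrettMultiplier) >> 26);
        //   c = (int16_t)(c - t * kyber_q);
        //
        // realized below with sqdmulh (2 * x * y >> 16), sshr #11 and mlsv.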
 6300   address generate_kyberBarrettReduce() {
 6301 
 6302     __ align(CodeEntryAlignment);
 6303     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6304     StubCodeMark mark(this, stub_id);
 6305     address start = __ pc();
 6306     __ enter();
 6307 
 6308     const Register coeffs = c_rarg0;
 6309 
 6310     const Register kyberConsts = r10;
 6311     const Register result = r11;
 6312 
 6313     // As above we process 256 sets of values in total i.e. 32 x
 6314     // 8H quadwords. So, we can load, add and store the data in 3
 6315     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6316     // of 10 or 11 registers. A further constraint is that the
 6317     // mapping needs to skip callee saves. So, we allocate the
 6318     // register sequences using two 8 sequences, two 2 sequences
 6319     // and two single registers.
 6320     VSeq<8> vs1_1(0);
 6321     VSeq<2> vs1_2(16);
 6322     FloatRegister vs1_3 = v28;
 6323     VSeq<8> vs2_1(18);
 6324     VSeq<2> vs2_2(26);
 6325     FloatRegister vs2_3 = v29;
 6326 
 6327     // we also need a pair of corresponding constant sequences
 6328 
 6329     VSeq<8> vc1_1(30, 0);
 6330     VSeq<2> vc1_2(30, 0);
 6331     FloatRegister vc1_3 = v30; // for kyber_q
 6332 
 6333     VSeq<8> vc2_1(31, 0);
 6334     VSeq<2> vc2_2(31, 0);
 6335     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6336 
 6337     __ add(result, coeffs, 0);
 6338     __ lea(kyberConsts,
 6339              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6340 
 6341     // load q and the multiplier for the Barrett reduction
 6342     __ add(kyberConsts, kyberConsts, 16);
 6343     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6344 
 6345     for (int i = 0; i < 3; i++) {
 6346       // load 80 or 88 coefficients
 6347       vs_ldpq_post(vs1_1, coeffs);
 6348       vs_ldpq_post(vs1_2, coeffs);
 6349       if (i < 2) {
 6350         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6351       }
 6352 
 6353       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6354       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6355       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6356       if (i < 2) {
 6357         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6358       }
 6359 
 6360       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6361       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6362       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6363       if (i < 2) {
 6364         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6365       }
 6366 
 6367       // vs1 <- vs1 - vs2 * kyber_q
 6368       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6369       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6370       if (i < 2) {
 6371         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6372       }
 6373 
 6374       vs_stpq_post(vs1_1, result);
 6375       vs_stpq_post(vs1_2, result);
 6376       if (i < 2) {
 6377         __ str(vs1_3, __ Q, __ post(result, 16));
 6378       }
 6379     }
 6380 
 6381     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6382     __ mov(r0, zr); // return 0
 6383     __ ret(lr);
 6384 
 6385     return start;
 6386   }
 6387 
 6388 
 6389   // Dilithium-specific montmul helper routines that generate parallel
 6390   // code for, respectively, a single 4x4s vector sequence montmul or
 6391   // two such multiplies in a row.
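        //
        // As an illustrative scalar sketch (not the generated instruction
        // sequence), a 32-bit Montgomery multiply of the kind scheduled here
        // can be written, with exposition-only names and R == 2^32, as
        //
        //   int32_t montmul(int32_t a, int32_t b) { // a * b * R^-1 mod q
        //     int64_t p = (int64_t)a * b;
        //     int32_t m = (int32_t)p * qinv;        // qinv == q^-1 mod 2^32
        //     return (int32_t)((p - (int64_t)m * q) >> 32);
        //   }
        //
        // where q and qinv are the Dilithium constants loaded into vq by the
        // callers.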
 6392 
 6393   // Perform 16 32-bit Montgomery multiplications in parallel
 6394   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6395                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6396     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6397     // It will assert that the register use is valid
 6398     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6399   }
 6400 
 6401   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6402   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6403                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6404     // Schedule two successive 4x4S multiplies via the montmul helper
 6405     // on the front and back halves of va, vb and vc. The helper will
 6406     // assert that the register use has no overlap conflicts on each
 6407     // individual call but we also need to ensure that the necessary
 6408     // disjoint/equality constraints are met across both calls.
 6409 
 6410     // vb, vc, vtmp and vq must be disjoint. va must either be
 6411     // disjoint from all other registers or equal vc
 6412 
 6413     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6414     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6415     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6416 
 6417     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6418     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6419 
 6420     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6421 
 6422     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6423     assert(vs_disjoint(va, vb), "va and vb overlap");
 6424     assert(vs_disjoint(va, vq), "va and vq overlap");
 6425     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6426 
 6427     // We multiply the front and back halves of each sequence 4 at a
 6428     // time because
 6429     //
 6430     // 1) we are currently only able to get 4-way instruction
 6431     // parallelism at best
 6432     //
 6433     // 2) we need registers for the constants in vq and temporary
 6434     // scratch registers to hold intermediate results so vtmp can only
 6435     // be a VSeq<4> which means we only have 4 scratch slots.
 6436 
 6437     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6438     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6439   }
 6440 
 6441   // Perform combined montmul then add/sub on 4x4S vectors.
 6442   void dilithium_montmul16_sub_add(
 6443           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6444           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6445     // compute a = montmul(a1, c)
 6446     dilithium_montmul16(vc, va1, vc, vtmp, vq);
 6447     // output a1 = a0 - a
 6448     vs_subv(va1, __ T4S, va0, vc);
 6449     //    and a0 = a0 + a
 6450     vs_addv(va0, __ T4S, va0, vc);
 6451   }
 6452 
 6453   // Perform combined add/sub then montmul on 4x4S vectors.
 6454   void dilithium_sub_add_montmul16(
 6455           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6456           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6457     // compute c = a0 - a1
 6458     vs_subv(vtmp1, __ T4S, va0, va1);
 6459     // output a0 = a0 + a1
 6460     vs_addv(va0, __ T4S, va0, va1);
 6461     // output a1 = b montmul c
 6462     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6463   }
 6464 
 6465   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6466   // in the Java implementation come in sequences of at least 8, so we
 6467   // can use ldpq to collect the corresponding data into pairs of vector
 6468   // registers.
 6469   // We collect the coefficients corresponding to the 'j+l' indexes into
 6470   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6471   // then we do the (Montgomery) multiplications by the zetas in parallel
 6472   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6473   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6474   // v0-v7 and finally save the results back to the coeffs array.
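        //
        // Per 32-bit lane this is the usual forward (CT) butterfly; as an
        // exposition-only scalar sketch, with montmul as sketched above,
        //
        //   t        = montmul(zeta[k], c[j + l]);
        //   c[j + l] = c[j] - t;
        //   c[j]     = c[j] + t;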
 6475   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6476     const Register coeffs, const Register zetas) {
 6477     int c1 = 0;
 6478     int c2 = 512;
 6479     int startIncr;
 6480     // don't use callee save registers v8 - v15
 6481     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6482     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6483     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6484     int offsets[4] = { 0, 32, 64, 96 };
 6485 
 6486     for (int level = 0; level < 5; level++) {
 6487       int c1Start = c1;
 6488       int c2Start = c2;
 6489       if (level == 3) {
 6490         offsets[1] = 32;
 6491         offsets[2] = 128;
 6492         offsets[3] = 160;
 6493       } else if (level == 4) {
 6494         offsets[1] = 64;
 6495         offsets[2] = 128;
 6496         offsets[3] = 192;
 6497       }
 6498 
 6499       // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
 6500       // time at 4 different offsets and multiply them in order by the
 6501       // next set of input values. So we employ indexed load and store
 6502       // pair instructions with arrangement 4S.
 6503       for (int i = 0; i < 4; i++) {
 6504         // reload q and qinv
 6505         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6506         // load 8x4S coefficients via second start pos == c2
 6507         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6508         // load next 8x4S inputs == b
 6509         vs_ldpq_post(vs2, zetas);
 6510         // compute a == c2 * b mod MONT_Q
 6511         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6512         // load 8x4s coefficients via first start pos == c1
 6513         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6514         // compute a1 =  c1 + a
 6515         vs_addv(vs3, __ T4S, vs1, vs2);
 6516         // compute a2 =  c1 - a
 6517         vs_subv(vs1, __ T4S, vs1, vs2);
 6518         // output a1 and a2
 6519         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6520         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6521 
 6522         int k = 4 * level + i;
 6523 
 6524         if (k > 7) {
 6525           startIncr = 256;
 6526         } else if (k == 5) {
 6527           startIncr = 384;
 6528         } else {
 6529           startIncr = 128;
 6530         }
 6531 
 6532         c1Start += startIncr;
 6533         c2Start += startIncr;
 6534       }
 6535 
 6536       c2 /= 2;
 6537     }
 6538   }
 6539 
 6540   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6541   // Implements the method
 6542   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
 6543   // of the Java class sun.security.provider
 6544   //
 6545   // coeffs (int[256]) = c_rarg0
 6546   // zetas (int[256]) = c_rarg1
 6547   address generate_dilithiumAlmostNtt() {
 6548 
 6549     __ align(CodeEntryAlignment);
 6550     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 6551     StubCodeMark mark(this, stub_id);
 6552     address start = __ pc();
 6553     __ enter();
 6554 
 6555     const Register coeffs = c_rarg0;
 6556     const Register zetas = c_rarg1;
 6557 
 6558     const Register tmpAddr = r9;
 6559     const Register dilithiumConsts = r10;
 6560     const Register result = r11;
 6561     // don't use callee save registers v8 - v15
 6562     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6563     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6564     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6565     int offsets[4] = { 0, 32, 64, 96};
 6566     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6567     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6568     __ add(result, coeffs, 0);
 6569     __ lea(dilithiumConsts,
 6570              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6571 
 6572     // Each level represents one iteration of the outer for loop of the Java version.
 6573 
 6574     // level 0-4
 6575     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6576 
 6577     // level 5
 6578 
 6579     // At level 5 the coefficients we need to combine with the zetas
 6580     // are grouped in memory in blocks of size 4. So, for both sets of
 6581     // coefficients we load 4 adjacent values at 8 different offsets
 6582     // using an indexed ldr with register variant Q and multiply them
 6583     // in sequence order by the next set of inputs. Likewise we store
 6584     // the results using an indexed str with register variant Q.
 6585     for (int i = 0; i < 1024; i += 256) {
 6586       // reload constants q, qinv each iteration as they get clobbered later
 6587       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6588       // load 32 (8x4S) coefficients via first offsets = c1
 6589       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6590       // load next 32 (8x4S) inputs = b
 6591       vs_ldpq_post(vs2, zetas);
 6592       // a = b montmul c1
 6593       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6594       // load 32 (8x4S) coefficients via second offsets = c2
 6595       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6596       // add/sub with result of multiply
 6597       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
 6598       vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 6599       // write back new coefficients using same offsets
 6600       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6601       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6602     }
 6603 
 6604     // level 6
 6605     // At level 6 the coefficients we need to combine with the zetas
 6606     // are grouped in memory in pairs, the first two being montmul
 6607     // inputs and the second add/sub inputs. We can still implement
 6608     // the montmul+sub+add using 4-way parallelism but only if we
 6609     // combine the coefficients with the zetas 16 at a time. We load 8
 6610     // adjacent values at 4 different offsets using an ld2 load with
 6611     // arrangement 2D. That interleaves the lower and upper halves of
 6612     // each pair of quadwords into successive vector registers. We
 6613     // then need to montmul the 4 even elements of the coefficients
 6614     // register sequence by the zetas in order and then add/sub the 4
 6615     // odd elements of the coefficients register sequence. We use an
 6616     // equivalent st2 operation to store the results back into memory
 6617     // de-interleaved.
 6618     for (int i = 0; i < 1024; i += 128) {
 6619       // reload constants q, qinv each iteration as they get clobbered later
 6620       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6621       // load interleaved 16 (4x2D) coefficients via offsets
 6622       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6623       // load next 16 (4x4S) inputs
 6624       vs_ldpq_post(vs_front(vs2), zetas);
 6625       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6626       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6627                                   vs_front(vs2), vtmp, vq);
 6628       // store interleaved 16 (4x2D) coefficients via offsets
 6629       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6630     }
 6631 
 6632     // level 7
 6633     // At level 7 the coefficients we need to combine with the zetas
    // occur singly with montmul inputs alternating with add/sub
 6635     // inputs. Once again we can use 4-way parallelism to combine 16
 6636     // zetas at a time. However, we have to load 8 adjacent values at
 6637     // 4 different offsets using an ld2 load with arrangement 4S. That
    // interleaves the odd words of each pair into one
 6639     // coefficients vector register and the even words of the pair
 6640     // into the next register. We then need to montmul the 4 even
 6641     // elements of the coefficients register sequence by the zetas in
 6642     // order and then add/sub the 4 odd elements of the coefficients
 6643     // register sequence. We use an equivalent st2 operation to store
 6644     // the results back into memory de-interleaved.
 6645 
 6646     for (int i = 0; i < 1024; i += 128) {
 6647       // reload constants q, qinv each iteration as they get clobbered later
 6648       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6649       // load interleaved 16 (4x4S) coefficients via offsets
 6650       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6651       // load next 16 (4x4S) inputs
 6652       vs_ldpq_post(vs_front(vs2), zetas);
 6653       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6654       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6655                                   vs_front(vs2), vtmp, vq);
 6656       // store interleaved 16 (4x4S) coefficients via offsets
 6657       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6658     }
 6659     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6660     __ mov(r0, zr); // return 0
 6661     __ ret(lr);
 6662 
 6663     return start;
 6664   }
 6665 
 6666   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6667   // in the Java implementation come in sequences of at least 8, so we
 6668   // can use ldpq to collect the corresponding data into pairs of vector
  // registers.
  // We collect the coefficients that correspond to the 'j's into vs1
  // and the coefficients that correspond to the 'j+l's into vs2, then
  // do the additions into vs3 and the subtractions into vs1, then
  // save the result of the additions, load the zetas into vs2,
  // do the (Montgomery) multiplications by zeta in parallel into vs2
  // and finally save the results back to the coeffs array.
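  // For reference, per element this is (roughly) the scalar inverse
  // butterfly of the Java loop:
  //   int t         = coeffs[j];
  //   coeffs[j]     = t + coeffs[j + l];
  //   coeffs[j + l] = montmul(t - coeffs[j + l], zeta);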
 6676   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6677     const Register coeffs, const Register zetas) {
 6678     int c1 = 0;
 6679     int c2 = 32;
 6680     int startIncr;
 6681     int offsets[4];
 6682     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6683     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6684     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6685 
 6686     offsets[0] = 0;
 6687 
 6688     for (int level = 3; level < 8; level++) {
 6689       int c1Start = c1;
 6690       int c2Start = c2;
 6691       if (level == 3) {
 6692         offsets[1] = 64;
 6693         offsets[2] = 128;
 6694         offsets[3] = 192;
 6695       } else if (level == 4) {
 6696         offsets[1] = 32;
 6697         offsets[2] = 128;
 6698         offsets[3] = 160;
 6699       } else {
 6700         offsets[1] = 32;
 6701         offsets[2] = 64;
 6702         offsets[3] = 96;
 6703       }
 6704 
 6705       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6706       // time at 4 different offsets and multiply them in order by the
 6707       // next set of input values. So we employ indexed load and store
 6708       // pair instructions with arrangement 4S.
 6709       for (int i = 0; i < 4; i++) {
 6710         // load v1 32 (8x4S) coefficients relative to first start index
 6711         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6712         // load v2 32 (8x4S) coefficients relative to second start index
 6713         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
        // a0 = v1 + v2 -- n.b. clobbers vq which overlaps vs3
 6715         vs_addv(vs3, __ T4S, vs1, vs2);
 6716         // a1 = v1 - v2
 6717         vs_subv(vs1, __ T4S, vs1, vs2);
        // save a0 relative to first start index
 6719         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6720         // load constants q, qinv each iteration as they get clobbered above
 6721         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6722         // load b next 32 (8x4S) inputs
 6723         vs_ldpq_post(vs2, zetas);
 6724         // a = a1 montmul b
 6725         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6726         // save a relative to second start index
 6727         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6728 
 6729         int k = 4 * level + i;
 6730 
 6731         if (k < 24) {
 6732           startIncr = 256;
 6733         } else if (k == 25) {
 6734           startIncr = 384;
 6735         } else {
 6736           startIncr = 128;
 6737         }
 6738 
 6739         c1Start += startIncr;
 6740         c2Start += startIncr;
 6741       }
 6742 
 6743       c2 *= 2;
 6744     }
 6745   }
 6746 
 6747   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6748   // Implements the method
 6749   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6750   // the sun.security.provider.ML_DSA class.
 6751   //
 6752   // coeffs (int[256]) = c_rarg0
 6753   // zetas (int[256]) = c_rarg1
 6754   address generate_dilithiumAlmostInverseNtt() {
 6755 
 6756     __ align(CodeEntryAlignment);
 6757     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 6758     StubCodeMark mark(this, stub_id);
 6759     address start = __ pc();
 6760     __ enter();
 6761 
 6762     const Register coeffs = c_rarg0;
 6763     const Register zetas = c_rarg1;
 6764 
 6765     const Register tmpAddr = r9;
 6766     const Register dilithiumConsts = r10;
 6767     const Register result = r11;
 6768     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6769     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 6770     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6771     int offsets[4] = { 0, 32, 64, 96 };
 6772     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6773     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6774 
 6775     __ add(result, coeffs, 0);
 6776     __ lea(dilithiumConsts,
 6777              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6778 
 6779     // Each level represents one iteration of the outer for loop of the Java version
 6780 
 6781     // level 0
 6782     // At level 0 we need to interleave adjacent quartets of
 6783     // coefficients before we multiply and add/sub by the next 16
 6784     // zetas just as we did for level 7 in the multiply code. So we
 6785     // load and store the values using an ld2/st2 with arrangement 4S.
 6786     for (int i = 0; i < 1024; i += 128) {
 6787       // load constants q, qinv
      // n.b. this load can be moved out of the loop as the constants do
      // not get clobbered by the first two loops
 6790       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6791       // a0/a1 load interleaved 32 (8x4S) coefficients
 6792       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6793       // b load next 32 (8x4S) inputs
 6794       vs_ldpq_post(vs_front(vs2), zetas);
 6795       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6796       // n.b. second half of vs2 provides temporary register storage
 6797       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6798                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6799       // a0/a1 store interleaved 32 (8x4S) coefficients
 6800       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6801     }
 6802 
 6803     // level 1
 6804     // At level 1 we need to interleave pairs of adjacent pairs of
 6805     // coefficients before we multiply by the next 16 zetas just as we
 6806     // did for level 6 in the multiply code. So we load and store the
    // values using an ld2/st2 with arrangement 2D.
 6808     for (int i = 0; i < 1024; i += 128) {
 6809       // a0/a1 load interleaved 32 (8x2D) coefficients
 6810       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6811       // b load next 16 (4x4S) inputs
 6812       vs_ldpq_post(vs_front(vs2), zetas);
 6813       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6814       // n.b. second half of vs2 provides temporary register storage
 6815       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6816                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6817       // a0/a1 store interleaved 32 (8x2D) coefficients
 6818       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6819     }
 6820 
 6821     // level 2
 6822     // At level 2 coefficients come in blocks of 4. So, we load 4
 6823     // adjacent coefficients at 8 distinct offsets for both the first
 6824     // and second coefficient sequences, using an ldr with register
 6825     // variant Q then combine them with next set of 32 zetas. Likewise
 6826     // we store the results using an str with register variant Q.
 6827     for (int i = 0; i < 1024; i += 256) {
 6828       // c0 load 32 (8x4S) coefficients via first offsets
 6829       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6830       // c1 load 32 (8x4S) coefficients via second offsets
      vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6832       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6833       vs_addv(vs3, __ T4S, vs1, vs2);
 6834       // c = c0 - c1
 6835       vs_subv(vs1, __ T4S, vs1, vs2);
 6836       // store a0 32 (8x4S) coefficients via first offsets
 6837       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6838       // b load 32 (8x4S) next inputs
 6839       vs_ldpq_post(vs2, zetas);
 6840       // reload constants q, qinv -- they were clobbered earlier
 6841       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6842       // compute a1 = b montmul c
 6843       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6844       // store a1 32 (8x4S) coefficients via second offsets
 6845       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6846     }
 6847 
 6848     // level 3-7
 6849     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6850 
 6851     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6852     __ mov(r0, zr); // return 0
 6853     __ ret(lr);
 6854 
 6855     return start;
 6856   }
 6857 
 6858   // Dilithium multiply polynomials in the NTT domain.
 6859   // Straightforward implementation of the method
 6860   // static int implDilithiumNttMult(
  //              int[] result, int[] ntta, int[] nttb) {} of
 6862   // the sun.security.provider.ML_DSA class.
 6863   //
 6864   // result (int[256]) = c_rarg0
 6865   // poly1 (int[256]) = c_rarg1
 6866   // poly2 (int[256]) = c_rarg2
 6867   address generate_dilithiumNttMult() {
 6868 
    __ align(CodeEntryAlignment);
 6870     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 6871     StubCodeMark mark(this, stub_id);
 6872     address start = __ pc();
 6873     __ enter();
 6874 
 6875     Label L_loop;
 6876 
 6877     const Register result = c_rarg0;
 6878     const Register poly1 = c_rarg1;
 6879     const Register poly2 = c_rarg2;
 6880 
 6881     const Register dilithiumConsts = r10;
 6882     const Register len = r11;
 6883 
 6884     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6885     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6886     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6887     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6888 
 6889     __ lea(dilithiumConsts,
 6890              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6891 
 6892     // load constants q, qinv
 6893     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6894     // load constant rSquare into v29
 6895     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
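    // n.b. montmul(x, y) yields x * y * R^-1 (mod q) for the Montgomery
    // radix R, so the second montmul by rSquare (R^2 mod q) below cancels
    // the leftover R^-1 factor: the net effect per element is the plain
    // modular product poly1[i] * poly2[i] (mod q).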
 6896 
 6897     __ mov(len, zr);
 6898     __ add(len, len, 1024);
 6899 
 6900     __ BIND(L_loop);
 6901 
 6902     // b load 32 (8x4S) next inputs from poly1
 6903     vs_ldpq_post(vs1, poly1);
 6904     // c load 32 (8x4S) next inputs from poly2
 6905     vs_ldpq_post(vs2, poly2);
 6906     // compute a = b montmul c
 6907     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6908     // compute a = rsquare montmul a
 6909     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6910     // save a 32 (8x4S) results
 6911     vs_stpq_post(vs2, result);
 6912 
 6913     __ sub(len, len, 128);
 6914     __ cmp(len, (u1)128);
 6915     __ br(Assembler::GE, L_loop);
 6916 
 6917     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6918     __ mov(r0, zr); // return 0
 6919     __ ret(lr);
 6920 
 6921     return start;
 6922   }
 6923 
  // Dilithium Montgomery multiply an array by a constant.
 6925   // A straightforward implementation of the method
 6926   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  // of the sun.security.provider.ML_DSA class
 6928   //
 6929   // coeffs (int[256]) = c_rarg0
 6930   // constant (int) = c_rarg1
 6931   address generate_dilithiumMontMulByConstant() {
 6932 
 6933     __ align(CodeEntryAlignment);
 6934     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 6935     StubCodeMark mark(this, stub_id);
 6936     address start = __ pc();
 6937     __ enter();
 6938 
 6939     Label L_loop;
 6940 
 6941     const Register coeffs = c_rarg0;
 6942     const Register constant = c_rarg1;
 6943 
 6944     const Register dilithiumConsts = r10;
 6945     const Register result = r11;
 6946     const Register len = r12;
 6947 
 6948     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6949     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6950     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6951     VSeq<8> vconst(29, 0);             // for montmul by constant
 6952 
 6953     // results track inputs
 6954     __ add(result, coeffs, 0);
 6955     __ lea(dilithiumConsts,
 6956              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6957 
    // load constants q, qinv -- they do not get clobbered by the loop below
 6959     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6960     // copy caller supplied constant across vconst
 6961     __ dup(vconst[0], __ T4S, constant);
 6962     __ mov(len, zr);
 6963     __ add(len, len, 1024);
 6964 
 6965     __ BIND(L_loop);
 6966 
 6967     // load next 32 inputs
 6968     vs_ldpq_post(vs2, coeffs);
 6969     // mont mul by constant
 6970     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6971     // write next 32 results
 6972     vs_stpq_post(vs2, result);
 6973 
 6974     __ sub(len, len, 128);
 6975     __ cmp(len, (u1)128);
 6976     __ br(Assembler::GE, L_loop);
 6977 
 6978     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6979     __ mov(r0, zr); // return 0
 6980     __ ret(lr);
 6981 
 6982     return start;
 6983   }
 6984 
 6985   // Dilithium decompose poly.
 6986   // Implements the method
  // static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
  //              int[] highPart, int twoGamma2, int multiplier) {}
 6988   // of the sun.security.provider.ML_DSA class
 6989   //
 6990   // input (int[256]) = c_rarg0
 6991   // lowPart (int[256]) = c_rarg1
 6992   // highPart (int[256]) = c_rarg2
 6993   // twoGamma2  (int) = c_rarg3
 6994   // multiplier (int) = c_rarg4
 6995   address generate_dilithiumDecomposePoly() {
 6996 
 6997     __ align(CodeEntryAlignment);
 6998     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 6999     StubCodeMark mark(this, stub_id);
 7000     address start = __ pc();
 7001     Label L_loop;
 7002 
 7003     const Register input = c_rarg0;
 7004     const Register lowPart = c_rarg1;
 7005     const Register highPart = c_rarg2;
 7006     const Register twoGamma2 = c_rarg3;
 7007     const Register multiplier = c_rarg4;
 7008 
 7009     const Register len = r9;
 7010     const Register dilithiumConsts = r10;
 7011     const Register tmp = r11;
 7012 
 7013     // 6 independent sets of 4x4s values
 7014     VSeq<4> vs1(0), vs2(4), vs3(8);
 7015     VSeq<4> vs4(12), vs5(16), vtmp(20);
 7016 
 7017     // 7 constants for cross-multiplying
 7018     VSeq<4> one(25, 0);
 7019     VSeq<4> qminus1(26, 0);
 7020     VSeq<4> g2(27, 0);
 7021     VSeq<4> twog2(28, 0);
 7022     VSeq<4> mult(29, 0);
 7023     VSeq<4> q(30, 0);
 7024     VSeq<4> qadd(31, 0);
 7025 
 7026     __ enter();
 7027 
 7028     __ lea(dilithiumConsts,
 7029              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7030 
 7031     // save callee-saved registers
 7032     __ stpd(v8, v9, __ pre(sp, -64));
 7033     __ stpd(v10, v11, Address(sp, 16));
 7034     __ stpd(v12, v13, Address(sp, 32));
 7035     __ stpd(v14, v15, Address(sp, 48));
 7036 
 7037     // populate constant registers
 7038     __ mov(tmp, zr);
 7039     __ add(tmp, tmp, 1);
 7040     __ dup(one[0], __ T4S, tmp); // 1
 7041     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 7042     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 7043     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
    __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma2 reduce
 7045     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 7046     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 7047 
 7048     __ mov(len, zr);
 7049     __ add(len, len, 1024);
 7050 
 7051     __ BIND(L_loop);
 7052 
 7053     // load next 4x4S inputs interleaved: rplus --> vs1
 7054     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7055 
 7056     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7057     vs_addv(vtmp, __ T4S, vs1, qadd);
 7058     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7059     vs_mulv(vtmp, __ T4S, vtmp, q);
 7060     vs_subv(vs1, __ T4S, vs1, vtmp);
 7061 
 7062     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7063     vs_sshr(vtmp, __ T4S, vs1, 31);
 7064     vs_andr(vtmp, vtmp, q);
 7065     vs_addv(vs1, __ T4S, vs1, vtmp);
 7066 
 7067     // quotient --> vs2
 7068     // int quotient = (rplus * multiplier) >> 22;
 7069     vs_mulv(vtmp, __ T4S, vs1, mult);
 7070     vs_sshr(vs2, __ T4S, vtmp, 22);
 7071 
 7072     // r0 --> vs3
 7073     // int r0 = rplus - quotient * twoGamma2;
 7074     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7075     vs_subv(vs3, __ T4S, vs1, vtmp);
 7076 
 7077     // mask --> vs4
 7078     // int mask = (twoGamma2 - r0) >> 22;
 7079     vs_subv(vtmp, __ T4S, twog2, vs3);
 7080     vs_sshr(vs4, __ T4S, vtmp, 22);
 7081 
 7082     // r0 -= (mask & twoGamma2);
 7083     vs_andr(vtmp, vs4, twog2);
 7084     vs_subv(vs3, __ T4S, vs3, vtmp);
 7085 
 7086     //  quotient += (mask & 1);
 7087     vs_andr(vtmp, vs4, one);
 7088     vs_addv(vs2, __ T4S, vs2, vtmp);
 7089 
 7090     // mask = (twoGamma2 / 2 - r0) >> 31;
 7091     vs_subv(vtmp, __ T4S, g2, vs3);
 7092     vs_sshr(vs4, __ T4S, vtmp, 31);
 7093 
 7094     // r0 -= (mask & twoGamma2);
 7095     vs_andr(vtmp, vs4, twog2);
 7096     vs_subv(vs3, __ T4S, vs3, vtmp);
 7097 
 7098     // quotient += (mask & 1);
 7099     vs_andr(vtmp, vs4, one);
 7100     vs_addv(vs2, __ T4S, vs2, vtmp);
 7101 
 7102     // r1 --> vs5
 7103     // int r1 = rplus - r0 - (dilithium_q - 1);
 7104     vs_subv(vtmp, __ T4S, vs1, vs3);
 7105     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7106 
 7107     // r1 --> vs1 (overwriting rplus)
 7108     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7109     vs_negr(vtmp, __ T4S, vs5);
 7110     vs_orr(vtmp, vs5, vtmp);
 7111     vs_sshr(vs1, __ T4S, vtmp, 31);
 7112 
 7113     // r0 += ~r1;
 7114     vs_notr(vtmp, vs1);
 7115     vs_addv(vs3, __ T4S, vs3, vtmp);
 7116 
 7117     // r1 = r1 & quotient;
 7118     vs_andr(vs1, vs2, vs1);
 7119 
    // store results interleaved
 7121     // lowPart[m] = r0;
 7122     // highPart[m] = r1;
 7123     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7124     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7125 
 7126     __ sub(len, len, 64);
 7127     __ cmp(len, (u1)64);
 7128     __ br(Assembler::GE, L_loop);
 7129 
 7130     // restore callee-saved vector registers
 7131     __ ldpd(v14, v15, Address(sp, 48));
 7132     __ ldpd(v12, v13, Address(sp, 32));
 7133     __ ldpd(v10, v11, Address(sp, 16));
 7134     __ ldpd(v8, v9, __ post(sp, 64));
 7135 
 7136     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7137     __ mov(r0, zr); // return 0
 7138     __ ret(lr);
 7139 
 7140     return start;
 7141   }
 7142 
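  // bcax5 applies the Keccak chi mapping to a group of five lanes:
  //   a[i] ^= ~a[i + 1] & a[i + 2]   (indices mod 5)
  // using three temporaries so that every lane is read before it is
  // overwritten.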
 7143   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7144              Register tmp0, Register tmp1, Register tmp2) {
 7145     __ bic(tmp0, a2, a1); // for a0
 7146     __ bic(tmp1, a3, a2); // for a1
 7147     __ bic(tmp2, a4, a3); // for a2
 7148     __ eor(a2, a2, tmp2);
 7149     __ bic(tmp2, a0, a4); // for a3
 7150     __ eor(a3, a3, tmp2);
 7151     __ bic(tmp2, a1, a0); // for a4
 7152     __ eor(a0, a0, tmp0);
 7153     __ eor(a1, a1, tmp1);
 7154     __ eor(a4, a4, tmp2);
 7155   }
 7156 
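  // Performs one round of Keccak-f[1600] with all 25 lanes held in
  // general purpose registers: theta (the eor3/rax1 block computing the
  // column parities and xoring the rotated parities back in), rho and pi
  // (the block of rotates), chi (bcax5 on each group of five lanes) and
  // iota (xor of the next round constant, loaded post-incremented from
  // rc, into lane a0).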
 7157   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7158                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7159                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7160                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7161                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7162                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7163                         Register tmp0, Register tmp1, Register tmp2) {
 7164     __ eor3(tmp1, a4, a9, a14);
 7165     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7166     __ eor3(tmp2, a1, a6, a11);
 7167     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7168     __ rax1(tmp2, tmp0, tmp1); // d0
 7169     {
 7170 
 7171       Register tmp3, tmp4;
 7172       if (can_use_fp && can_use_r18) {
 7173         tmp3 = rfp;
 7174         tmp4 = r18_tls;
 7175       } else {
 7176         tmp3 = a4;
 7177         tmp4 = a9;
 7178         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7179       }
 7180 
 7181       __ eor3(tmp3, a0, a5, a10);
 7182       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7183       __ eor(a0, a0, tmp2);
 7184       __ eor(a5, a5, tmp2);
 7185       __ eor(a10, a10, tmp2);
 7186       __ eor(a15, a15, tmp2);
 7187       __ eor(a20, a20, tmp2); // d0(tmp2)
 7188       __ eor3(tmp3, a2, a7, a12);
 7189       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7190       __ rax1(tmp3, tmp4, tmp2); // d1
 7191       __ eor(a1, a1, tmp3);
 7192       __ eor(a6, a6, tmp3);
 7193       __ eor(a11, a11, tmp3);
 7194       __ eor(a16, a16, tmp3);
 7195       __ eor(a21, a21, tmp3); // d1(tmp3)
 7196       __ rax1(tmp3, tmp2, tmp0); // d3
 7197       __ eor3(tmp2, a3, a8, a13);
 7198       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7199       __ eor(a3, a3, tmp3);
 7200       __ eor(a8, a8, tmp3);
 7201       __ eor(a13, a13, tmp3);
 7202       __ eor(a18, a18, tmp3);
 7203       __ eor(a23, a23, tmp3);
 7204       __ rax1(tmp2, tmp1, tmp0); // d2
 7205       __ eor(a2, a2, tmp2);
 7206       __ eor(a7, a7, tmp2);
 7207       __ eor(a12, a12, tmp2);
 7208       __ rax1(tmp0, tmp0, tmp4); // d4
 7209       if (!can_use_fp || !can_use_r18) {
 7210         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7211       }
 7212       __ eor(a17, a17, tmp2);
 7213       __ eor(a22, a22, tmp2);
 7214       __ eor(a4, a4, tmp0);
 7215       __ eor(a9, a9, tmp0);
 7216       __ eor(a14, a14, tmp0);
 7217       __ eor(a19, a19, tmp0);
 7218       __ eor(a24, a24, tmp0);
 7219     }
 7220 
 7221     __ rol(tmp0, a10, 3);
 7222     __ rol(a10, a1, 1);
 7223     __ rol(a1, a6, 44);
 7224     __ rol(a6, a9, 20);
 7225     __ rol(a9, a22, 61);
 7226     __ rol(a22, a14, 39);
 7227     __ rol(a14, a20, 18);
 7228     __ rol(a20, a2, 62);
 7229     __ rol(a2, a12, 43);
 7230     __ rol(a12, a13, 25);
    __ rol(a13, a19, 8);
 7232     __ rol(a19, a23, 56);
 7233     __ rol(a23, a15, 41);
 7234     __ rol(a15, a4, 27);
 7235     __ rol(a4, a24, 14);
 7236     __ rol(a24, a21, 2);
 7237     __ rol(a21, a8, 55);
 7238     __ rol(a8, a16, 45);
 7239     __ rol(a16, a5, 36);
 7240     __ rol(a5, a3, 28);
 7241     __ rol(a3, a18, 21);
 7242     __ rol(a18, a17, 15);
 7243     __ rol(a17, a11, 10);
 7244     __ rol(a11, a7, 6);
 7245     __ mov(a7, tmp0);
 7246 
 7247     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7248     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7249     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7250     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7251     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7252 
 7253     __ ldr(tmp1, __ post(rc, 8));
 7254     __ eor(a0, a0, tmp1);
 7255 
 7256   }
 7257 
 7258   // Arguments:
 7259   //
 7260   // Inputs:
 7261   //   c_rarg0   - byte[]  source+offset
 7262   //   c_rarg1   - byte[]  SHA.state
 7263   //   c_rarg2   - int     block_size
 7264   //   c_rarg3   - int     offset
 7265   //   c_rarg4   - int     limit
 7266   //
 7267   address generate_sha3_implCompress_gpr(StubId stub_id) {
 7268     bool multi_block;
 7269     switch (stub_id) {
 7270     case StubId::stubgen_sha3_implCompress_id:
 7271       multi_block = false;
 7272       break;
 7273     case StubId::stubgen_sha3_implCompressMB_id:
 7274       multi_block = true;
 7275       break;
 7276     default:
 7277       ShouldNotReachHere();
 7278     }
 7279 
 7280     static const uint64_t round_consts[24] = {
 7281       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 7282       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 7283       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 7284       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 7285       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 7286       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 7287       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 7288       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 7289     };
 7290 
 7291     __ align(CodeEntryAlignment);
 7292     StubCodeMark mark(this, stub_id);
 7293     address start = __ pc();
 7294 
 7295     Register buf           = c_rarg0;
 7296     Register state         = c_rarg1;
 7297     Register block_size    = c_rarg2;
 7298     Register ofs           = c_rarg3;
 7299     Register limit         = c_rarg4;
 7300 
    // use r3..r17, r19..r28 to keep a0..a24.
 7302     // a0..a24 are respective locals from SHA3.java
 7303     Register a0 = r25,
 7304              a1 = r26,
 7305              a2 = r27,
 7306              a3 = r3,
 7307              a4 = r4,
 7308              a5 = r5,
 7309              a6 = r6,
 7310              a7 = r7,
 7311              a8 = rscratch1, // r8
 7312              a9 = rscratch2, // r9
 7313              a10 = r10,
 7314              a11 = r11,
 7315              a12 = r12,
 7316              a13 = r13,
 7317              a14 = r14,
 7318              a15 = r15,
 7319              a16 = r16,
 7320              a17 = r17,
 7321              a18 = r28,
 7322              a19 = r19,
 7323              a20 = r20,
 7324              a21 = r21,
 7325              a22 = r22,
 7326              a23 = r23,
 7327              a24 = r24;
 7328 
 7329     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7330 
 7331     Label sha3_loop, rounds24_preloop, loop_body;
 7332     Label sha3_512_or_sha3_384, shake128;
 7333 
 7334     bool can_use_r18 = false;
 7335 #ifndef R18_RESERVED
 7336     can_use_r18 = true;
 7337 #endif
 7338     bool can_use_fp = !PreserveFramePointer;
 7339 
 7340     __ enter();
 7341 
 7342     // save almost all yet unsaved gpr registers on stack
 7343     __ str(block_size, __ pre(sp, -128));
 7344     if (multi_block) {
 7345       __ stpw(ofs, limit, Address(sp, 8));
 7346     }
 7347     // 8 bytes at sp+16 will be used to keep buf
 7348     __ stp(r19, r20, Address(sp, 32));
 7349     __ stp(r21, r22, Address(sp, 48));
 7350     __ stp(r23, r24, Address(sp, 64));
 7351     __ stp(r25, r26, Address(sp, 80));
 7352     __ stp(r27, r28, Address(sp, 96));
 7353     if (can_use_r18 && can_use_fp) {
 7354       __ stp(r18_tls, state, Address(sp, 112));
 7355     } else {
 7356       __ str(state, Address(sp, 112));
 7357     }
 7358 
    // begin sha3 calculations: loading a0..a24 from state array
 7360     __ ldp(a0, a1, state);
 7361     __ ldp(a2, a3, Address(state, 16));
 7362     __ ldp(a4, a5, Address(state, 32));
 7363     __ ldp(a6, a7, Address(state, 48));
 7364     __ ldp(a8, a9, Address(state, 64));
 7365     __ ldp(a10, a11, Address(state, 80));
 7366     __ ldp(a12, a13, Address(state, 96));
 7367     __ ldp(a14, a15, Address(state, 112));
 7368     __ ldp(a16, a17, Address(state, 128));
 7369     __ ldp(a18, a19, Address(state, 144));
 7370     __ ldp(a20, a21, Address(state, 160));
 7371     __ ldp(a22, a23, Address(state, 176));
 7372     __ ldr(a24, Address(state, 192));
 7373 
 7374     __ BIND(sha3_loop);
 7375 
 7376     // load input
 7377     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7378     __ eor(a0, a0, tmp3);
 7379     __ eor(a1, a1, tmp2);
 7380     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7381     __ eor(a2, a2, tmp3);
 7382     __ eor(a3, a3, tmp2);
 7383     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7384     __ eor(a4, a4, tmp3);
 7385     __ eor(a5, a5, tmp2);
 7386     __ ldr(tmp3, __ post(buf, 8));
 7387     __ eor(a6, a6, tmp3);
 7388 
 7389     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7390     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7391 
 7392     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7393     __ eor(a7, a7, tmp3);
 7394     __ eor(a8, a8, tmp2);
 7395     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7396     __ eor(a9, a9, tmp3);
 7397     __ eor(a10, a10, tmp2);
 7398     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7399     __ eor(a11, a11, tmp3);
 7400     __ eor(a12, a12, tmp2);
 7401     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7402     __ eor(a13, a13, tmp3);
 7403     __ eor(a14, a14, tmp2);
 7404     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7405     __ eor(a15, a15, tmp3);
 7406     __ eor(a16, a16, tmp2);
 7407 
 7408     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7409     __ andw(tmp2, block_size, 48);
 7410     __ cbzw(tmp2, rounds24_preloop);
 7411     __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
 7413     __ ldr(tmp3, __ post(buf, 8));
 7414     __ eor(a17, a17, tmp3);
 7415     __ b(rounds24_preloop);
 7416 
 7417     __ BIND(shake128);
 7418     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7419     __ eor(a17, a17, tmp3);
 7420     __ eor(a18, a18, tmp2);
 7421     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7422     __ eor(a19, a19, tmp3);
 7423     __ eor(a20, a20, tmp2);
 7424     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7425 
 7426     __ BIND(sha3_512_or_sha3_384);
 7427     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7428     __ eor(a7, a7, tmp3);
 7429     __ eor(a8, a8, tmp2);
 7430     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7431 
 7432     // SHA3-384
 7433     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7434     __ eor(a9, a9, tmp3);
 7435     __ eor(a10, a10, tmp2);
 7436     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7437     __ eor(a11, a11, tmp3);
 7438     __ eor(a12, a12, tmp2);
 7439 
 7440     __ BIND(rounds24_preloop);
 7441     __ fmovs(v0, 24.0); // float loop counter,
 7442     __ fmovs(v1, 1.0);  // exact representation
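    // n.b. the general purpose registers are all occupied by the state
    // and temporaries, so the 24-round loop count is kept in v0 as a
    // float; 24.0, 1.0 and every intermediate difference are exactly
    // representable, so the fcmps against 0.0 below is exact.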
 7443 
 7444     __ str(buf, Address(sp, 16));
 7445     __ lea(tmp3, ExternalAddress((address) round_consts));
 7446 
 7447     __ BIND(loop_body);
 7448     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7449                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7450                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7451                      tmp0, tmp1, tmp2);
 7452     __ fsubs(v0, v0, v1);
 7453     __ fcmps(v0, 0.0);
 7454     __ br(__ NE, loop_body);
 7455 
 7456     if (multi_block) {
 7457       __ ldrw(block_size, sp); // block_size
 7458       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7459       __ addw(tmp2, tmp2, block_size);
 7460       __ cmpw(tmp2, tmp1);
 7461       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7462       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7463       __ br(Assembler::LE, sha3_loop);
 7464       __ movw(c_rarg0, tmp2); // return offset
 7465     }
 7466     if (can_use_fp && can_use_r18) {
 7467       __ ldp(r18_tls, state, Address(sp, 112));
 7468     } else {
 7469       __ ldr(state, Address(sp, 112));
 7470     }
 7471     // save calculated sha3 state
 7472     __ stp(a0, a1, Address(state));
 7473     __ stp(a2, a3, Address(state, 16));
 7474     __ stp(a4, a5, Address(state, 32));
 7475     __ stp(a6, a7, Address(state, 48));
 7476     __ stp(a8, a9, Address(state, 64));
 7477     __ stp(a10, a11, Address(state, 80));
 7478     __ stp(a12, a13, Address(state, 96));
 7479     __ stp(a14, a15, Address(state, 112));
 7480     __ stp(a16, a17, Address(state, 128));
 7481     __ stp(a18, a19, Address(state, 144));
 7482     __ stp(a20, a21, Address(state, 160));
 7483     __ stp(a22, a23, Address(state, 176));
 7484     __ str(a24, Address(state, 192));
 7485 
 7486     // restore required registers from stack
 7487     __ ldp(r19, r20, Address(sp, 32));
 7488     __ ldp(r21, r22, Address(sp, 48));
 7489     __ ldp(r23, r24, Address(sp, 64));
 7490     __ ldp(r25, r26, Address(sp, 80));
 7491     __ ldp(r27, r28, Address(sp, 96));
 7492     if (can_use_fp && can_use_r18) {
 7493       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 7494     } // else no need to recalculate rfp, since it wasn't changed
 7495 
 7496     __ leave();
 7497 
 7498     __ ret(lr);
 7499 
 7500     return start;
 7501   }
 7502 
 7503   /**
 7504    *  Arguments:
 7505    *
 7506    * Inputs:
 7507    *   c_rarg0   - int crc
 7508    *   c_rarg1   - byte* buf
 7509    *   c_rarg2   - int length
 7510    *
 7511    * Output:
   *       r0   - int crc result
 7513    */
 7514   address generate_updateBytesCRC32() {
 7515     assert(UseCRC32Intrinsics, "what are we doing here?");
 7516 
 7517     __ align(CodeEntryAlignment);
 7518     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 7519     StubCodeMark mark(this, stub_id);
 7520 
 7521     address start = __ pc();
 7522 
 7523     const Register crc   = c_rarg0;  // crc
 7524     const Register buf   = c_rarg1;  // source java byte array address
 7525     const Register len   = c_rarg2;  // length
 7526     const Register table0 = c_rarg3; // crc_table address
 7527     const Register table1 = c_rarg4;
 7528     const Register table2 = c_rarg5;
 7529     const Register table3 = c_rarg6;
 7530     const Register tmp3 = c_rarg7;
 7531 
 7532     BLOCK_COMMENT("Entry:");
 7533     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7534 
 7535     __ kernel_crc32(crc, buf, len,
 7536               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7537 
 7538     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7539     __ ret(lr);
 7540 
 7541     return start;
 7542   }
 7543 
 7544   /**
 7545    *  Arguments:
 7546    *
 7547    * Inputs:
 7548    *   c_rarg0   - int crc
 7549    *   c_rarg1   - byte* buf
 7550    *   c_rarg2   - int length
 7551    *   c_rarg3   - int* table
 7552    *
 7553    * Output:
 7554    *       r0   - int crc result
 7555    */
 7556   address generate_updateBytesCRC32C() {
 7557     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7558 
 7559     __ align(CodeEntryAlignment);
 7560     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 7561     StubCodeMark mark(this, stub_id);
 7562 
 7563     address start = __ pc();
 7564 
 7565     const Register crc   = c_rarg0;  // crc
 7566     const Register buf   = c_rarg1;  // source java byte array address
 7567     const Register len   = c_rarg2;  // length
 7568     const Register table0 = c_rarg3; // crc_table address
 7569     const Register table1 = c_rarg4;
 7570     const Register table2 = c_rarg5;
 7571     const Register table3 = c_rarg6;
 7572     const Register tmp3 = c_rarg7;
 7573 
 7574     BLOCK_COMMENT("Entry:");
 7575     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7576 
 7577     __ kernel_crc32c(crc, buf, len,
 7578               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7579 
 7580     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7581     __ ret(lr);
 7582 
 7583     return start;
 7584   }
 7585 
 7586   /***
 7587    *  Arguments:
 7588    *
 7589    *  Inputs:
 7590    *   c_rarg0   - int   adler
 7591    *   c_rarg1   - byte* buff
 7592    *   c_rarg2   - int   len
 7593    *
 7594    * Output:
 7595    *   c_rarg0   - int adler result
 7596    */
 7597   address generate_updateBytesAdler32() {
 7598     __ align(CodeEntryAlignment);
 7599     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 7600     StubCodeMark mark(this, stub_id);
 7601     address start = __ pc();
 7602 
 7603     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7604 
 7605     // Aliases
 7606     Register adler  = c_rarg0;
 7607     Register s1     = c_rarg0;
 7608     Register s2     = c_rarg3;
 7609     Register buff   = c_rarg1;
 7610     Register len    = c_rarg2;
 7611     Register nmax  = r4;
 7612     Register base  = r5;
 7613     Register count = r6;
 7614     Register temp0 = rscratch1;
 7615     Register temp1 = rscratch2;
 7616     FloatRegister vbytes = v0;
 7617     FloatRegister vs1acc = v1;
 7618     FloatRegister vs2acc = v2;
 7619     FloatRegister vtable = v3;
 7620 
 7621     // Max number of bytes we can process before having to take the mod
 7622     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7623     uint64_t BASE = 0xfff1;
 7624     uint64_t NMAX = 0x15B0;
 7625 
 7626     __ mov(base, BASE);
 7627     __ mov(nmax, NMAX);
 7628 
 7629     // Load accumulation coefficients for the upper 16 bits
 7630     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7631     __ ld1(vtable, __ T16B, Address(temp0));
 7632 
 7633     // s1 is initialized to the lower 16 bits of adler
 7634     // s2 is initialized to the upper 16 bits of adler
 7635     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7636     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7637 
 7638     // The pipelined loop needs at least 16 elements for 1 iteration
    // It does check this, but it is more efficient to skip to the cleanup loop
 7640     __ cmp(len, (u1)16);
 7641     __ br(Assembler::HS, L_nmax);
 7642     __ cbz(len, L_combine);
 7643 
 7644     __ bind(L_simple_by1_loop);
 7645     __ ldrb(temp0, Address(__ post(buff, 1)));
 7646     __ add(s1, s1, temp0);
 7647     __ add(s2, s2, s1);
 7648     __ subs(len, len, 1);
 7649     __ br(Assembler::HI, L_simple_by1_loop);
 7650 
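    // The reductions below avoid an integer division: BASE == 0xfff1, so
    // 2^16 == 15 (mod BASE) and x mod BASE can be computed by folding
    //   x = (x >> 16) * 15 + (x & 0xffff)   // 15 == (1 << 4) - 1
    // applied once or twice as needed, followed by a conditional
    // subtract of BASE.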
 7651     // s1 = s1 % BASE
 7652     __ subs(temp0, s1, base);
 7653     __ csel(s1, temp0, s1, Assembler::HS);
 7654 
 7655     // s2 = s2 % BASE
 7656     __ lsr(temp0, s2, 16);
 7657     __ lsl(temp1, temp0, 4);
 7658     __ sub(temp1, temp1, temp0);
 7659     __ add(s2, temp1, s2, ext::uxth);
 7660 
 7661     __ subs(temp0, s2, base);
 7662     __ csel(s2, temp0, s2, Assembler::HS);
 7663 
 7664     __ b(L_combine);
 7665 
 7666     __ bind(L_nmax);
 7667     __ subs(len, len, nmax);
 7668     __ sub(count, nmax, 16);
 7669     __ br(Assembler::LO, L_by16);
 7670 
 7671     __ bind(L_nmax_loop);
 7672 
 7673     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7674                                       vbytes, vs1acc, vs2acc, vtable);
 7675 
 7676     __ subs(count, count, 16);
 7677     __ br(Assembler::HS, L_nmax_loop);
 7678 
 7679     // s1 = s1 % BASE
 7680     __ lsr(temp0, s1, 16);
 7681     __ lsl(temp1, temp0, 4);
 7682     __ sub(temp1, temp1, temp0);
 7683     __ add(temp1, temp1, s1, ext::uxth);
 7684 
 7685     __ lsr(temp0, temp1, 16);
 7686     __ lsl(s1, temp0, 4);
 7687     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 7689 
 7690     __ subs(temp0, s1, base);
 7691     __ csel(s1, temp0, s1, Assembler::HS);
 7692 
 7693     // s2 = s2 % BASE
 7694     __ lsr(temp0, s2, 16);
 7695     __ lsl(temp1, temp0, 4);
 7696     __ sub(temp1, temp1, temp0);
 7697     __ add(temp1, temp1, s2, ext::uxth);
 7698 
 7699     __ lsr(temp0, temp1, 16);
 7700     __ lsl(s2, temp0, 4);
 7701     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 7703 
 7704     __ subs(temp0, s2, base);
 7705     __ csel(s2, temp0, s2, Assembler::HS);
 7706 
 7707     __ subs(len, len, nmax);
 7708     __ sub(count, nmax, 16);
 7709     __ br(Assembler::HS, L_nmax_loop);
 7710 
 7711     __ bind(L_by16);
 7712     __ adds(len, len, count);
 7713     __ br(Assembler::LO, L_by1);
 7714 
 7715     __ bind(L_by16_loop);
 7716 
 7717     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7718                                       vbytes, vs1acc, vs2acc, vtable);
 7719 
 7720     __ subs(len, len, 16);
 7721     __ br(Assembler::HS, L_by16_loop);
 7722 
 7723     __ bind(L_by1);
 7724     __ adds(len, len, 15);
 7725     __ br(Assembler::LO, L_do_mod);
 7726 
 7727     __ bind(L_by1_loop);
 7728     __ ldrb(temp0, Address(__ post(buff, 1)));
 7729     __ add(s1, temp0, s1);
 7730     __ add(s2, s2, s1);
 7731     __ subs(len, len, 1);
 7732     __ br(Assembler::HS, L_by1_loop);
 7733 
 7734     __ bind(L_do_mod);
 7735     // s1 = s1 % BASE
 7736     __ lsr(temp0, s1, 16);
 7737     __ lsl(temp1, temp0, 4);
 7738     __ sub(temp1, temp1, temp0);
 7739     __ add(temp1, temp1, s1, ext::uxth);
 7740 
 7741     __ lsr(temp0, temp1, 16);
 7742     __ lsl(s1, temp0, 4);
 7743     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 7745 
 7746     __ subs(temp0, s1, base);
 7747     __ csel(s1, temp0, s1, Assembler::HS);
 7748 
 7749     // s2 = s2 % BASE
 7750     __ lsr(temp0, s2, 16);
 7751     __ lsl(temp1, temp0, 4);
 7752     __ sub(temp1, temp1, temp0);
 7753     __ add(temp1, temp1, s2, ext::uxth);
 7754 
 7755     __ lsr(temp0, temp1, 16);
 7756     __ lsl(s2, temp0, 4);
 7757     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 7759 
 7760     __ subs(temp0, s2, base);
 7761     __ csel(s2, temp0, s2, Assembler::HS);
 7762 
 7763     // Combine lower bits and higher bits
 7764     __ bind(L_combine);
 7765     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7766 
 7767     __ ret(lr);
 7768 
 7769     return start;
 7770   }
 7771 
 7772   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7773           Register temp0, Register temp1, FloatRegister vbytes,
 7774           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7775     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7776     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7777     // In non-vectorized code, we update s1 and s2 as:
 7778     //   s1 <- s1 + b1
 7779     //   s2 <- s2 + s1
 7780     //   s1 <- s1 + b2
    //   s2 <- s2 + s1
 7782     //   ...
 7783     //   s1 <- s1 + b16
 7784     //   s2 <- s2 + s1
 7785     // Putting above assignments together, we have:
 7786     //   s1_new = s1 + b1 + b2 + ... + b16
 7787     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7788     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7789     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 7790     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7791 
 7792     // s2 = s2 + s1 * 16
 7793     __ add(s2, s2, s1, Assembler::LSL, 4);
 7794 
 7795     // vs1acc = b1 + b2 + b3 + ... + b16
 7796     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7797     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7798     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7799     __ uaddlv(vs1acc, __ T16B, vbytes);
 7800     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7801 
 7802     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7803     __ fmovd(temp0, vs1acc);
 7804     __ fmovd(temp1, vs2acc);
 7805     __ add(s1, s1, temp0);
 7806     __ add(s2, s2, temp1);
 7807   }
 7808 
 7809   /**
 7810    *  Arguments:
 7811    *
 7812    *  Input:
 7813    *    c_rarg0   - x address
 7814    *    c_rarg1   - x length
 7815    *    c_rarg2   - y address
 7816    *    c_rarg3   - y length
 7817    *    c_rarg4   - z address
 7818    */
 7819   address generate_multiplyToLen() {
 7820     __ align(CodeEntryAlignment);
 7821     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 7822     StubCodeMark mark(this, stub_id);
 7823 
 7824     address start = __ pc();
 7825     const Register x     = r0;
 7826     const Register xlen  = r1;
 7827     const Register y     = r2;
 7828     const Register ylen  = r3;
 7829     const Register z     = r4;
 7830 
 7831     const Register tmp0  = r5;
 7832     const Register tmp1  = r10;
 7833     const Register tmp2  = r11;
 7834     const Register tmp3  = r12;
 7835     const Register tmp4  = r13;
 7836     const Register tmp5  = r14;
 7837     const Register tmp6  = r15;
 7838     const Register tmp7  = r16;
 7839 
 7840     BLOCK_COMMENT("Entry:");
 7841     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7842     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7843     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7844     __ ret(lr);
 7845 
 7846     return start;
 7847   }
 7848 
 7849   address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127 described in the Java code
    // runs faster than multiply_to_len on some CPUs and slower on others,
    // but multiply_to_len shows slightly better results overall.
 7853     __ align(CodeEntryAlignment);
 7854     StubId stub_id = StubId::stubgen_squareToLen_id;
 7855     StubCodeMark mark(this, stub_id);
 7856     address start = __ pc();
 7857 
 7858     const Register x     = r0;
 7859     const Register xlen  = r1;
 7860     const Register z     = r2;
 7861     const Register y     = r4; // == x
 7862     const Register ylen  = r5; // == xlen
 7863 
 7864     const Register tmp0  = r3;
 7865     const Register tmp1  = r10;
 7866     const Register tmp2  = r11;
 7867     const Register tmp3  = r12;
 7868     const Register tmp4  = r13;
 7869     const Register tmp5  = r14;
 7870     const Register tmp6  = r15;
 7871     const Register tmp7  = r16;
 7872 
 7873     RegSet spilled_regs = RegSet::of(y, ylen);
 7874     BLOCK_COMMENT("Entry:");
 7875     __ enter();
 7876     __ push(spilled_regs, sp);
 7877     __ mov(y, x);
 7878     __ mov(ylen, xlen);
 7879     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7880     __ pop(spilled_regs, sp);
 7881     __ leave();
 7882     __ ret(lr);
 7883     return start;
 7884   }
 7885 
 7886   address generate_mulAdd() {
 7887     __ align(CodeEntryAlignment);
 7888     StubId stub_id = StubId::stubgen_mulAdd_id;
 7889     StubCodeMark mark(this, stub_id);
 7890 
 7891     address start = __ pc();
 7892 
 7893     const Register out     = r0;
 7894     const Register in      = r1;
 7895     const Register offset  = r2;
 7896     const Register len     = r3;
 7897     const Register k       = r4;
 7898 
 7899     BLOCK_COMMENT("Entry:");
 7900     __ enter();
 7901     __ mul_add(out, in, offset, len, k);
 7902     __ leave();
 7903     __ ret(lr);
 7904 
 7905     return start;
 7906   }
 7907 
 7908   // Arguments:
 7909   //
 7910   // Input:
 7911   //   c_rarg0   - newArr address
 7912   //   c_rarg1   - oldArr address
 7913   //   c_rarg2   - newIdx
 7914   //   c_rarg3   - shiftCount
 7915   //   c_rarg4   - numIter
 7916   //
 7917   address generate_bigIntegerRightShift() {
 7918     __ align(CodeEntryAlignment);
 7919     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 7920     StubCodeMark mark(this, stub_id);
 7921     address start = __ pc();
 7922 
 7923     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7924 
 7925     Register newArr        = c_rarg0;
 7926     Register oldArr        = c_rarg1;
 7927     Register newIdx        = c_rarg2;
 7928     Register shiftCount    = c_rarg3;
 7929     Register numIter       = c_rarg4;
 7930     Register idx           = numIter;
 7931 
 7932     Register newArrCur     = rscratch1;
 7933     Register shiftRevCount = rscratch2;
 7934     Register oldArrCur     = r13;
 7935     Register oldArrNext    = r14;
 7936 
 7937     FloatRegister oldElem0        = v0;
 7938     FloatRegister oldElem1        = v1;
 7939     FloatRegister newElem         = v2;
 7940     FloatRegister shiftVCount     = v3;
 7941     FloatRegister shiftVRevCount  = v4;
 7942 
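    // Each output word combines two adjacent input words:
    //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
    //                      | (oldArr[i] << (32 - shiftCount))
    // The SIMD loop below produces four such words per iteration, working
    // down from the top of the arrays; the scalar code handles any
    // remaining words.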
 7943     __ cbz(idx, Exit);
 7944 
 7945     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7946 
 7947     // left shift count
 7948     __ movw(shiftRevCount, 32);
 7949     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7950 
    // numIter too small to allow a 4-word SIMD loop; fall back to scalar code
 7952     __ cmp(numIter, (u1)4);
 7953     __ br(Assembler::LT, ShiftThree);
 7954 
 7955     __ dup(shiftVCount,    __ T4S, shiftCount);
 7956     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7957     __ negr(shiftVCount,   __ T4S, shiftVCount);
 7958 
 7959     __ BIND(ShiftSIMDLoop);
 7960 
 7961     // Calculate the load addresses
 7962     __ sub(idx, idx, 4);
 7963     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7964     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7965     __ add(oldArrCur,  oldArrNext, 4);
 7966 
 7967     // Load 4 words and process
 7968     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7969     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7970     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7971     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7972     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7973     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7974 
 7975     __ cmp(idx, (u1)4);
 7976     __ br(Assembler::LT, ShiftTwoLoop);
 7977     __ b(ShiftSIMDLoop);
 7978 
 7979     __ BIND(ShiftTwoLoop);
 7980     __ cbz(idx, Exit);
 7981     __ cmp(idx, (u1)1);
 7982     __ br(Assembler::EQ, ShiftOne);
 7983 
 7984     // Calculate the load addresses
 7985     __ sub(idx, idx, 2);
 7986     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7987     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7988     __ add(oldArrCur,  oldArrNext, 4);
 7989 
 7990     // Load 2 words and process
 7991     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 7992     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 7993     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 7994     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 7995     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 7996     __ st1(newElem,   __ T2S, Address(newArrCur));
 7997     __ b(ShiftTwoLoop);
 7998 
 7999     __ BIND(ShiftThree);
 8000     __ tbz(idx, 1, ShiftOne);
 8001     __ tbz(idx, 0, ShiftTwo);
 8002     __ ldrw(r10,  Address(oldArr, 12));
 8003     __ ldrw(r11,  Address(oldArr, 8));
 8004     __ lsrvw(r10, r10, shiftCount);
 8005     __ lslvw(r11, r11, shiftRevCount);
 8006     __ orrw(r12,  r10, r11);
 8007     __ strw(r12,  Address(newArr, 8));
 8008 
 8009     __ BIND(ShiftTwo);
 8010     __ ldrw(r10,  Address(oldArr, 8));
 8011     __ ldrw(r11,  Address(oldArr, 4));
 8012     __ lsrvw(r10, r10, shiftCount);
 8013     __ lslvw(r11, r11, shiftRevCount);
 8014     __ orrw(r12,  r10, r11);
 8015     __ strw(r12,  Address(newArr, 4));
 8016 
 8017     __ BIND(ShiftOne);
 8018     __ ldrw(r10,  Address(oldArr, 4));
 8019     __ ldrw(r11,  Address(oldArr));
 8020     __ lsrvw(r10, r10, shiftCount);
 8021     __ lslvw(r11, r11, shiftRevCount);
 8022     __ orrw(r12,  r10, r11);
 8023     __ strw(r12,  Address(newArr));
 8024 
 8025     __ BIND(Exit);
 8026     __ ret(lr);
 8027 
 8028     return start;
 8029   }
 8030 
 8031   // Arguments:
 8032   //
 8033   // Input:
 8034   //   c_rarg0   - newArr address
 8035   //   c_rarg1   - oldArr address
 8036   //   c_rarg2   - newIdx
 8037   //   c_rarg3   - shiftCount
 8038   //   c_rarg4   - numIter
 8039   //
 8040   address generate_bigIntegerLeftShift() {
 8041     __ align(CodeEntryAlignment);
 8042     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 8043     StubCodeMark mark(this, stub_id);
 8044     address start = __ pc();
 8045 
 8046     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8047 
 8048     Register newArr        = c_rarg0;
 8049     Register oldArr        = c_rarg1;
 8050     Register newIdx        = c_rarg2;
 8051     Register shiftCount    = c_rarg3;
 8052     Register numIter       = c_rarg4;
 8053 
 8054     Register shiftRevCount = rscratch1;
 8055     Register oldArrNext    = rscratch2;
 8056 
 8057     FloatRegister oldElem0        = v0;
 8058     FloatRegister oldElem1        = v1;
 8059     FloatRegister newElem         = v2;
 8060     FloatRegister shiftVCount     = v3;
 8061     FloatRegister shiftVRevCount  = v4;
 8062 
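    // Each output word combines two adjacent input words:
    //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
    //                      | (oldArr[i + 1] >>> (32 - shiftCount))
    // processed from the low end upwards, four words per SIMD iteration,
    // with the scalar code below handling any remaining words.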
 8063     __ cbz(numIter, Exit);
 8064 
 8065     __ add(oldArrNext, oldArr, 4);
 8066     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8067 
 8068     // right shift count
 8069     __ movw(shiftRevCount, 32);
 8070     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8071 
    // numIter too small to allow a 4-word SIMD loop; fall back to scalar code
 8073     __ cmp(numIter, (u1)4);
 8074     __ br(Assembler::LT, ShiftThree);
 8075 
 8076     __ dup(shiftVCount,     __ T4S, shiftCount);
 8077     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8078     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8079 
 8080     __ BIND(ShiftSIMDLoop);
 8081 
 8082     // load 4 words and process
 8083     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8084     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8085     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8086     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8087     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8088     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8089     __ sub(numIter,   numIter, 4);
 8090 
 8091     __ cmp(numIter, (u1)4);
 8092     __ br(Assembler::LT, ShiftTwoLoop);
 8093     __ b(ShiftSIMDLoop);
 8094 
 8095     __ BIND(ShiftTwoLoop);
 8096     __ cbz(numIter, Exit);
 8097     __ cmp(numIter, (u1)1);
 8098     __ br(Assembler::EQ, ShiftOne);
 8099 
 8100     // load 2 words and process
 8101     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8102     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8103     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8104     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8105     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8106     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8107     __ sub(numIter,   numIter, 2);
 8108     __ b(ShiftTwoLoop);
 8109 
 8110     __ BIND(ShiftThree);
 8111     __ ldrw(r10,  __ post(oldArr, 4));
 8112     __ ldrw(r11,  __ post(oldArrNext, 4));
 8113     __ lslvw(r10, r10, shiftCount);
 8114     __ lsrvw(r11, r11, shiftRevCount);
 8115     __ orrw(r12,  r10, r11);
 8116     __ strw(r12,  __ post(newArr, 4));
 8117     __ tbz(numIter, 1, Exit);
 8118     __ tbz(numIter, 0, ShiftOne);
 8119 
 8120     __ BIND(ShiftTwo);
 8121     __ ldrw(r10,  __ post(oldArr, 4));
 8122     __ ldrw(r11,  __ post(oldArrNext, 4));
 8123     __ lslvw(r10, r10, shiftCount);
 8124     __ lsrvw(r11, r11, shiftRevCount);
 8125     __ orrw(r12,  r10, r11);
 8126     __ strw(r12,  __ post(newArr, 4));
 8127 
 8128     __ BIND(ShiftOne);
 8129     __ ldrw(r10,  Address(oldArr));
 8130     __ ldrw(r11,  Address(oldArrNext));
 8131     __ lslvw(r10, r10, shiftCount);
 8132     __ lsrvw(r11, r11, shiftRevCount);
 8133     __ orrw(r12,  r10, r11);
 8134     __ strw(r12,  Address(newArr));
 8135 
 8136     __ BIND(Exit);
 8137     __ ret(lr);
 8138 
 8139     return start;
 8140   }
 8141 
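  // A sketch of the semantics (illustrative; this stub is assumed to back the
  // StringCoding.countPositives intrinsic): count leading bytes whose sign bit is
  // clear. When a negative byte is present the stub may return a smaller,
  // conservative count (see the RET_ADJUST paths below), never a larger one.
  //
  //   int count_positives(const signed char* ary, int len) {
  //     for (int i = 0; i < len; i++) {
  //       if (ary[i] < 0) return i;   // or any smaller non-negative value
  //     }
  //     return len;
  //   }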
 8142   address generate_count_positives(address &count_positives_long) {
 8143     const u1 large_loop_size = 64;
    const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
 8145     int dcache_line = VM_Version::dcache_line_size();
 8146 
 8147     Register ary1 = r1, len = r2, result = r0;
 8148 
 8149     __ align(CodeEntryAlignment);
 8150 
 8151     StubId stub_id = StubId::stubgen_count_positives_id;
 8152     StubCodeMark mark(this, stub_id);
 8153 
 8154     address entry = __ pc();
 8155 
 8156     __ enter();
 8157     // precondition: a copy of len is already in result
 8158     // __ mov(result, len);
 8159 
 8160   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8161         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8162 
 8163   __ cmp(len, (u1)15);
 8164   __ br(Assembler::GT, LEN_OVER_15);
  // The only case when execution falls into this code is when the pointer is
  // near the end of a memory page and we have to avoid reading the next page
 8167   __ add(ary1, ary1, len);
 8168   __ subs(len, len, 8);
 8169   __ br(Assembler::GT, LEN_OVER_8);
 8170   __ ldr(rscratch2, Address(ary1, -8));
 8171   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8172   __ lsrv(rscratch2, rscratch2, rscratch1);
 8173   __ tst(rscratch2, UPPER_BIT_MASK);
 8174   __ csel(result, zr, result, Assembler::NE);
 8175   __ leave();
 8176   __ ret(lr);
 8177   __ bind(LEN_OVER_8);
 8178   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
 8180   __ tst(rscratch2, UPPER_BIT_MASK);
 8181   __ br(Assembler::NE, RET_NO_POP);
 8182   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8183   __ lsrv(rscratch1, rscratch1, rscratch2);
 8184   __ tst(rscratch1, UPPER_BIT_MASK);
 8185   __ bind(RET_NO_POP);
 8186   __ csel(result, zr, result, Assembler::NE);
 8187   __ leave();
 8188   __ ret(lr);
 8189 
 8190   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8191   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8192 
 8193   count_positives_long = __ pc(); // 2nd entry point
 8194 
 8195   __ enter();
 8196 
 8197   __ bind(LEN_OVER_15);
 8198     __ push(spilled_regs, sp);
 8199     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8200     __ cbz(rscratch2, ALIGNED);
 8201     __ ldp(tmp6, tmp1, Address(ary1));
 8202     __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
 8204     __ add(ary1, ary1, rscratch1);
 8205     __ orr(tmp6, tmp6, tmp1);
 8206     __ tst(tmp6, UPPER_BIT_MASK);
 8207     __ br(Assembler::NE, RET_ADJUST);
 8208     __ sub(len, len, rscratch1);
 8209 
 8210   __ bind(ALIGNED);
 8211     __ cmp(len, large_loop_size);
 8212     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop to handle the
    // situation when an initially aligned large array has negative values at
    // its starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1
    // (in the worst case), which is slower. Cases with negative bytes further
    // ahead won't be affected much; in fact, they'll be faster due to the
    // early loads, fewer instructions and fewer branches in LARGE_LOOP.
 8219     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8220     __ sub(len, len, 16);
 8221     __ orr(tmp6, tmp6, tmp1);
 8222     __ tst(tmp6, UPPER_BIT_MASK);
 8223     __ br(Assembler::NE, RET_ADJUST_16);
 8224     __ cmp(len, large_loop_size);
 8225     __ br(Assembler::LT, CHECK_16);
 8226 
 8227     if (SoftwarePrefetchHintDistance >= 0
 8228         && SoftwarePrefetchHintDistance >= dcache_line) {
 8229       // initial prefetch
 8230       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8231     }
 8232   __ bind(LARGE_LOOP);
 8233     if (SoftwarePrefetchHintDistance >= 0) {
 8234       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8235     }
    // Issue the load instructions first, since that can save a few CPU/memory
    // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
    // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) +
    // 1 cbnz(...), which saves 3 instructions per iteration and has fewer
    // branches. However, this approach disables the early return, so all
    // 64 bytes are loaded and checked every time.
 8241     __ ldp(tmp2, tmp3, Address(ary1));
 8242     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8243     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8244     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8245     __ add(ary1, ary1, large_loop_size);
 8246     __ sub(len, len, large_loop_size);
 8247     __ orr(tmp2, tmp2, tmp3);
 8248     __ orr(tmp4, tmp4, tmp5);
 8249     __ orr(rscratch1, rscratch1, rscratch2);
 8250     __ orr(tmp6, tmp6, tmp1);
 8251     __ orr(tmp2, tmp2, tmp4);
 8252     __ orr(rscratch1, rscratch1, tmp6);
 8253     __ orr(tmp2, tmp2, rscratch1);
 8254     __ tst(tmp2, UPPER_BIT_MASK);
 8255     __ br(Assembler::NE, RET_ADJUST_LONG);
 8256     __ cmp(len, large_loop_size);
 8257     __ br(Assembler::GE, LARGE_LOOP);
 8258 
 8259   __ bind(CHECK_16); // small 16-byte load pre-loop
 8260     __ cmp(len, (u1)16);
 8261     __ br(Assembler::LT, POST_LOOP16);
 8262 
 8263   __ bind(LOOP16); // small 16-byte load loop
 8264     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8265     __ sub(len, len, 16);
 8266     __ orr(tmp2, tmp2, tmp3);
 8267     __ tst(tmp2, UPPER_BIT_MASK);
 8268     __ br(Assembler::NE, RET_ADJUST_16);
 8269     __ cmp(len, (u1)16);
 8270     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8271 
 8272   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8273     __ cmp(len, (u1)8);
 8274     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8275     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8276     __ tst(tmp3, UPPER_BIT_MASK);
 8277     __ br(Assembler::NE, RET_ADJUST);
 8278     __ sub(len, len, 8);
 8279 
 8280   __ bind(POST_LOOP16_LOAD_TAIL);
 8281     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8282     __ ldr(tmp1, Address(ary1));
 8283     __ mov(tmp2, 64);
 8284     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8285     __ lslv(tmp1, tmp1, tmp4);
 8286     __ tst(tmp1, UPPER_BIT_MASK);
 8287     __ br(Assembler::NE, RET_ADJUST);
 8288     // Fallthrough
 8289 
 8290   __ bind(RET_LEN);
 8291     __ pop(spilled_regs, sp);
 8292     __ leave();
 8293     __ ret(lr);
 8294 
    // The difference (result - len) is the count of bytes guaranteed
    // to be positive
 8297 
 8298   __ bind(RET_ADJUST_LONG);
 8299     __ add(len, len, (u1)(large_loop_size - 16));
 8300   __ bind(RET_ADJUST_16);
 8301     __ add(len, len, 16);
 8302   __ bind(RET_ADJUST);
 8303     __ pop(spilled_regs, sp);
 8304     __ leave();
 8305     __ sub(result, result, len);
 8306     __ ret(lr);
 8307 
 8308     return entry;
 8309   }
 8310 
 8311   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8312         bool usePrefetch, Label &NOT_EQUAL) {
 8313     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8314         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8315         tmp7 = r12, tmp8 = r13;
 8316     Label LOOP;
 8317 
 8318     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8319     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8320     __ bind(LOOP);
 8321     if (usePrefetch) {
 8322       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8323       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8324     }
 8325     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8326     __ eor(tmp1, tmp1, tmp2);
 8327     __ eor(tmp3, tmp3, tmp4);
 8328     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8329     __ orr(tmp1, tmp1, tmp3);
 8330     __ cbnz(tmp1, NOT_EQUAL);
 8331     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8332     __ eor(tmp5, tmp5, tmp6);
 8333     __ eor(tmp7, tmp7, tmp8);
 8334     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8335     __ orr(tmp5, tmp5, tmp7);
 8336     __ cbnz(tmp5, NOT_EQUAL);
 8337     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8338     __ eor(tmp1, tmp1, tmp2);
 8339     __ eor(tmp3, tmp3, tmp4);
 8340     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8341     __ orr(tmp1, tmp1, tmp3);
 8342     __ cbnz(tmp1, NOT_EQUAL);
 8343     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8344     __ eor(tmp5, tmp5, tmp6);
 8345     __ sub(cnt1, cnt1, 8 * wordSize);
 8346     __ eor(tmp7, tmp7, tmp8);
 8347     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
 8350     __ subs(tmp6, cnt1, loopThreshold);
 8351     __ orr(tmp5, tmp5, tmp7);
 8352     __ cbnz(tmp5, NOT_EQUAL);
 8353     __ br(__ GE, LOOP);
 8354     // post-loop
 8355     __ eor(tmp1, tmp1, tmp2);
 8356     __ eor(tmp3, tmp3, tmp4);
 8357     __ orr(tmp1, tmp1, tmp3);
 8358     __ sub(cnt1, cnt1, 2 * wordSize);
 8359     __ cbnz(tmp1, NOT_EQUAL);
 8360   }
 8361 
 8362   void generate_large_array_equals_loop_simd(int loopThreshold,
 8363         bool usePrefetch, Label &NOT_EQUAL) {
 8364     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8365         tmp2 = rscratch2;
 8366     Label LOOP;
 8367 
 8368     __ bind(LOOP);
 8369     if (usePrefetch) {
 8370       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8371       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8372     }
 8373     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8374     __ sub(cnt1, cnt1, 8 * wordSize);
 8375     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 8376     __ subs(tmp1, cnt1, loopThreshold);
 8377     __ eor(v0, __ T16B, v0, v4);
 8378     __ eor(v1, __ T16B, v1, v5);
 8379     __ eor(v2, __ T16B, v2, v6);
 8380     __ eor(v3, __ T16B, v3, v7);
 8381     __ orr(v0, __ T16B, v0, v1);
 8382     __ orr(v1, __ T16B, v2, v3);
 8383     __ orr(v0, __ T16B, v0, v1);
 8384     __ umov(tmp1, v0, __ D, 0);
 8385     __ umov(tmp2, v0, __ D, 1);
 8386     __ orr(tmp1, tmp1, tmp2);
 8387     __ cbnz(tmp1, NOT_EQUAL);
 8388     __ br(__ GE, LOOP);
 8389   }
 8390 
 8391   // a1 = r1 - array1 address
 8392   // a2 = r2 - array2 address
 8393   // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - number of elements left to check, reduced by wordSize
 8395   // r3-r5 are reserved temporary registers
 8396   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
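  //
  // A sketch of the semantics (illustrative; the exact calling convention is set up by
  // the inline code that calls this stub): the first wordSize bytes are assumed to have
  // been handled by the caller, and the stub answers whether the remaining bytes of the
  // two arrays are identical, conceptually
  //
  //   result = (memcmp(a1, a2, remaining_bytes) == 0);  // with pointers already advanced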
 8397   address generate_large_array_equals() {
 8398     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8399         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8400         tmp7 = r12, tmp8 = r13;
 8401     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 8402         SMALL_LOOP, POST_LOOP;
 8403     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // exit threshold chosen so that at least 32 of the prefetched bytes are used
 8405     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 8406     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 8407     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 8408     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 8409         tmp5, tmp6, tmp7, tmp8);
 8410 
 8411     __ align(CodeEntryAlignment);
 8412 
 8413     StubId stub_id = StubId::stubgen_large_array_equals_id;
 8414     StubCodeMark mark(this, stub_id);
 8415 
 8416     address entry = __ pc();
 8417     __ enter();
 8418     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 8419     // also advance pointers to use post-increment instead of pre-increment
 8420     __ add(a1, a1, wordSize);
 8421     __ add(a2, a2, wordSize);
 8422     if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD/non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a large penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so if needed we can do an extra
      // 8-byte load for the first address to make it 16-byte aligned.
 8428       Label ALIGNED16;
 8429       __ tbz(a1, 3, ALIGNED16);
 8430       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8431       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8432       __ sub(cnt1, cnt1, wordSize);
 8433       __ eor(tmp1, tmp1, tmp2);
 8434       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 8435       __ bind(ALIGNED16);
 8436     }
 8437     if (UseSIMDForArrayEquals) {
 8438       if (SoftwarePrefetchHintDistance >= 0) {
 8439         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8440         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8441         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 8442             /* prfm = */ true, NOT_EQUAL);
 8443         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8444         __ br(__ LT, TAIL);
 8445       }
 8446       __ bind(NO_PREFETCH_LARGE_LOOP);
 8447       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 8448           /* prfm = */ false, NOT_EQUAL);
 8449     } else {
 8450       __ push(spilled_regs, sp);
 8451       if (SoftwarePrefetchHintDistance >= 0) {
 8452         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8453         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8454         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 8455             /* prfm = */ true, NOT_EQUAL);
 8456         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8457         __ br(__ LT, TAIL);
 8458       }
 8459       __ bind(NO_PREFETCH_LARGE_LOOP);
 8460       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 8461           /* prfm = */ false, NOT_EQUAL);
 8462     }
 8463     __ bind(TAIL);
 8464       __ cbz(cnt1, EQUAL);
 8465       __ subs(cnt1, cnt1, wordSize);
 8466       __ br(__ LE, POST_LOOP);
 8467     __ bind(SMALL_LOOP);
 8468       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8469       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8470       __ subs(cnt1, cnt1, wordSize);
 8471       __ eor(tmp1, tmp1, tmp2);
 8472       __ cbnz(tmp1, NOT_EQUAL);
 8473       __ br(__ GT, SMALL_LOOP);
 8474     __ bind(POST_LOOP);
 8475       __ ldr(tmp1, Address(a1, cnt1));
 8476       __ ldr(tmp2, Address(a2, cnt1));
 8477       __ eor(tmp1, tmp1, tmp2);
 8478       __ cbnz(tmp1, NOT_EQUAL);
 8479     __ bind(EQUAL);
 8480       __ mov(result, true);
 8481     __ bind(NOT_EQUAL);
 8482       if (!UseSIMDForArrayEquals) {
 8483         __ pop(spilled_regs, sp);
 8484       }
 8485     __ bind(NOT_EQUAL_NO_POP);
 8486     __ leave();
 8487     __ ret(lr);
 8488     return entry;
 8489   }
 8490 
 8491   // result = r0 - return value. Contains initial hashcode value on entry.
 8492   // ary = r1 - array address
 8493   // cnt = r2 - elements count
 8494   // Clobbers: v0-v13, rscratch1, rscratch2
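  //
  // The stub vectorizes the usual 31-based polynomial hash (a sketch of the scalar loop
  // being accelerated; it is assumed to back ArraysSupport.vectorizedHashCode):
  //
  //   int hash = result;            // initial hashcode passed in r0
  //   for (int i = 0; i < cnt; i++) {
  //     hash = 31 * hash + ary[i];  // element widened according to eltype
  //   }
  //   return hash;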
 8495   address generate_large_arrays_hashcode(BasicType eltype) {
 8496     const Register result = r0, ary = r1, cnt = r2;
 8497     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 8498     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 8499     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 8500     const FloatRegister vpowm = v13;
 8501 
 8502     ARRAYS_HASHCODE_REGISTERS;
 8503 
 8504     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 8505 
 8506     unsigned int vf; // vectorization factor
 8507     bool multiply_by_halves;
 8508     Assembler::SIMD_Arrangement load_arrangement;
 8509     switch (eltype) {
 8510     case T_BOOLEAN:
 8511     case T_BYTE:
 8512       load_arrangement = Assembler::T8B;
 8513       multiply_by_halves = true;
 8514       vf = 8;
 8515       break;
 8516     case T_CHAR:
 8517     case T_SHORT:
 8518       load_arrangement = Assembler::T8H;
 8519       multiply_by_halves = true;
 8520       vf = 8;
 8521       break;
 8522     case T_INT:
 8523       load_arrangement = Assembler::T4S;
 8524       multiply_by_halves = false;
 8525       vf = 4;
 8526       break;
 8527     default:
 8528       ShouldNotReachHere();
 8529     }
 8530 
 8531     // Unroll factor
 8532     const unsigned uf = 4;
 8533 
 8534     // Effective vectorization factor
 8535     const unsigned evf = vf * uf;
 8536 
 8537     __ align(CodeEntryAlignment);
 8538 
 8539     StubId stub_id;
 8540     switch (eltype) {
 8541     case T_BOOLEAN:
 8542       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 8543       break;
 8544     case T_BYTE:
 8545       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 8546       break;
 8547     case T_CHAR:
 8548       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 8549       break;
 8550     case T_SHORT:
 8551       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 8552       break;
 8553     case T_INT:
 8554       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 8555       break;
 8556     default:
 8557       stub_id = StubId::NO_STUBID;
 8558       ShouldNotReachHere();
 8559     };
 8560 
 8561     StubCodeMark mark(this, stub_id);
 8562 
 8563     address entry = __ pc();
 8564     __ enter();
 8565 
    // Put the 0th to 3rd powers of 31 together into a single SIMD register. The register is
    // used in the SMALL and LARGE LOOPs' epilogues. The initialization is hoisted here and the
    // register's value shouldn't change throughout either loop.
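    // (Illustrative note: each 32-bit lane of the accumulators below gathers elements
    // whose final weights differ by consecutive powers of 31, so the epilogues multiply
    // the lanes element-wise by vpow = <31^3, 31^2, 31^1, 31^0> and then sum across the
    // vector to obtain the scalar hash.)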
 8569     __ movw(rscratch1, intpow(31U, 3));
 8570     __ mov(vpow, Assembler::S, 0, rscratch1);
 8571     __ movw(rscratch1, intpow(31U, 2));
 8572     __ mov(vpow, Assembler::S, 1, rscratch1);
 8573     __ movw(rscratch1, intpow(31U, 1));
 8574     __ mov(vpow, Assembler::S, 2, rscratch1);
 8575     __ movw(rscratch1, intpow(31U, 0));
 8576     __ mov(vpow, Assembler::S, 3, rscratch1);
 8577 
 8578     __ mov(vmul0, Assembler::T16B, 0);
 8579     __ mov(vmul0, Assembler::S, 3, result);
 8580 
 8581     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8582     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8583 
 8584     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8585     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8586 
 8587     // SMALL LOOP
 8588     __ bind(SMALL_LOOP);
 8589 
 8590     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8591     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8592     __ subsw(rscratch2, rscratch2, vf);
 8593 
 8594     if (load_arrangement == Assembler::T8B) {
 8595       // Extend 8B to 8H to be able to use vector multiply
 8596       // instructions
 8597       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8598       if (is_signed_subword_type(eltype)) {
 8599         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8600       } else {
 8601         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8602       }
 8603     }
 8604 
 8605     switch (load_arrangement) {
 8606     case Assembler::T4S:
 8607       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8608       break;
 8609     case Assembler::T8B:
 8610     case Assembler::T8H:
 8611       assert(is_subword_type(eltype), "subword type expected");
 8612       if (is_signed_subword_type(eltype)) {
 8613         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8614       } else {
 8615         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8616       }
 8617       break;
 8618     default:
 8619       __ should_not_reach_here();
 8620     }
 8621 
 8622     // Process the upper half of a vector
 8623     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8624       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8625       if (is_signed_subword_type(eltype)) {
 8626         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8627       } else {
 8628         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8629       }
 8630     }
 8631 
 8632     __ br(Assembler::HI, SMALL_LOOP);
 8633 
    // SMALL LOOP'S EPILOGUE
 8635     __ lsr(rscratch2, cnt, exact_log2(evf));
 8636     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8637 
 8638     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8639     __ addv(vmul0, Assembler::T4S, vmul0);
 8640     __ umov(result, vmul0, Assembler::S, 0);
 8641 
 8642     // TAIL
 8643     __ bind(TAIL);
 8644 
    // The andr computes cnt % vf. The subtract, scaled by the size of one load + madd pair,
    // offsets the branch target past vf - 1 - (cnt % vf) such pairs, i.e. only the last
    // cnt % vf load + madd pairs are executed.
 8647     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
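    // For example (illustrative, non-A53 case): with vf == 8 and cnt % vf == 3 the branch
    // lands 3 * 8 bytes before BR_BASE, so exactly the last 3 of the 7 emitted
    // load + madd pairs are executed.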
 8648     __ andr(rscratch2, cnt, vf - 1);
 8649     __ bind(TAIL_SHORTCUT);
 8650     __ adr(rscratch1, BR_BASE);
    // For Cortex-A53 the shift is 4 (16 bytes per pair) because 2 nops are generated per pair.
 8652     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 8653     __ movw(rscratch2, 0x1f);
 8654     __ br(rscratch1);
 8655 
 8656     for (size_t i = 0; i < vf - 1; ++i) {
 8657       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8658                                    eltype);
 8659       __ maddw(result, result, rscratch2, rscratch1);
 8660       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 8661       // Generate 2nd nop to have 4 instructions per iteration.
 8662       if (VM_Version::supports_a53mac()) {
 8663         __ nop();
 8664       }
 8665     }
 8666     __ bind(BR_BASE);
 8667 
 8668     __ leave();
 8669     __ ret(lr);
 8670 
 8671     // LARGE LOOP
 8672     __ bind(LARGE_LOOP_PREHEADER);
 8673 
 8674     __ lsr(rscratch2, cnt, exact_log2(evf));
 8675 
 8676     if (multiply_by_halves) {
 8677       // 31^4 - multiplier between lower and upper parts of a register
 8678       __ movw(rscratch1, intpow(31U, vf / 2));
 8679       __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 8681       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8682       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8683     } else {
 8684       // 31^16
 8685       __ movw(rscratch1, intpow(31U, evf));
 8686       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8687     }
 8688 
 8689     __ mov(vmul3, Assembler::T16B, 0);
 8690     __ mov(vmul2, Assembler::T16B, 0);
 8691     __ mov(vmul1, Assembler::T16B, 0);
 8692 
 8693     __ bind(LARGE_LOOP);
 8694 
 8695     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8696     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8697     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8698     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8699 
 8700     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8701            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8702 
 8703     if (load_arrangement == Assembler::T8B) {
 8704       // Extend 8B to 8H to be able to use vector multiply
 8705       // instructions
 8706       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8707       if (is_signed_subword_type(eltype)) {
 8708         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8709         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8710         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8711         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8712       } else {
 8713         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8714         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8715         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8716         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8717       }
 8718     }
 8719 
 8720     switch (load_arrangement) {
 8721     case Assembler::T4S:
 8722       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8723       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8724       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8725       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8726       break;
 8727     case Assembler::T8B:
 8728     case Assembler::T8H:
 8729       assert(is_subword_type(eltype), "subword type expected");
 8730       if (is_signed_subword_type(eltype)) {
 8731         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8732         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8733         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8734         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8735       } else {
 8736         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8737         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8738         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8739         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8740       }
 8741       break;
 8742     default:
 8743       __ should_not_reach_here();
 8744     }
 8745 
 8746     // Process the upper half of a vector
 8747     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8748       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8749       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8750       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8751       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8752       if (is_signed_subword_type(eltype)) {
 8753         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8754         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8755         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8756         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8757       } else {
 8758         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8759         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8760         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8761         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8762       }
 8763     }
 8764 
 8765     __ subsw(rscratch2, rscratch2, 1);
 8766     __ br(Assembler::HI, LARGE_LOOP);
 8767 
 8768     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8769     __ addv(vmul3, Assembler::T4S, vmul3);
 8770     __ umov(result, vmul3, Assembler::S, 0);
 8771 
 8772     __ mov(rscratch2, intpow(31U, vf));
 8773 
 8774     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8775     __ addv(vmul2, Assembler::T4S, vmul2);
 8776     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8777     __ maddw(result, result, rscratch2, rscratch1);
 8778 
 8779     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8780     __ addv(vmul1, Assembler::T4S, vmul1);
 8781     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8782     __ maddw(result, result, rscratch2, rscratch1);
 8783 
 8784     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8785     __ addv(vmul0, Assembler::T4S, vmul0);
 8786     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8787     __ maddw(result, result, rscratch2, rscratch1);
 8788 
 8789     __ andr(rscratch2, cnt, vf - 1);
 8790     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8791 
 8792     __ leave();
 8793     __ ret(lr);
 8794 
 8795     return entry;
 8796   }
 8797 
 8798   address generate_dsin_dcos(bool isCos) {
 8799     __ align(CodeEntryAlignment);
 8800     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 8801     StubCodeMark mark(this, stub_id);
 8802     address start = __ pc();
 8803     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8804         (address)StubRoutines::aarch64::_two_over_pi,
 8805         (address)StubRoutines::aarch64::_pio2,
 8806         (address)StubRoutines::aarch64::_dsin_coef,
 8807         (address)StubRoutines::aarch64::_dcos_coef);
 8808     return start;
 8809   }
 8810 
  // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
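  // A sketch of the approach: 16 Latin1 bytes are loaded into vtmp and inflated to
  // UTF-16 by interleaving them with a zero vector (zip1/zip2 with vtmpZ); the result
  // is compared 8 bytes (4 characters) at a time against the UTF-16 data loaded via
  // tmpU/tmp3, branching to DIFF1/DIFF2 on the first mismatch.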
 8812   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8813       Label &DIFF2) {
 8814     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8815     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8816 
 8817     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8818     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8819     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8820     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 8821 
 8822     __ fmovd(tmpL, vtmp3);
 8823     __ eor(rscratch2, tmp3, tmpL);
 8824     __ cbnz(rscratch2, DIFF2);
 8825 
 8826     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8827     __ umov(tmpL, vtmp3, __ D, 1);
 8828     __ eor(rscratch2, tmpU, tmpL);
 8829     __ cbnz(rscratch2, DIFF1);
 8830 
 8831     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8832     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8833     __ fmovd(tmpL, vtmp);
 8834     __ eor(rscratch2, tmp3, tmpL);
 8835     __ cbnz(rscratch2, DIFF2);
 8836 
 8837     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8838     __ umov(tmpL, vtmp, __ D, 1);
 8839     __ eor(rscratch2, tmpU, tmpL);
 8840     __ cbnz(rscratch2, DIFF1);
 8841   }
 8842 
 8843   // r0  = result
 8844   // r1  = str1
 8845   // r2  = cnt1
 8846   // r3  = str2
 8847   // r4  = cnt2
 8848   // r10 = tmp1
 8849   // r11 = tmp2
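  //
  // A sketch of the semantics (illustrative; the calling convention is assumed to be set
  // up by the inline string-compare code): compare cnt2 characters of the Latin1 and
  // UTF-16 strings and return the difference of the first pair of characters that differ.
  // If all compared characters match, whatever the caller left in result (assumed to
  // encode the length difference) is returned unchanged.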
 8850   address generate_compare_long_string_different_encoding(bool isLU) {
 8851     __ align(CodeEntryAlignment);
 8852     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 8853     StubCodeMark mark(this, stub_id);
 8854     address entry = __ pc();
 8855     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8856         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8857         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8858     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8859         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8860     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8861     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8862 
 8863     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8864 
 8865     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
 8868     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8869     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8870     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8871     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. The last 4 are a special case.
 8873     __ eor(rscratch2, tmp1, tmp2);
 8874     __ mov(rscratch1, tmp2);
 8875     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8876     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8877              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8878     __ push(spilled_regs, sp);
 8879     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8880     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8881 
 8882     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8883 
 8884     if (SoftwarePrefetchHintDistance >= 0) {
 8885       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8886       __ br(__ LT, NO_PREFETCH);
 8887       __ bind(LARGE_LOOP_PREFETCH);
 8888         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8889         __ mov(tmp4, 2);
 8890         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8891         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8892           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8893           __ subs(tmp4, tmp4, 1);
 8894           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8895           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8896           __ mov(tmp4, 2);
 8897         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8898           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8899           __ subs(tmp4, tmp4, 1);
 8900           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8901           __ sub(cnt2, cnt2, 64);
 8902           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8903           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8904     }
 8905     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8906     __ bind(NO_PREFETCH);
 8907     __ subs(cnt2, cnt2, 16);
 8908     __ br(__ LT, TAIL);
 8909     __ align(OptoLoopAlignment);
 8910     __ bind(SMALL_LOOP); // smaller loop
 8911       __ subs(cnt2, cnt2, 16);
 8912       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8913       __ br(__ GE, SMALL_LOOP);
 8914       __ cmn(cnt2, (u1)16);
 8915       __ br(__ EQ, LOAD_LAST);
 8916     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8917       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8918       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8919       __ ldr(tmp3, Address(cnt1, -8));
 8920       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8921       __ b(LOAD_LAST);
 8922     __ bind(DIFF2);
 8923       __ mov(tmpU, tmp3);
 8924     __ bind(DIFF1);
 8925       __ pop(spilled_regs, sp);
 8926       __ b(CALCULATE_DIFFERENCE);
 8927     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
      // No need to load them again.
 8930       __ mov(tmpU, tmp3);
 8931       __ pop(spilled_regs, sp);
 8932 
 8933       // tmp2 points to the address of the last 4 Latin1 characters right now
 8934       __ ldrs(vtmp, Address(tmp2));
 8935       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8936       __ fmovd(tmpL, vtmp);
 8937 
 8938       __ eor(rscratch2, tmpU, tmpL);
 8939       __ cbz(rscratch2, DONE);
 8940 
 8941     // Find the first different characters in the longwords and
 8942     // compute their difference.
 8943     __ bind(CALCULATE_DIFFERENCE);
 8944       __ rev(rscratch2, rscratch2);
 8945       __ clz(rscratch2, rscratch2);
 8946       __ andr(rscratch2, rscratch2, -16);
 8947       __ lsrv(tmp1, tmp1, rscratch2);
 8948       __ uxthw(tmp1, tmp1);
 8949       __ lsrv(rscratch1, rscratch1, rscratch2);
 8950       __ uxthw(rscratch1, rscratch1);
 8951       __ subw(result, tmp1, rscratch1);
 8952     __ bind(DONE);
 8953       __ ret(lr);
 8954     return entry;
 8955   }
 8956 
 8957   // r0 = input (float16)
 8958   // v0 = result (float)
 8959   // v1 = temporary float register
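  //
  // Semantically this is assumed to implement Float.float16ToFloat: the low 16 bits of
  // r0 are interpreted as an IEEE 754 binary16 value and widened to a binary32 result
  // returned in v0.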
 8960   address generate_float16ToFloat() {
 8961     __ align(CodeEntryAlignment);
 8962     StubId stub_id = StubId::stubgen_hf2f_id;
 8963     StubCodeMark mark(this, stub_id);
 8964     address entry = __ pc();
 8965     BLOCK_COMMENT("Entry:");
 8966     __ flt16_to_flt(v0, r0, v1);
 8967     __ ret(lr);
 8968     return entry;
 8969   }
 8970 
 8971   // v0 = input (float)
 8972   // r0 = result (float16)
 8973   // v1 = temporary float register
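  //
  // Semantically this is assumed to implement Float.floatToFloat16: the binary32 value
  // in v0 is narrowed to an IEEE 754 binary16 value returned in the low 16 bits of r0.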
 8974   address generate_floatToFloat16() {
 8975     __ align(CodeEntryAlignment);
 8976     StubId stub_id = StubId::stubgen_f2hf_id;
 8977     StubCodeMark mark(this, stub_id);
 8978     address entry = __ pc();
 8979     BLOCK_COMMENT("Entry:");
 8980     __ flt_to_flt16(r0, v0, v1);
 8981     __ ret(lr);
 8982     return entry;
 8983   }
 8984 
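  // nmethod entry barrier stub: spills the call-clobbered registers, calls
  // BarrierSetNMethod::nmethod_stub_entry_barrier with the address of the saved return
  // address, and then either continues into the nmethod (zero return value) or unwinds
  // the frame and continues at the {sp, fp, lr, pc} filled in by the runtime in the four
  // words reserved below.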
 8985   address generate_method_entry_barrier() {
 8986     __ align(CodeEntryAlignment);
 8987     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 8988     StubCodeMark mark(this, stub_id);
 8989 
 8990     Label deoptimize_label;
 8991 
 8992     address start = __ pc();
 8993 
 8994     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 8995 
 8996     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 8997       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 8998       // We can get here despite the nmethod being good, if we have not
 8999       // yet applied our cross modification fence (or data fence).
 9000       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 9001       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 9002       __ ldrw(rscratch2, rscratch2);
 9003       __ strw(rscratch2, thread_epoch_addr);
 9004       __ isb();
 9005       __ membar(__ LoadLoad);
 9006     }
 9007 
 9008     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 9009 
 9010     __ enter();
 9011     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 9012 
 9013     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 9014 
 9015     __ push_call_clobbered_registers();
 9016 
 9017     __ mov(c_rarg0, rscratch2);
 9018     __ call_VM_leaf
 9019          (CAST_FROM_FN_PTR
 9020           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 9021 
 9022     __ reset_last_Java_frame(true);
 9023 
 9024     __ mov(rscratch1, r0);
 9025 
 9026     __ pop_call_clobbered_registers();
 9027 
 9028     __ cbnz(rscratch1, deoptimize_label);
 9029 
 9030     __ leave();
 9031     __ ret(lr);
 9032 
 9033     __ BIND(deoptimize_label);
 9034 
 9035     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 9036     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 9037 
 9038     __ mov(sp, rscratch1);
 9039     __ br(rscratch2);
 9040 
 9041     return start;
 9042   }
 9043 
 9044   // r0  = result
 9045   // r1  = str1
 9046   // r2  = cnt1
 9047   // r3  = str2
 9048   // r4  = cnt2
 9049   // r10 = tmp1
 9050   // r11 = tmp2
 9051   address generate_compare_long_string_same_encoding(bool isLL) {
 9052     __ align(CodeEntryAlignment);
 9053     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 9054     StubCodeMark mark(this, stub_id);
 9055     address entry = __ pc();
 9056     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9057         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9058 
 9059     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9060 
    // exit the large loop when fewer than 64 bytes are left to read or we're
    // about to prefetch memory beyond the array bounds
 9063     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 9064 
    // 8 bytes were already pre-loaded before jumping to the stub, so compare them directly
 9066     __ eor(rscratch2, tmp1, tmp2);
 9067     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9068 
 9069     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9070     // update pointers, because of previous read
 9071     __ add(str1, str1, wordSize);
 9072     __ add(str2, str2, wordSize);
 9073     if (SoftwarePrefetchHintDistance >= 0) {
 9074       __ align(OptoLoopAlignment);
 9075       __ bind(LARGE_LOOP_PREFETCH);
 9076         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9077         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9078 
 9079         for (int i = 0; i < 4; i++) {
 9080           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9081           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9082           __ cmp(tmp1, tmp2);
 9083           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9084           __ br(Assembler::NE, DIFF);
 9085         }
 9086         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9087         __ add(str1, str1, 64);
 9088         __ add(str2, str2, 64);
 9089         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9090         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9091         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9092     }
 9093 
 9094     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9095     __ br(Assembler::LE, LESS16);
 9096     __ align(OptoLoopAlignment);
 9097     __ bind(LOOP_COMPARE16);
 9098       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9099       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9100       __ cmp(tmp1, tmp2);
 9101       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9102       __ br(Assembler::NE, DIFF);
 9103       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9104       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9105       __ br(Assembler::LT, LESS16);
 9106 
 9107       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9108       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9109       __ cmp(tmp1, tmp2);
 9110       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9111       __ br(Assembler::NE, DIFF);
 9112       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9113       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9114       __ br(Assembler::GE, LOOP_COMPARE16);
 9115       __ cbz(cnt2, LENGTH_DIFF);
 9116 
 9117     __ bind(LESS16);
      // compare the next 8 bytes if more than 8 bytes are left
 9119       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9120       __ br(Assembler::LE, LESS8);
 9121       __ ldr(tmp1, Address(__ post(str1, 8)));
 9122       __ ldr(tmp2, Address(__ post(str2, 8)));
 9123       __ eor(rscratch2, tmp1, tmp2);
 9124       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9125       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9126 
 9127     __ bind(LESS8); // directly load last 8 bytes
 9128       if (!isLL) {
 9129         __ add(cnt2, cnt2, cnt2);
 9130       }
 9131       __ ldr(tmp1, Address(str1, cnt2));
 9132       __ ldr(tmp2, Address(str2, cnt2));
 9133       __ eor(rscratch2, tmp1, tmp2);
 9134       __ cbz(rscratch2, LENGTH_DIFF);
 9135       __ b(CAL_DIFFERENCE);
 9136 
 9137     __ bind(DIFF);
 9138       __ cmp(tmp1, tmp2);
 9139       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9140       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9141       // reuse rscratch2 register for the result of eor instruction
 9142       __ eor(rscratch2, tmp1, tmp2);
 9143 
 9144     __ bind(CAL_DIFFERENCE);
 9145       __ rev(rscratch2, rscratch2);
 9146       __ clz(rscratch2, rscratch2);
 9147       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9148       __ lsrv(tmp1, tmp1, rscratch2);
 9149       __ lsrv(tmp2, tmp2, rscratch2);
 9150       if (isLL) {
 9151         __ uxtbw(tmp1, tmp1);
 9152         __ uxtbw(tmp2, tmp2);
 9153       } else {
 9154         __ uxthw(tmp1, tmp1);
 9155         __ uxthw(tmp2, tmp2);
 9156       }
 9157       __ subw(result, tmp1, tmp2);
 9158 
 9159     __ bind(LENGTH_DIFF);
 9160       __ ret(lr);
 9161     return entry;
 9162   }
 9163 
 9164   enum string_compare_mode {
 9165     LL,
 9166     LU,
 9167     UL,
 9168     UU,
 9169   };
 9170 
 9171   // The following registers are declared in aarch64.ad
 9172   // r0  = result
 9173   // r1  = str1
 9174   // r2  = cnt1
 9175   // r3  = str2
 9176   // r4  = cnt2
 9177   // r10 = tmp1
 9178   // r11 = tmp2
 9179   // z0  = ztmp1
 9180   // z1  = ztmp2
 9181   // p0  = pgtmp1
 9182   // p1  = pgtmp2
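  //
  // A sketch of the approach: the main loop processes vec_len characters per iteration
  // under a whilelt-generated predicate, loading each string with sve_ld1b/sve_ld1h as
  // dictated by its encoding; the final partial vector reuses the same code with a
  // predicate recomputed for the remaining characters, and on a mismatch brkb/lasta are
  // used to extract the first differing characters.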
 9183   address generate_compare_long_string_sve(string_compare_mode mode) {
 9184     StubId stub_id;
 9185     switch (mode) {
 9186       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9187       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9188       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9189       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9190       default: ShouldNotReachHere();
 9191     }
 9192 
 9193     __ align(CodeEntryAlignment);
 9194     address entry = __ pc();
 9195     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9196              tmp1 = r10, tmp2 = r11;
 9197 
 9198     Label LOOP, DONE, MISMATCH;
 9199     Register vec_len = tmp1;
 9200     Register idx = tmp2;
 9201     // The minimum of the string lengths has been stored in cnt2.
 9202     Register cnt = cnt2;
 9203     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9204     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9205 
 9206 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9207     switch (mode) {                                                            \
 9208       case LL:                                                                 \
 9209         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9210         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9211         break;                                                                 \
 9212       case LU:                                                                 \
 9213         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9214         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9215         break;                                                                 \
 9216       case UL:                                                                 \
 9217         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9218         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9219         break;                                                                 \
 9220       case UU:                                                                 \
 9221         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9222         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9223         break;                                                                 \
 9224       default:                                                                 \
 9225         ShouldNotReachHere();                                                  \
 9226     }
 9227 
 9228     StubCodeMark mark(this, stub_id);
 9229 
 9230     __ mov(idx, 0);
 9231     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9232 
 9233     if (mode == LL) {
 9234       __ sve_cntb(vec_len);
 9235     } else {
 9236       __ sve_cnth(vec_len);
 9237     }
 9238 
 9239     __ sub(rscratch1, cnt, vec_len);
 9240 
 9241     __ bind(LOOP);
 9242 
 9243       // main loop
 9244       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9245       __ add(idx, idx, vec_len);
 9246       // Compare strings.
 9247       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9248       __ br(__ NE, MISMATCH);
 9249       __ cmp(idx, rscratch1);
 9250       __ br(__ LT, LOOP);
 9251 
 9252     // post loop, last iteration
 9253     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9254 
 9255     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9256     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9257     __ br(__ EQ, DONE);
 9258 
 9259     __ bind(MISMATCH);
 9260 
    // Narrow the predicate to the elements before the first mismatch so it can be located.
 9262     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9263     // Extract the first different characters of each string.
 9264     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9265     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9266 
 9267     // Compute the difference of the first different characters.
 9268     __ sub(result, rscratch1, rscratch2);
 9269 
 9270     __ bind(DONE);
 9271     __ ret(lr);
 9272 #undef LOAD_PAIR
 9273     return entry;
 9274   }
 9275 
 9276   void generate_compare_long_strings() {
 9277     if (UseSVE == 0) {
 9278       StubRoutines::aarch64::_compare_long_string_LL
 9279           = generate_compare_long_string_same_encoding(true);
 9280       StubRoutines::aarch64::_compare_long_string_UU
 9281           = generate_compare_long_string_same_encoding(false);
 9282       StubRoutines::aarch64::_compare_long_string_LU
 9283           = generate_compare_long_string_different_encoding(true);
 9284       StubRoutines::aarch64::_compare_long_string_UL
 9285           = generate_compare_long_string_different_encoding(false);
 9286     } else {
 9287       StubRoutines::aarch64::_compare_long_string_LL
 9288           = generate_compare_long_string_sve(LL);
 9289       StubRoutines::aarch64::_compare_long_string_UU
 9290           = generate_compare_long_string_sve(UU);
 9291       StubRoutines::aarch64::_compare_long_string_LU
 9292           = generate_compare_long_string_sve(LU);
 9293       StubRoutines::aarch64::_compare_long_string_UL
 9294           = generate_compare_long_string_sve(UL);
 9295     }
 9296   }
 9297 
 9298   // R0 = result
 9299   // R1 = str2
 9300   // R2 = cnt1
 9301   // R3 = str1
 9302   // R4 = cnt2
 9303   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 9304   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
  // in order to skip the initial loading (helps on systems with 1 load pipeline)
  // 2) we can use the "fast" algorithm for finding a single character to search for
  // the first symbol with fewer branches (1 branch per loaded register instead of a
  // branch per symbol); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // (see the sketch of the zero-detection trick below)
  // 3) after loading and analyzing the 1st register of the source string, it can be
  // used to search for every occurrence of the 1st character, saving a few loads
  // compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
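  //
  // A sketch of the zero-detection trick behind idea (2), shown for the byte (Latin1)
  // case; the halfword (UTF-16) case is analogous with 0x0001...0001 and 0x7fff...7fff:
  //
  //   // v = loaded chunk XOR (first pattern character replicated into every byte);
  //   // a byte of v is zero exactly where the chunk matches the first character.
  //   uint64_t match_info = (v - 0x0101010101010101ULL) & ~(v | 0x7f7f7f7f7f7f7f7fULL);
  //   // match_info is non-zero iff some byte of v is zero; its lowest set bit marks the
  //   // first candidate match (higher bits may be false positives, which the
  //   // per-character comparison loop filters out).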
 9319   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 9320     StubId stub_id;
 9321     if (str1_isL) {
 9322       if (str2_isL) {
 9323         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
 9324       } else {
 9325         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
 9326       }
 9327     } else {
 9328       if (str2_isL) {
 9329         ShouldNotReachHere();
 9330       } else {
 9331         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
 9332       }
 9333     }
 9334     __ align(CodeEntryAlignment);
 9335     StubCodeMark mark(this, stub_id);
 9336     address entry = __ pc();
 9337 
 9338     int str1_chr_size = str1_isL ? 1 : 2;
 9339     int str2_chr_size = str2_isL ? 1 : 2;
 9340     int str1_chr_shift = str1_isL ? 0 : 1;
 9341     int str2_chr_shift = str2_isL ? 0 : 1;
 9342     bool isL = str1_isL && str2_isL;
    // parameters
 9344     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 9345     // temporary registers
 9346     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 9347     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 9348     // redefinitions
 9349     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 9350 
 9351     __ push(spilled_regs, sp);
 9352     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 9353         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 9354         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 9355         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 9356         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 9357         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 9358     // Read whole register from str1. It is safe, because length >=8 here
 9359     __ ldr(ch1, Address(str1));
 9360     // Read whole register from str2. It is safe, because length >=8 here
 9361     __ ldr(ch2, Address(str2));
 9362     __ sub(cnt2, cnt2, cnt1);
 9363     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 9364     if (str1_isL != str2_isL) {
 9365       __ eor(v0, __ T16B, v0, v0);
 9366     }
 9367     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 9368     __ mul(first, first, tmp1);
 9369     // check if we have less than 1 register to check
 9370     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 9371     if (str1_isL != str2_isL) {
 9372       __ fmovd(v1, ch1);
 9373     }
 9374     __ br(__ LE, L_SMALL);
 9375     __ eor(ch2, first, ch2);
 9376     if (str1_isL != str2_isL) {
 9377       __ zip1(v1, __ T16B, v1, v0);
 9378     }
 9379     __ sub(tmp2, ch2, tmp1);
 9380     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9381     __ bics(tmp2, tmp2, ch2);
 9382     if (str1_isL != str2_isL) {
 9383       __ fmovd(ch1, v1);
 9384     }
 9385     __ br(__ NE, L_HAS_ZERO);
 9386     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9387     __ add(result, result, wordSize/str2_chr_size);
 9388     __ add(str2, str2, wordSize);
 9389     __ br(__ LT, L_POST_LOOP);
 9390     __ BIND(L_LOOP);
 9391       __ ldr(ch2, Address(str2));
 9392       __ eor(ch2, first, ch2);
 9393       __ sub(tmp2, ch2, tmp1);
 9394       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9395       __ bics(tmp2, tmp2, ch2);
 9396       __ br(__ NE, L_HAS_ZERO);
 9397     __ BIND(L_LOOP_PROCEED);
 9398       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9399       __ add(str2, str2, wordSize);
 9400       __ add(result, result, wordSize/str2_chr_size);
 9401       __ br(__ GE, L_LOOP);
 9402     __ BIND(L_POST_LOOP);
 9403       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 9404       __ br(__ LE, NOMATCH);
 9405       __ ldr(ch2, Address(str2));
 9406       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9407       __ eor(ch2, first, ch2);
 9408       __ sub(tmp2, ch2, tmp1);
 9409       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9410       __ mov(tmp4, -1); // all bits set
 9411       __ b(L_SMALL_PROCEED);
 9412     __ align(OptoLoopAlignment);
 9413     __ BIND(L_SMALL);
 9414       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9415       __ eor(ch2, first, ch2);
 9416       if (str1_isL != str2_isL) {
 9417         __ zip1(v1, __ T16B, v1, v0);
 9418       }
 9419       __ sub(tmp2, ch2, tmp1);
 9420       __ mov(tmp4, -1); // all bits set
 9421       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9422       if (str1_isL != str2_isL) {
 9423         __ fmovd(ch1, v1); // move converted 4 symbols
 9424       }
 9425     __ BIND(L_SMALL_PROCEED);
 9426       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 9427       __ bic(tmp2, tmp2, ch2);
 9428       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9429       __ rbit(tmp2, tmp2);
 9430       __ br(__ EQ, NOMATCH);
 9431     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 9433       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9434       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9435       if (str2_isL) { // LL
 9436         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9437         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9438         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9439         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9440         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9441       } else {
        __ mov(ch2, 0xE); // 0b1110: mask to round the byte index down to a char boundary
 9443         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9444         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9445         __ lslv(tmp2, tmp2, tmp4);
 9446         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9447         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9448         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9449         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9450       }
 9451       __ cmp(ch1, ch2);
 9452       __ mov(tmp4, wordSize/str2_chr_size);
 9453       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9454     __ BIND(L_SMALL_CMP_LOOP);
 9455       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9456                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9457       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9458                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9459       __ add(tmp4, tmp4, 1);
 9460       __ cmp(tmp4, cnt1);
 9461       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9462       __ cmp(first, ch2);
 9463       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9464     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9465       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9466       __ clz(tmp4, tmp2);
 9467       __ add(result, result, 1); // advance index
 9468       __ add(str2, str2, str2_chr_size); // advance pointer
 9469       __ b(L_SMALL_HAS_ZERO_LOOP);
 9470     __ align(OptoLoopAlignment);
 9471     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9472       __ cmp(first, ch2);
 9473       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9474       __ b(DONE);
 9475     __ align(OptoLoopAlignment);
 9476     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9477       if (str2_isL) { // LL
 9478         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9479         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9480         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9481         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9482         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9483       } else {
        __ mov(ch2, 0xE); // 0b1110: mask to round the byte index down to a char boundary
 9485         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9486         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9487         __ lslv(tmp2, tmp2, tmp4);
 9488         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9489         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9490         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9491         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9492       }
 9493       __ cmp(ch1, ch2);
 9494       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9495       __ b(DONE);
 9496     __ align(OptoLoopAlignment);
 9497     __ BIND(L_HAS_ZERO);
 9498       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now compress both counters (cnt1 and cnt2) into one register. This is
      // fine because both counters fit in 32 bits and are not changed in this
      // loop; they are restored on exit, so cnt1 can be reused in the loop.
 9503       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
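      // cnt2 now holds (cnt1 << 32) | cnt2: BitsPerByte * wordSize / 2 == 32
      // and both counters fit in 32 bits.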
 9504       __ sub(result, result, 1);
 9505     __ BIND(L_HAS_ZERO_LOOP);
 9506       __ mov(cnt1, wordSize/str2_chr_size);
 9507       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9508       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9509       if (str2_isL) {
 9510         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9511         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9512         __ lslv(tmp2, tmp2, tmp4);
 9513         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9514         __ add(tmp4, tmp4, 1);
 9515         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9516         __ lsl(tmp2, tmp2, 1);
 9517         __ mov(tmp4, wordSize/str2_chr_size);
 9518       } else {
        __ mov(ch2, 0xE); // 0b1110: mask to round the byte index down to a char boundary
 9520         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9521         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9522         __ lslv(tmp2, tmp2, tmp4);
 9523         __ add(tmp4, tmp4, 1);
 9524         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9525         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9526         __ lsl(tmp2, tmp2, 1);
 9527         __ mov(tmp4, wordSize/str2_chr_size);
 9528         __ sub(str2, str2, str2_chr_size);
 9529       }
 9530       __ cmp(ch1, ch2);
 9531       __ mov(tmp4, wordSize/str2_chr_size);
 9532       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9533     __ BIND(L_CMP_LOOP);
 9534       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9535                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9536       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9537                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9538       __ add(tmp4, tmp4, 1);
 9539       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9540       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9541       __ cmp(cnt1, ch2);
 9542       __ br(__ EQ, L_CMP_LOOP);
 9543     __ BIND(L_CMP_LOOP_NOMATCH);
      // reached when the current candidate position did not match
 9545       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9546       __ clz(tmp4, tmp2);
 9547       __ add(str2, str2, str2_chr_size); // advance pointer
 9548       __ b(L_HAS_ZERO_LOOP);
 9549     __ align(OptoLoopAlignment);
 9550     __ BIND(L_CMP_LOOP_LAST_CMP);
 9551       __ cmp(cnt1, ch2);
 9552       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9553       __ b(DONE);
 9554     __ align(OptoLoopAlignment);
 9555     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9556       if (str2_isL) {
 9557         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9558         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9559         __ lslv(tmp2, tmp2, tmp4);
 9560         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9561         __ add(tmp4, tmp4, 1);
 9562         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9563         __ lsl(tmp2, tmp2, 1);
 9564       } else {
        __ mov(ch2, 0xE); // 0b1110: mask to round the byte index down to a char boundary
 9566         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9567         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9568         __ lslv(tmp2, tmp2, tmp4);
 9569         __ add(tmp4, tmp4, 1);
 9570         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9571         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9572         __ lsl(tmp2, tmp2, 1);
 9573         __ sub(str2, str2, str2_chr_size);
 9574       }
 9575       __ cmp(ch1, ch2);
 9576       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9577       __ b(DONE);
 9578     __ align(OptoLoopAlignment);
 9579     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
 9580       // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
 9581       // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
 9582       // so, result was increased at max by wordSize/str2_chr_size - 1, so,
 9583       // respective high bit wasn't changed. L_LOOP_PROCEED will increase
 9584       // result by analyzed characters value, so, we can just reset lower bits
 9585       // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
 9586       // 2) restore cnt1 and cnt2 values from "compressed" cnt2
 9587       // 3) advance str2 value to represent next str2 octet. result & 7/3 is
 9588       // index of last analyzed substring inside current octet. So, str2 in at
 9589       // respective start address. We need to advance it to next octet
 9590       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9591       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9592       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9593       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9594       __ movw(cnt2, cnt2);
 9595       __ b(L_LOOP_PROCEED);
 9596     __ align(OptoLoopAlignment);
 9597     __ BIND(NOMATCH);
 9598       __ mov(result, -1);
 9599     __ BIND(DONE);
 9600       __ pop(spilled_regs, sp);
 9601       __ ret(lr);
 9602     return entry;
 9603   }
 9604 
 9605   void generate_string_indexof_stubs() {
 9606     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9607     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9608     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9609   }
 9610 
 9611   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9612       FloatRegister src1, FloatRegister src2) {
 9613     Register dst = r1;
 9614     __ zip1(v1, __ T16B, src1, v0);
 9615     __ zip2(v2, __ T16B, src1, v0);
 9616     if (generatePrfm) {
 9617       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9618     }
 9619     __ zip1(v3, __ T16B, src2, v0);
 9620     __ zip2(v4, __ T16B, src2, v0);
 9621     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9622   }
 9623 
 9624   // R0 = src
 9625   // R1 = dst
 9626   // R2 = len
 9627   // R3 = len >> 3
 9628   // V0 = 0
 9629   // v1 = loaded 8 bytes
 9630   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
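  // Conceptually the stub performs, for len input bytes (a sketch of the
  // intent, not the exact instruction sequence):
  //   for (size_t i = 0; i < len; i++) dst[i] = (jchar)(src[i] & 0xff);
  // i.e. each byte is widened to a 16-bit char by interleaving it with the
  // zero register v0 via zip1/zip2.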
 9631   address generate_large_byte_array_inflate() {
 9632     __ align(CodeEntryAlignment);
 9633     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
 9634     StubCodeMark mark(this, stub_id);
 9635     address entry = __ pc();
 9636     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9637     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9638     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9639 
    // Do one more 8-byte read so the address is 16-byte aligned in most cases;
    // this also lets us use a single store instruction.
 9642     __ ldrd(v2, __ post(src, 8));
 9643     __ sub(octetCounter, octetCounter, 2);
 9644     __ zip1(v1, __ T16B, v1, v0);
 9645     __ zip1(v2, __ T16B, v2, v0);
 9646     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9647     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9648     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9649     __ br(__ LE, LOOP_START);
 9650     __ b(LOOP_PRFM_START);
 9651     __ bind(LOOP_PRFM);
 9652       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9653     __ bind(LOOP_PRFM_START);
 9654       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9655       __ sub(octetCounter, octetCounter, 8);
 9656       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9657       inflate_and_store_2_fp_registers(true, v3, v4);
 9658       inflate_and_store_2_fp_registers(true, v5, v6);
 9659       __ br(__ GT, LOOP_PRFM);
 9660       __ cmp(octetCounter, (u1)8);
 9661       __ br(__ LT, DONE);
 9662     __ bind(LOOP);
 9663       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9664       __ bind(LOOP_START);
 9665       __ sub(octetCounter, octetCounter, 8);
 9666       __ cmp(octetCounter, (u1)8);
 9667       inflate_and_store_2_fp_registers(false, v3, v4);
 9668       inflate_and_store_2_fp_registers(false, v5, v6);
 9669       __ br(__ GE, LOOP);
 9670     __ bind(DONE);
 9671       __ ret(lr);
 9672     return entry;
 9673   }
 9674 
 9675   /**
 9676    *  Arguments:
 9677    *
 9678    *  Input:
 9679    *  c_rarg0   - current state address
 9680    *  c_rarg1   - H key address
 9681    *  c_rarg2   - data address
 9682    *  c_rarg3   - number of blocks
 9683    *
 9684    *  Output:
 9685    *  Updated state at c_rarg0
 9686    */
 9687   address generate_ghash_processBlocks() {
 9688     // Bafflingly, GCM uses little-endian for the byte order, but
 9689     // big-endian for the bit order.  For example, the polynomial 1 is
 9690     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9691     //
 9692     // So, we must either reverse the bytes in each word and do
 9693     // everything big-endian or reverse the bits in each byte and do
 9694     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9695     // the bits in each byte (we have an instruction, RBIT, to do
 9696     // that) and keep the data in little-endian bit order through the
 9697     // calculation, bit-reversing the inputs and outputs.
 9698 
 9699     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
 9700     StubCodeMark mark(this, stub_id);
 9701     Label polynomial; // local data generated at end of stub
 9702     __ align(CodeEntryAlignment);
 9703     address start = __ pc();
 9704 
 9705     Register state   = c_rarg0;
 9706     Register subkeyH = c_rarg1;
 9707     Register data    = c_rarg2;
 9708     Register blocks  = c_rarg3;
 9709 
 9710     FloatRegister vzr = v30;
 9711     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9712 
 9713     __ adr(rscratch1, polynomial);
 9714     __ ldrq(v24, rscratch1);    // The field polynomial
 9715 
 9716     __ ldrq(v0, Address(state));
 9717     __ ldrq(v1, Address(subkeyH));
 9718 
 9719     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9720     __ rbit(v0, __ T16B, v0);
 9721     __ rev64(v1, __ T16B, v1);
 9722     __ rbit(v1, __ T16B, v1);
 9723 
 9724     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 9725     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 9726 
 9727     {
 9728       Label L_ghash_loop;
 9729       __ bind(L_ghash_loop);
 9730 
 9731       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9732                                                  // reversing each byte
 9733       __ rbit(v2, __ T16B, v2);
 9734       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9735 
 9736       // Multiply state in v2 by subkey in v1
 9737       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9738                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9739                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9740       // Reduce v7:v5 by the field polynomial
 9741       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9742 
 9743       __ sub(blocks, blocks, 1);
 9744       __ cbnz(blocks, L_ghash_loop);
 9745     }
 9746 
 9747     // The bit-reversed result is at this point in v0
 9748     __ rev64(v0, __ T16B, v0);
 9749     __ rbit(v0, __ T16B, v0);
 9750 
 9751     __ st1(v0, __ T16B, state);
 9752     __ ret(lr);
 9753 
 9754     // bind label and generate local polynomial data
 9755     __ align(wordSize * 2);
 9756     __ bind(polynomial);
 9757     __ emit_int64(0x87);  // The low-order bits of the field
 9758                           // polynomial (i.e. p = z^7+z^2+z+1)
 9759                           // repeated in the low and high parts of a
 9760                           // 128-bit vector
 9761     __ emit_int64(0x87);
 9762 
 9763     return start;
 9764   }
 9765 
 9766   address generate_ghash_processBlocks_wide() {
 9767     address small = generate_ghash_processBlocks();
 9768 
 9769     StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
 9770     StubCodeMark mark(this, stub_id);
 9771     Label polynomial;           // local data generated after stub
 9772     __ align(CodeEntryAlignment);
 9773     address start = __ pc();
 9774 
 9775     Register state   = c_rarg0;
 9776     Register subkeyH = c_rarg1;
 9777     Register data    = c_rarg2;
 9778     Register blocks  = c_rarg3;
 9779 
 9780     const int unroll = 4;
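    // Inputs with fewer than unroll * 2 blocks, and any blocks left over after
    // the wide loop below, are handed off to the single-block stub generated
    // above.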
 9781 
 9782     __ cmp(blocks, (unsigned char)(unroll * 2));
 9783     __ br(__ LT, small);
 9784 
 9785     if (unroll > 1) {
      // Save the callee-saved SIMD registers (v8-v15) before entering the routine
 9787       __ sub(sp, sp, 4 * 16);
 9788       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9789       __ sub(sp, sp, 4 * 16);
 9790       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9791     }
 9792 
 9793     __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
 9794 
 9795     if (unroll > 1) {
 9796       // And restore state
 9797       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9798       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9799     }
 9800 
 9801     __ cmp(blocks, (unsigned char)0);
 9802     __ br(__ GT, small);
 9803 
 9804     __ ret(lr);
 9805 
 9806     // bind label and generate polynomial data
 9807     __ align(wordSize * 2);
 9808     __ bind(polynomial);
 9809     __ emit_int64(0x87);  // The low-order bits of the field
 9810                           // polynomial (i.e. p = z^7+z^2+z+1)
 9811                           // repeated in the low and high parts of a
 9812                           // 128-bit vector
 9813     __ emit_int64(0x87);
 9814 
 9815     return start;
 9816 
 9817   }
 9818 
 9819   void generate_base64_encode_simdround(Register src, Register dst,
 9820         FloatRegister codec, u8 size) {
 9821 
 9822     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9823     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9824     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 9825 
 9826     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
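
    // The ld3 below de-interleaves each 3-byte group into in0/in1/in2. The
    // shifts and orrs that follow then derive the four 6-bit indices:
    //   ind0 = in0 >> 2
    //   ind1 = ((in0 & 0x3) << 4) | (in1 >> 4)
    //   ind2 = ((in1 & 0xf) << 2) | (in2 >> 6)
    //   ind3 = in2 & 0x3f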
 9827 
 9828     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 9829 
 9830     __ ushr(ind0, arrangement, in0,  2);
 9831 
 9832     __ ushr(ind1, arrangement, in1,  2);
 9833     __ shl(in0,   arrangement, in0,  6);
 9834     __ orr(ind1,  arrangement, ind1, in0);
 9835     __ ushr(ind1, arrangement, ind1, 2);
 9836 
 9837     __ ushr(ind2, arrangement, in2,  4);
 9838     __ shl(in1,   arrangement, in1,  4);
 9839     __ orr(ind2,  arrangement, in1,  ind2);
 9840     __ ushr(ind2, arrangement, ind2, 2);
 9841 
 9842     __ shl(ind3,  arrangement, in2,  2);
 9843     __ ushr(ind3, arrangement, ind3, 2);
 9844 
 9845     __ tbl(out0,  arrangement, codec,  4, ind0);
 9846     __ tbl(out1,  arrangement, codec,  4, ind1);
 9847     __ tbl(out2,  arrangement, codec,  4, ind2);
 9848     __ tbl(out3,  arrangement, codec,  4, ind3);
 9849 
 9850     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9851   }
 9852 
 9853    /**
 9854    *  Arguments:
 9855    *
 9856    *  Input:
 9857    *  c_rarg0   - src_start
 9858    *  c_rarg1   - src_offset
 9859    *  c_rarg2   - src_length
 9860    *  c_rarg3   - dest_start
 9861    *  c_rarg4   - dest_offset
 9862    *  c_rarg5   - isURL
 9863    *
 9864    */
 9865   address generate_base64_encodeBlock() {
 9866 
 9867     static const char toBase64[64] = {
 9868       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9869       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9870       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9871       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9872       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9873     };
 9874 
 9875     static const char toBase64URL[64] = {
 9876       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9877       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9878       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9879       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9880       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9881     };
 9882 
 9883     __ align(CodeEntryAlignment);
 9884     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
 9885     StubCodeMark mark(this, stub_id);
 9886     address start = __ pc();
 9887 
 9888     Register src   = c_rarg0;  // source array
 9889     Register soff  = c_rarg1;  // source start offset
 9890     Register send  = c_rarg2;  // source end offset
 9891     Register dst   = c_rarg3;  // dest array
 9892     Register doff  = c_rarg4;  // position for writing to dest array
 9893     Register isURL = c_rarg5;  // Base64 or URL character set
 9894 
 9895     // c_rarg6 and c_rarg7 are free to use as temps
 9896     Register codec  = c_rarg6;
 9897     Register length = c_rarg7;
 9898 
 9899     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9900 
 9901     __ add(src, src, soff);
 9902     __ add(dst, dst, doff);
 9903     __ sub(length, send, soff);
 9904 
 9905     // load the codec base address
 9906     __ lea(codec, ExternalAddress((address) toBase64));
 9907     __ cbz(isURL, ProcessData);
 9908     __ lea(codec, ExternalAddress((address) toBase64URL));
 9909 
 9910     __ BIND(ProcessData);
 9911 
    // too short to form a SIMD loop; fall back to the scalar path
 9913     __ cmp(length, (u1)24);
 9914     __ br(Assembler::LT, Process3B);
 9915 
 9916     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9917 
 9918     __ BIND(Process48B);
 9919     __ cmp(length, (u1)48);
 9920     __ br(Assembler::LT, Process24B);
 9921     generate_base64_encode_simdround(src, dst, v0, 16);
 9922     __ sub(length, length, 48);
 9923     __ b(Process48B);
 9924 
 9925     __ BIND(Process24B);
 9926     __ cmp(length, (u1)24);
 9927     __ br(Assembler::LT, SIMDExit);
 9928     generate_base64_encode_simdround(src, dst, v0, 8);
 9929     __ sub(length, length, 24);
 9930 
 9931     __ BIND(SIMDExit);
 9932     __ cbz(length, Exit);
 9933 
 9934     __ BIND(Process3B);
 9935     //  3 src bytes, 24 bits
 9936     __ ldrb(r10, __ post(src, 1));
 9937     __ ldrb(r11, __ post(src, 1));
 9938     __ ldrb(r12, __ post(src, 1));
 9939     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9940     __ orrw(r12, r12, r11, Assembler::LSL, 8);
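    // r12 now holds the 24-bit group: (1st byte << 16) | (2nd byte << 8) | 3rd byte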
 9941     // codec index
 9942     __ ubfmw(r15, r12, 18, 23);
 9943     __ ubfmw(r14, r12, 12, 17);
 9944     __ ubfmw(r13, r12, 6,  11);
 9945     __ andw(r12,  r12, 63);
 9946     // get the code based on the codec
 9947     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9948     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9949     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9950     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9951     __ strb(r15, __ post(dst, 1));
 9952     __ strb(r14, __ post(dst, 1));
 9953     __ strb(r13, __ post(dst, 1));
 9954     __ strb(r12, __ post(dst, 1));
 9955     __ sub(length, length, 3);
 9956     __ cbnz(length, Process3B);
 9957 
 9958     __ BIND(Exit);
 9959     __ ret(lr);
 9960 
 9961     return start;
 9962   }
 9963 
 9964   void generate_base64_decode_simdround(Register src, Register dst,
 9965         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9966 
 9967     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9968     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9969 
 9970     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9971     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9972 
 9973     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9974 
 9975     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9976 
 9977     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9978 
    // We need an unsigned saturating subtract so that all input values in the
    // range [0, 63] yield 0 from the higher-half lookup (they map to index 0,
    // whose table value is 0).
 9981     __ uqsubv(decH0, __ T16B, in0, v27);
 9982     __ uqsubv(decH1, __ T16B, in1, v27);
 9983     __ uqsubv(decH2, __ T16B, in2, v27);
 9984     __ uqsubv(decH3, __ T16B, in3, v27);
 9985 
 9986     // lower half lookup
 9987     __ tbl(decL0, arrangement, codecL, 4, in0);
 9988     __ tbl(decL1, arrangement, codecL, 4, in1);
 9989     __ tbl(decL2, arrangement, codecL, 4, in2);
 9990     __ tbl(decL3, arrangement, codecL, 4, in3);
 9991 
 9992     // higher half lookup
 9993     __ tbx(decH0, arrangement, codecH, 4, decH0);
 9994     __ tbx(decH1, arrangement, codecH, 4, decH1);
 9995     __ tbx(decH2, arrangement, codecH, 4, decH2);
 9996     __ tbx(decH3, arrangement, codecH, 4, decH3);
 9997 
 9998     // combine lower and higher
 9999     __ orr(decL0, arrangement, decL0, decH0);
10000     __ orr(decL1, arrangement, decL1, decH1);
10001     __ orr(decL2, arrangement, decL2, decH2);
10002     __ orr(decL3, arrangement, decL3, decH3);
10003 
    // check for illegal inputs: values larger than 63 (the maximum for 6 bits)
10005     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10006     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10007     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10008     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10009     __ orr(in0, arrangement, decH0, decH1);
10010     __ orr(in1, arrangement, decH2, decH3);
10011     __ orr(in2, arrangement, in0,   in1);
10012     __ umaxv(in3, arrangement, in2);
10013     __ umov(rscratch2, in3, __ B, 0);
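    // rscratch2 is now non-zero iff any input character in this round was illegal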
10014 
10015     // get the data to output
10016     __ shl(out0,  arrangement, decL0, 2);
10017     __ ushr(out1, arrangement, decL1, 4);
10018     __ orr(out0,  arrangement, out0,  out1);
10019     __ shl(out1,  arrangement, decL1, 4);
10020     __ ushr(out2, arrangement, decL2, 2);
10021     __ orr(out1,  arrangement, out1,  out2);
10022     __ shl(out2,  arrangement, decL2, 6);
10023     __ orr(out2,  arrangement, out2,  decL3);
10024 
10025     __ cbz(rscratch2, NoIllegalData);
10026 
10027     // handle illegal input
10028     __ umov(r10, in2, __ D, 0);
10029     if (size == 16) {
10030       __ cbnz(r10, ErrorInLowerHalf);
10031 
10032       // illegal input is in higher half, store the lower half now.
10033       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10034 
10035       __ umov(r10, in2,  __ D, 1);
10036       __ umov(r11, out0, __ D, 1);
10037       __ umov(r12, out1, __ D, 1);
10038       __ umov(r13, out2, __ D, 1);
10039       __ b(StoreLegalData);
10040 
10041       __ BIND(ErrorInLowerHalf);
10042     }
10043     __ umov(r11, out0, __ D, 0);
10044     __ umov(r12, out1, __ D, 0);
10045     __ umov(r13, out2, __ D, 0);
10046 
10047     __ BIND(StoreLegalData);
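    // Store the three decoded bytes of each 4-character group in turn; exit at
    // the first group whose error-flag byte (in r10) is 0xff.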
10048     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10049     __ strb(r11, __ post(dst, 1));
10050     __ strb(r12, __ post(dst, 1));
10051     __ strb(r13, __ post(dst, 1));
10052     __ lsr(r10, r10, 8);
10053     __ lsr(r11, r11, 8);
10054     __ lsr(r12, r12, 8);
10055     __ lsr(r13, r13, 8);
10056     __ b(StoreLegalData);
10057 
10058     __ BIND(NoIllegalData);
10059     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10060   }
10061 
10062 
10063    /**
10064    *  Arguments:
10065    *
10066    *  Input:
10067    *  c_rarg0   - src_start
10068    *  c_rarg1   - src_offset
10069    *  c_rarg2   - src_length
10070    *  c_rarg3   - dest_start
10071    *  c_rarg4   - dest_offset
10072    *  c_rarg5   - isURL
10073    *  c_rarg6   - isMIME
10074    *
10075    */
10076   address generate_base64_decodeBlock() {
10077 
    // The SIMD part of this Base64 decode intrinsic is based on the algorithm
    // outlined at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords,
    // in the section titled "Base64 decoding".
10081 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] = -2,
    // while fromBase(URL)64ForNoSIMD['='] = 255 here.
10085     static const uint8_t fromBase64ForNoSIMD[256] = {
10086       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10087       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10088       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10089        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10090       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10091        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
10092       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10093        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10094       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10095       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10096       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10097       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10098       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10099       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10100       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10101       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10102     };
10103 
10104     static const uint8_t fromBase64URLForNoSIMD[256] = {
10105       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10106       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10107       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10108        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10109       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10110        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
10111       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10112        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10113       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10114       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10115       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10116       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10117       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10118       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10119       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10120       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10121     };
10122 
    // A legal base64 code value is in the range [0, 127]. We need two lookups
    // with tbl/tbx and combine the results to get the decoded data. The 1st table
    // vector lookup uses tbl: out-of-range indices are set to 0 in the destination.
    // The 2nd table vector lookup uses tbx: out-of-range indices are left unchanged
    // in the destination. Input [64, 126] is mapped to index [65, 127] in the second
    // lookup. The value at index 64 is set to 0, so that we know the decoded data
    // was already obtained from the 1st lookup.
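    // For example, 'a' (0x61 = 97): the 1st (tbl) lookup index 97 is out of
    // range, so it yields 0; uqsub(97, 63) = 34, and the 2nd (tbx) lookup
    // reads entry 64 + 34 = 98 of the table below, which is 26, the decoded
    // value of 'a'.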
10130     static const uint8_t fromBase64ForSIMD[128] = {
10131       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10132       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10133       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10134        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10135         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10136        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10137       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10138        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10139     };
10140 
10141     static const uint8_t fromBase64URLForSIMD[128] = {
10142       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10143       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10144       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10145        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10146         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10147        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10148        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10149        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10150     };
10151 
10152     __ align(CodeEntryAlignment);
10153     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10154     StubCodeMark mark(this, stub_id);
10155     address start = __ pc();
10156 
10157     Register src    = c_rarg0;  // source array
10158     Register soff   = c_rarg1;  // source start offset
10159     Register send   = c_rarg2;  // source end offset
10160     Register dst    = c_rarg3;  // dest array
10161     Register doff   = c_rarg4;  // position for writing to dest array
10162     Register isURL  = c_rarg5;  // Base64 or URL character set
10163     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10164 
10165     Register length = send;    // reuse send as length of source data to process
10166 
10167     Register simd_codec   = c_rarg6;
10168     Register nosimd_codec = c_rarg7;
10169 
10170     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10171 
10172     __ enter();
10173 
10174     __ add(src, src, soff);
10175     __ add(dst, dst, doff);
10176 
10177     __ mov(doff, dst);
10178 
10179     __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1); // clear the two low bits: round length down to a multiple of 4
10181 
10182     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10183     __ cbz(isURL, ProcessData);
10184     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10185 
10186     __ BIND(ProcessData);
10187     __ mov(rscratch1, length);
10188     __ cmp(length, (u1)144); // 144 = 80 + 64
10189     __ br(Assembler::LT, Process4B);
10190 
10191     // In the MIME case, the line length cannot be more than 76
10192     // bytes (see RFC 2045). This is too short a block for SIMD
10193     // to be worthwhile, so we use non-SIMD here.
10194     __ movw(rscratch1, 79);
10195 
10196     __ BIND(Process4B);
10197     __ ldrw(r14, __ post(src, 4));
10198     __ ubfxw(r10, r14, 0,  8);
10199     __ ubfxw(r11, r14, 8,  8);
10200     __ ubfxw(r12, r14, 16, 8);
10201     __ ubfxw(r13, r14, 24, 8);
    // get the decoded values based on the codec
10203     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10204     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10205     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10206     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10207     // error detection, 255u indicates an illegal input
10208     __ orrw(r14, r10, r11);
10209     __ orrw(r15, r12, r13);
10210     __ orrw(r14, r14, r15);
10211     __ tbnz(r14, 7, Exit);
10212     // recover the data
10213     __ lslw(r14, r10, 10);
10214     __ bfiw(r14, r11, 4, 6);
10215     __ bfmw(r14, r12, 2, 5);
10216     __ rev16w(r14, r14);
10217     __ bfiw(r13, r12, 6, 2);
10218     __ strh(r14, __ post(dst, 2));
10219     __ strb(r13, __ post(dst, 1));
10220     // non-simd loop
10221     __ subsw(rscratch1, rscratch1, 4);
10222     __ br(Assembler::GT, Process4B);
10223 
    // If we exit from the 80-byte pre-processing path above (rscratch1 was set
    // to 79), rscratch1 == -1 here; otherwise the whole input has been decoded
    // and rscratch1 == 0.
10226     __ cbzw(rscratch1, Exit);
10227     __ sub(length, length, 80);
10228 
10229     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10230     __ cbz(isURL, SIMDEnter);
10231     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10232 
10233     __ BIND(SIMDEnter);
10234     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10235     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10236     __ mov(rscratch1, 63);
10237     __ dup(v27, __ T16B, rscratch1);
10238 
10239     __ BIND(Process64B);
10240     __ cmp(length, (u1)64);
10241     __ br(Assembler::LT, Process32B);
10242     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10243     __ sub(length, length, 64);
10244     __ b(Process64B);
10245 
10246     __ BIND(Process32B);
10247     __ cmp(length, (u1)32);
10248     __ br(Assembler::LT, SIMDExit);
10249     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10250     __ sub(length, length, 32);
10251     __ b(Process32B);
10252 
10253     __ BIND(SIMDExit);
10254     __ cbz(length, Exit);
10255     __ movw(rscratch1, length);
10256     __ b(Process4B);
10257 
10258     __ BIND(Exit);
10259     __ sub(c_rarg0, dst, doff);
10260 
10261     __ leave();
10262     __ ret(lr);
10263 
10264     return start;
10265   }
10266 
10267   // Support for spin waits.
10268   address generate_spin_wait() {
10269     __ align(CodeEntryAlignment);
10270     StubId stub_id = StubId::stubgen_spin_wait_id;
10271     StubCodeMark mark(this, stub_id);
10272     address start = __ pc();
10273 
10274     __ spin_wait();
10275     __ ret(lr);
10276 
10277     return start;
10278   }
10279 
10280   void generate_lookup_secondary_supers_table_stub() {
10281     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10282     StubCodeMark mark(this, stub_id);
10283 
10284     const Register
10285       r_super_klass  = r0,
10286       r_array_base   = r1,
10287       r_array_length = r2,
10288       r_array_index  = r3,
10289       r_sub_klass    = r4,
10290       r_bitmap       = rscratch2,
10291       result         = r5;
10292     const FloatRegister
10293       vtemp          = v0;
10294 
10295     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10296       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10297       Label L_success;
10298       __ enter();
10299       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10300                                              r_array_base, r_array_length, r_array_index,
10301                                              vtemp, result, slot,
10302                                              /*stub_is_near*/true);
10303       __ leave();
10304       __ ret(lr);
10305     }
10306   }
10307 
10308   // Slow path implementation for UseSecondarySupersTable.
10309   address generate_lookup_secondary_supers_table_slow_path_stub() {
10310     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10311     StubCodeMark mark(this, stub_id);
10312 
10313     address start = __ pc();
10314     const Register
10315       r_super_klass  = r0,        // argument
10316       r_array_base   = r1,        // argument
10317       temp1          = r2,        // temp
10318       r_array_index  = r3,        // argument
10319       r_bitmap       = rscratch2, // argument
10320       result         = r5;        // argument
10321 
10322     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10323     __ ret(lr);
10324 
10325     return start;
10326   }
10327 
10328 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10329 
10330   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10331   //
10332   // If LSE is in use, generate LSE versions of all the stubs. The
10333   // non-LSE versions are in atomic_aarch64.S.
10334 
10335   // class AtomicStubMark records the entry point of a stub and the
10336   // stub pointer which will point to it. The stub pointer is set to
10337   // the entry point when ~AtomicStubMark() is called, which must be
10338   // after ICache::invalidate_range. This ensures safe publication of
10339   // the generated code.
10340   class AtomicStubMark {
10341     address _entry_point;
10342     aarch64_atomic_stub_t *_stub;
10343     MacroAssembler *_masm;
10344   public:
10345     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10346       _masm = masm;
10347       __ align(32);
10348       _entry_point = __ pc();
10349       _stub = stub;
10350     }
10351     ~AtomicStubMark() {
10352       *_stub = (aarch64_atomic_stub_t)_entry_point;
10353     }
10354   };
10355 
10356   // NB: For memory_order_conservative we need a trailing membar after
10357   // LSE atomic operations but not a leading membar.
10358   //
10359   // We don't need a leading membar because a clause in the Arm ARM
10360   // says:
10361   //
10362   //   Barrier-ordered-before
10363   //
10364   //   Barrier instructions order prior Memory effects before subsequent
10365   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10369   //   instruction with both Acquire and Release semantics.
10370   //
10371   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10372   // and Release semantics, therefore we don't need a leading
10373   // barrier. However, there is no corresponding Barrier-ordered-after
10374   // relationship, therefore we need a trailing membar to prevent a
10375   // later store or load from being reordered with the store in an
10376   // atomic instruction.
10377   //
10378   // This was checked by using the herd7 consistency model simulator
10379   // (http://diy.inria.fr/) with this test case:
10380   //
10381   // AArch64 LseCas
10382   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10383   // P0 | P1;
10384   // LDR W4, [X2] | MOV W3, #0;
10385   // DMB LD       | MOV W4, #1;
10386   // LDR W3, [X1] | CASAL W3, W4, [X1];
10387   //              | DMB ISH;
10388   //              | STR W4, [X2];
10389   // exists
10390   // (0:X3=0 /\ 0:X4=1)
10391   //
10392   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10393   // with the store to x in P1. Without the DMB in P1 this may happen.
10394   //
10395   // At the time of writing we don't know of any AArch64 hardware that
10396   // reorders stores in this way, but the Reference Manual permits it.
10397 
10398   void gen_cas_entry(Assembler::operand_size size,
10399                      atomic_memory_order order) {
10400     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10401       exchange_val = c_rarg2;
10402     bool acquire, release;
10403     switch (order) {
10404       case memory_order_relaxed:
10405         acquire = false;
10406         release = false;
10407         break;
10408       case memory_order_release:
10409         acquire = false;
10410         release = true;
10411         break;
10412       default:
10413         acquire = true;
10414         release = true;
10415         break;
10416     }
10417     __ mov(prev, compare_val);
10418     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10419     if (order == memory_order_conservative) {
10420       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10421     }
10422     if (size == Assembler::xword) {
10423       __ mov(r0, prev);
10424     } else {
10425       __ movw(r0, prev);
10426     }
10427     __ ret(lr);
10428   }
10429 
10430   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10431     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10432     // If not relaxed, then default to conservative.  Relaxed is the only
10433     // case we use enough to be worth specializing.
10434     if (order == memory_order_relaxed) {
10435       __ ldadd(size, incr, prev, addr);
10436     } else {
10437       __ ldaddal(size, incr, prev, addr);
10438       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10439     }
10440     if (size == Assembler::xword) {
10441       __ mov(r0, prev);
10442     } else {
10443       __ movw(r0, prev);
10444     }
10445     __ ret(lr);
10446   }
10447 
10448   void gen_swpal_entry(Assembler::operand_size size) {
10449     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10450     __ swpal(size, incr, prev, addr);
10451     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10452     if (size == Assembler::xword) {
10453       __ mov(r0, prev);
10454     } else {
10455       __ movw(r0, prev);
10456     }
10457     __ ret(lr);
10458   }
10459 
10460   void generate_atomic_entry_points() {
10461     if (! UseLSE) {
10462       return;
10463     }
10464     __ align(CodeEntryAlignment);
10465     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10466     StubCodeMark mark(this, stub_id);
10467     address first_entry = __ pc();
10468 
10469     // ADD, memory_order_conservative
10470     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10471     gen_ldadd_entry(Assembler::word, memory_order_conservative);
10472     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10473     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10474 
10475     // ADD, memory_order_relaxed
10476     AtomicStubMark mark_fetch_add_4_relaxed
10477       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10478     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10479     AtomicStubMark mark_fetch_add_8_relaxed
10480       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10481     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10482 
10483     // XCHG, memory_order_conservative
10484     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10485     gen_swpal_entry(Assembler::word);
10486     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10487     gen_swpal_entry(Assembler::xword);
10488 
10489     // CAS, memory_order_conservative
10490     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10491     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10492     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10493     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10494     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10495     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10496 
10497     // CAS, memory_order_relaxed
10498     AtomicStubMark mark_cmpxchg_1_relaxed
10499       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10500     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10501     AtomicStubMark mark_cmpxchg_4_relaxed
10502       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10503     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10504     AtomicStubMark mark_cmpxchg_8_relaxed
10505       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10506     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10507 
10508     AtomicStubMark mark_cmpxchg_4_release
10509       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10510     gen_cas_entry(MacroAssembler::word, memory_order_release);
10511     AtomicStubMark mark_cmpxchg_8_release
10512       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10513     gen_cas_entry(MacroAssembler::xword, memory_order_release);
10514 
10515     AtomicStubMark mark_cmpxchg_4_seq_cst
10516       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10517     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10518     AtomicStubMark mark_cmpxchg_8_seq_cst
10519       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10520     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10521 
10522     ICache::invalidate_range(first_entry, __ pc() - first_entry);
10523   }
10524 #endif // LINUX
10525 
10526   address generate_cont_thaw(Continuation::thaw_kind kind) {
10527     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10528     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10529 
10530     address start = __ pc();
10531 
10532     if (return_barrier) {
10533       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10534       __ mov(sp, rscratch1);
10535     }
10536     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10537 
10538     if (return_barrier) {
10539       // preserve possible return value from a method returning to the return barrier
10540       __ fmovd(rscratch1, v0);
10541       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10542     }
10543 
10544     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10545     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10546     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10547 
10548     if (return_barrier) {
10549       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10550       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10551       __ fmovd(v0, rscratch1);
10552     }
10553     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10554 
10555 
10556     Label thaw_success;
10557     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10558     __ cbnz(rscratch2, thaw_success);
10559     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10560     __ br(rscratch1);
10561     __ bind(thaw_success);
10562 
10563     // make room for the thawed frames
10564     __ sub(rscratch1, sp, rscratch2);
10565     __ andr(rscratch1, rscratch1, -16); // align
10566     __ mov(sp, rscratch1);
10567 
10568     if (return_barrier) {
10569       // save original return value -- again
10570       __ fmovd(rscratch1, v0);
10571       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10572     }
10573 
10574     // If we want, we can templatize thaw by kind, and have three different entries
10575     __ movw(c_rarg1, (uint32_t)kind);
10576 
10577     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10578     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10579 
10580     if (return_barrier) {
10581       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10582       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10583       __ fmovd(v0, rscratch1);
10584     } else {
10585       __ mov(r0, zr); // return 0 (success) from doYield
10586     }
10587 
    // we're now on the yield frame (which is at a higher address than us because sp has been moved down)
10589     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10590     __ mov(rfp, sp);
10591 
10592     if (return_barrier_exception) {
10593       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10594       __ authenticate_return_address(c_rarg1);
10595       __ verify_oop(r0);
10596       // save return value containing the exception oop in callee-saved R19
10597       __ mov(r19, r0);
10598 
10599       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10600 
10601       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10602       // __ reinitialize_ptrue();
10603 
10604       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10605 
10606       __ mov(r1, r0); // the exception handler
10607       __ mov(r0, r19); // restore return value containing the exception oop
10608       __ verify_oop(r0);
10609 
10610       __ leave();
10611       __ mov(r3, lr);
10612       __ br(r1); // the exception handler
10613     } else {
10614       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10615       __ leave();
10616       __ ret(lr);
10617     }
10618 
10619     return start;
10620   }
10621 
10622   address generate_cont_thaw() {
10623     if (!Continuations::enabled()) return nullptr;
10624 
10625     StubId stub_id = StubId::stubgen_cont_thaw_id;
10626     StubCodeMark mark(this, stub_id);
10627     address start = __ pc();
10628     generate_cont_thaw(Continuation::thaw_top);
10629     return start;
10630   }
10631 
10632   address generate_cont_returnBarrier() {
10633     if (!Continuations::enabled()) return nullptr;
10634 
10635     // TODO: will probably need multiple return barriers depending on return type
10636     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10637     StubCodeMark mark(this, stub_id);
10638     address start = __ pc();
10639 
10640     generate_cont_thaw(Continuation::thaw_return_barrier);
10641 
10642     return start;
10643   }
10644 
10645   address generate_cont_returnBarrier_exception() {
10646     if (!Continuations::enabled()) return nullptr;
10647 
10648     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10649     StubCodeMark mark(this, stub_id);
10650     address start = __ pc();
10651 
10652     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10653 
10654     return start;
10655   }
10656 
10657   address generate_cont_preempt_stub() {
10658     if (!Continuations::enabled()) return nullptr;
10659     StubId stub_id = StubId::stubgen_cont_preempt_id;
10660     StubCodeMark mark(this, stub_id);
10661     address start = __ pc();
10662 
10663     __ reset_last_Java_frame(true);
10664 
10665     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10666     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10667     __ mov(sp, rscratch2);
10668 
10669     Label preemption_cancelled;
10670     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10671     __ cbnz(rscratch1, preemption_cancelled);
10672 
10673     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10674     SharedRuntime::continuation_enter_cleanup(_masm);
10675     __ leave();
10676     __ ret(lr);
10677 
10678     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10679     __ bind(preemption_cancelled);
10680     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10681     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10682     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10683     __ ldr(rscratch1, Address(rscratch1));
10684     __ br(rscratch1);
10685 
10686     return start;
10687   }
10688 
10689   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10690   // are represented as long[5], with BITS_PER_LIMB = 26.
10691   // Pack five 26-bit limbs into three 64-bit registers.
10692   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10693     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10694     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10695     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10696     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10697 
10698     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10699     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10700     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10701     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10702 
10703     if (dest2->is_valid()) {
10704       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10705     } else {
10706 #ifdef ASSERT
10707       Label OK;
10708       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10709       __ br(__ EQ, OK);
10710       __ stop("high bits of Poly1305 integer should be zero");
10711       __ should_not_reach_here();
10712       __ bind(OK);
10713 #endif
10714     }
10715   }
10716 
10717   // As above, but return only a 128-bit integer, packed into two
10718   // 64-bit registers.
10719   void pack_26(Register dest0, Register dest1, Register src) {
10720     pack_26(dest0, dest1, noreg, src);
10721   }
10722 
10723   // Multiply and multiply-accumulate unsigned 64-bit registers.
10724   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10725     __ mul(prod_lo, n, m);
10726     __ umulh(prod_hi, n, m);
10727   }
10728   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10729     wide_mul(rscratch1, rscratch2, n, m);
10730     __ adds(sum_lo, sum_lo, rscratch1);
10731     __ adc(sum_hi, sum_hi, rscratch2);
10732   }
10733 
10734   // Poly1305, RFC 7539
10735 
10736   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10737   // description of the tricks used to simplify and accelerate this
10738   // computation.
10739 
10740   address generate_poly1305_processBlocks() {
10741     __ align(CodeEntryAlignment);
10742     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10743     StubCodeMark mark(this, stub_id);
10744     address start = __ pc();
10745     Label here;
10746     __ enter();
10747     RegSet callee_saved = RegSet::range(r19, r28);
10748     __ push(callee_saved, sp);
10749 
10750     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10751 
10752     // Arguments
10753     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10754 
10755     // R_n is the 128-bit randomly-generated key, packed into two
10756     // registers.  The caller passes this key to us as long[5], with
10757     // BITS_PER_LIMB = 26.
10758     const Register R_0 = *++regs, R_1 = *++regs;
10759     pack_26(R_0, R_1, r_start);
10760 
10761     // RR_n is (R_n >> 2) * 5
10762     const Register RR_0 = *++regs, RR_1 = *++regs;
10763     __ lsr(RR_0, R_0, 2);
10764     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10765     __ lsr(RR_1, R_1, 2);
10766     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10767 
10768     // U_n is the current checksum
10769     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10770     pack_26(U_0, U_1, U_2, acc_start);
10771 
10772     static constexpr int BLOCK_LENGTH = 16;
10773     Label DONE, LOOP;
10774 
10775     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10776     __ br(Assembler::LT, DONE); {
10777       __ bind(LOOP);
10778 
10779       // S_n is to be the sum of U_n and the next block of data
10780       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10781       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10782       __ adds(S_0, U_0, S_0);
10783       __ adcs(S_1, U_1, S_1);
10784       __ adc(S_2, U_2, zr);
10785       __ add(S_2, S_2, 1);
10786 
10787       const Register U_0HI = *++regs, U_1HI = *++regs;
10788 
10789       // NB: this logic depends on some of the special properties of
10790       // Poly1305 keys. In particular, because we know that the top
10791       // four bits of R_0 and R_1 are zero, we can add together
10792       // partial products without any risk of needing to propagate a
10793       // carry out.
10794       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10795       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10796       __ andr(U_2, R_0, 3);
10797       __ mul(U_2, S_2, U_2);
10798 
10799       // Recycle registers S_0, S_1, S_2
10800       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10801 
10802       // Partial reduction mod 2**130 - 5
10803       __ adds(U_1, U_0HI, U_1);
10804       __ adc(U_2, U_1HI, U_2);
10805       // Sum now in U_2:U_1:U_0.
10806       // Dead: U_0HI, U_1HI.
10807       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10808 
10809       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10810 
10811       // First, U_2:U_1:U_0 += (U_2 >> 2)
10812       __ lsr(rscratch1, U_2, 2);
10813       __ andr(U_2, U_2, (u8)3);
10814       __ adds(U_0, U_0, rscratch1);
10815       __ adcs(U_1, U_1, zr);
10816       __ adc(U_2, U_2, zr);
10817       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10818       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10819       __ adcs(U_1, U_1, zr);
10820       __ adc(U_2, U_2, zr);
10821 
10822       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10823       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10824       __ br(~ Assembler::LT, LOOP);
10825     }
10826 
10827     // Further reduce modulo 2^130 - 5
10828     __ lsr(rscratch1, U_2, 2);
10829     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10830     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10831     __ adcs(U_1, U_1, zr);
10832     __ andr(U_2, U_2, (u1)3);
10833     __ adc(U_2, U_2, zr);
10834 
10835     // Unpack the sum into five 26-bit limbs and write to memory.
10836     __ ubfiz(rscratch1, U_0, 0, 26);
10837     __ ubfx(rscratch2, U_0, 26, 26);
10838     __ stp(rscratch1, rscratch2, Address(acc_start));
10839     __ ubfx(rscratch1, U_0, 52, 12);
10840     __ bfi(rscratch1, U_1, 12, 14);
10841     __ ubfx(rscratch2, U_1, 14, 26);
10842     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10843     __ ubfx(rscratch1, U_1, 40, 24);
10844     __ bfi(rscratch1, U_2, 24, 3);
10845     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10846 
10847     __ bind(DONE);
10848     __ pop(callee_saved, sp);
10849     __ leave();
10850     __ ret(lr);
10851 
10852     return start;
10853   }
10854 
10855   // exception handler for upcall stubs
10856   address generate_upcall_stub_exception_handler() {
10857     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10858     StubCodeMark mark(this, stub_id);
10859     address start = __ pc();
10860 
    // The native caller has no idea how to handle exceptions,
    // so we just crash here. It is up to the callee to catch exceptions.
10863     __ verify_oop(r0);
10864     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10865     __ blr(rscratch1);
10866     __ should_not_reach_here();
10867 
10868     return start;
10869   }
10870 
10871   // load Method* target of MethodHandle
10872   // j_rarg0 = jobject receiver
10873   // rmethod = result
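  // In Java terms, approximately:
  //   rmethod = ((MethodHandle) receiver).form.vmentry.method.vmtarget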
10874   address generate_upcall_stub_load_target() {
10875     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10876     StubCodeMark mark(this, stub_id);
10877     address start = __ pc();
10878 
10879     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
    // Load target method from receiver
10881     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10882     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10883     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10884     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10885                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10886                       noreg, noreg);
10887     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10888 
10889     __ ret(lr);
10890 
10891     return start;
10892   }
10893 
10894 #undef __
10895 #define __ masm->
10896 
10897   class MontgomeryMultiplyGenerator : public MacroAssembler {
10898 
10899     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10900       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10901 
10902     RegSet _toSave;
10903     bool _squaring;
10904 
10905   public:
10906     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10907       : MacroAssembler(as->code()), _squaring(squaring) {
10908 
10909       // Register allocation
10910 
10911       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10912       Pa_base = *regs;       // Argument registers
10913       if (squaring)
10914         Pb_base = Pa_base;
10915       else
10916         Pb_base = *++regs;
10917       Pn_base = *++regs;
      Rlen = *++regs;
10919       inv = *++regs;
10920       Pm_base = *++regs;
10921 
10922                           // Working registers:
10923       Ra =  *++regs;        // The current digit of a, b, n, and m.
10924       Rb =  *++regs;
10925       Rm =  *++regs;
10926       Rn =  *++regs;
10927 
10928       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10929       Pb =  *++regs;
10930       Pm =  *++regs;
10931       Pn =  *++regs;
10932 
10933       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
10935       t2 =  *++regs;
10936 
10937       Ri =  *++regs;        // Inner and outer loop indexes.
10938       Rj =  *++regs;
10939 
10940       Rhi_ab = *++regs;     // Product registers: low and high parts
10941       Rlo_ab = *++regs;     // of a*b and m*n.
10942       Rhi_mn = *++regs;
10943       Rlo_mn = *++regs;
10944 
10945       // r19 and up are callee-saved.
10946       _toSave = RegSet::range(r19, *regs) + Pm_base;
10947     }
10948 
10949   private:
10950     void save_regs() {
10951       push(_toSave, sp);
10952     }
10953 
10954     void restore_regs() {
10955       pop(_toSave, sp);
10956     }
10957 
10958     template <typename T>
10959     void unroll_2(Register count, T block) {
10960       Label loop, end, odd;
10961       tbnz(count, 0, odd);
10962       cbz(count, end);
10963       align(16);
10964       bind(loop);
10965       (this->*block)();
10966       bind(odd);
10967       (this->*block)();
10968       subs(count, count, 2);
10969       br(Assembler::GT, loop);
10970       bind(end);
10971     }
10972 
10973     template <typename T>
10974     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10975       Label loop, end, odd;
10976       tbnz(count, 0, odd);
10977       cbz(count, end);
10978       align(16);
10979       bind(loop);
10980       (this->*block)(d, s, tmp);
10981       bind(odd);
10982       (this->*block)(d, s, tmp);
10983       subs(count, count, 2);
10984       br(Assembler::GT, loop);
10985       bind(end);
10986     }
10987 
10988     void pre1(RegisterOrConstant i) {
10989       block_comment("pre1");
10990       // Pa = Pa_base;
10991       // Pb = Pb_base + i;
10992       // Pm = Pm_base;
10993       // Pn = Pn_base + i;
10994       // Ra = *Pa;
10995       // Rb = *Pb;
10996       // Rm = *Pm;
10997       // Rn = *Pn;
10998       ldr(Ra, Address(Pa_base));
10999       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11000       ldr(Rm, Address(Pm_base));
11001       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11002       lea(Pa, Address(Pa_base));
11003       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11004       lea(Pm, Address(Pm_base));
11005       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11006 
11007       // Zero the m*n result.
11008       mov(Rhi_mn, zr);
11009       mov(Rlo_mn, zr);
11010     }
11011 
11012     // The core multiply-accumulate step of a Montgomery
11013     // multiplication.  The idea is to schedule operations as a
11014     // pipeline so that instructions with long latencies (loads and
11015     // multiplies) have time to complete before their results are
    // used.  This mostly benefits in-order implementations of the
    // architecture, but out-of-order ones benefit too.
11018     void step() {
11019       block_comment("step");
11020       // MACC(Ra, Rb, t0, t1, t2);
11021       // Ra = *++Pa;
11022       // Rb = *--Pb;
11023       umulh(Rhi_ab, Ra, Rb);
11024       mul(Rlo_ab, Ra, Rb);
11025       ldr(Ra, pre(Pa, wordSize));
11026       ldr(Rb, pre(Pb, -wordSize));
11027       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11028                                        // previous iteration.
11029       // MACC(Rm, Rn, t0, t1, t2);
11030       // Rm = *++Pm;
11031       // Rn = *--Pn;
11032       umulh(Rhi_mn, Rm, Rn);
11033       mul(Rlo_mn, Rm, Rn);
11034       ldr(Rm, pre(Pm, wordSize));
11035       ldr(Rn, pre(Pn, -wordSize));
11036       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11037     }
11038 
11039     void post1() {
11040       block_comment("post1");
11041 
11042       // MACC(Ra, Rb, t0, t1, t2);
11043       // Ra = *++Pa;
11044       // Rb = *--Pb;
11045       umulh(Rhi_ab, Ra, Rb);
11046       mul(Rlo_ab, Ra, Rb);
11047       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11048       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11049 
11050       // *Pm = Rm = t0 * inv;
11051       mul(Rm, t0, inv);
11052       str(Rm, Address(Pm));
11053 
11054       // MACC(Rm, Rn, t0, t1, t2);
11055       // t0 = t1; t1 = t2; t2 = 0;
11056       umulh(Rhi_mn, Rm, Rn);
11057 
11058 #ifndef PRODUCT
11059       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11060       {
11061         mul(Rlo_mn, Rm, Rn);
11062         add(Rlo_mn, t0, Rlo_mn);
11063         Label ok;
11064         cbz(Rlo_mn, ok); {
11065           stop("broken Montgomery multiply");
11066         } bind(ok);
11067       }
11068 #endif
11069       // We have very carefully set things up so that
11070       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11071       // the lower half of Rm * Rn because we know the result already:
11072       // it must be -t0.  t0 + (-t0) must generate a carry iff
11073       // t0 != 0.  So, rather than do a mul and an adds we just set
11074       // the carry flag iff t0 is nonzero.
11075       //
11076       // mul(Rlo_mn, Rm, Rn);
11077       // adds(zr, t0, Rlo_mn);
11078       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11079       adcs(t0, t1, Rhi_mn);
11080       adc(t1, t2, zr);
11081       mov(t2, zr);
11082     }
11083 
11084     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11085       block_comment("pre2");
11086       // Pa = Pa_base + i-len;
11087       // Pb = Pb_base + len;
11088       // Pm = Pm_base + i-len;
11089       // Pn = Pn_base + len;
11090 
11091       if (i.is_register()) {
11092         sub(Rj, i.as_register(), len);
11093       } else {
11094         mov(Rj, i.as_constant());
11095         sub(Rj, Rj, len);
11096       }
11097       // Rj == i-len
11098 
11099       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11100       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11101       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11102       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11103 
11104       // Ra = *++Pa;
11105       // Rb = *--Pb;
11106       // Rm = *++Pm;
11107       // Rn = *--Pn;
11108       ldr(Ra, pre(Pa, wordSize));
11109       ldr(Rb, pre(Pb, -wordSize));
11110       ldr(Rm, pre(Pm, wordSize));
11111       ldr(Rn, pre(Pn, -wordSize));
11112 
11113       mov(Rhi_mn, zr);
11114       mov(Rlo_mn, zr);
11115     }
11116 
11117     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11118       block_comment("post2");
11119       if (i.is_constant()) {
11120         mov(Rj, i.as_constant()-len.as_constant());
11121       } else {
11122         sub(Rj, i.as_register(), len);
11123       }
11124 
11125       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11126 
11127       // As soon as we know the least significant digit of our result,
11128       // store it.
11129       // Pm_base[i-len] = t0;
11130       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11131 
11132       // t0 = t1; t1 = t2; t2 = 0;
11133       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11134       adc(t1, t2, zr);
11135       mov(t2, zr);
11136     }
11137 
11138     // A carry in t0 after Montgomery multiplication means that we
11139     // should subtract multiples of n from our result in m.  We'll
11140     // keep doing that until there is no carry.
11141     void normalize(RegisterOrConstant len) {
11142       block_comment("normalize");
11143       // while (t0)
11144       //   t0 = sub(Pm_base, Pn_base, t0, len);
11145       Label loop, post, again;
11146       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11147       cbz(t0, post); {
11148         bind(again); {
11149           mov(i, zr);
11150           mov(cnt, len);
11151           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11152           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11153           subs(zr, zr, zr); // set carry flag, i.e. no borrow
11154           align(16);
11155           bind(loop); {
11156             sbcs(Rm, Rm, Rn);
11157             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11158             add(i, i, 1);
11159             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11160             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11161             sub(cnt, cnt, 1);
11162           } cbnz(cnt, loop);
11163           sbc(t0, t0, zr);
11164         } cbnz(t0, again);
11165       } bind(post);
11166     }
11167 
11168     // Move memory at s to d, reversing words.
11169     //    Increments d to end of copied memory
11170     //    Destroys tmp1, tmp2
11171     //    Preserves len
11172     //    Leaves s pointing to the address which was in d at start
11173     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11174       assert(tmp1->encoding() < r19->encoding(), "register corruption");
11175       assert(tmp2->encoding() < r19->encoding(), "register corruption");
11176 
11177       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11178       mov(tmp1, len);
11179       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11180       sub(s, d, len, ext::uxtw, LogBytesPerWord);
11181     }
11182     // where
11183     void reverse1(Register d, Register s, Register tmp) {
11184       ldr(tmp, pre(s, -wordSize));
11185       ror(tmp, tmp, 32);
11186       str(tmp, post(d, wordSize));
11187     }
11188 
11189     void step_squaring() {
11190       // An extra ACC
11191       step();
11192       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11193     }
11194 
11195     void last_squaring(RegisterOrConstant i) {
11196       Label dont;
11197       // if ((i & 1) == 0) {
11198       tbnz(i.as_register(), 0, dont); {
11199         // MACC(Ra, Rb, t0, t1, t2);
11200         // Ra = *++Pa;
11201         // Rb = *--Pb;
11202         umulh(Rhi_ab, Ra, Rb);
11203         mul(Rlo_ab, Ra, Rb);
11204         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11205       } bind(dont);
11206     }
11207 
11208     void extra_step_squaring() {
11209       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11210 
11211       // MACC(Rm, Rn, t0, t1, t2);
11212       // Rm = *++Pm;
11213       // Rn = *--Pn;
11214       umulh(Rhi_mn, Rm, Rn);
11215       mul(Rlo_mn, Rm, Rn);
11216       ldr(Rm, pre(Pm, wordSize));
11217       ldr(Rn, pre(Pn, -wordSize));
11218     }
11219 
11220     void post1_squaring() {
11221       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11222 
11223       // *Pm = Rm = t0 * inv;
11224       mul(Rm, t0, inv);
11225       str(Rm, Address(Pm));
11226 
11227       // MACC(Rm, Rn, t0, t1, t2);
11228       // t0 = t1; t1 = t2; t2 = 0;
11229       umulh(Rhi_mn, Rm, Rn);
11230 
11231 #ifndef PRODUCT
11232       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11233       {
11234         mul(Rlo_mn, Rm, Rn);
11235         add(Rlo_mn, t0, Rlo_mn);
11236         Label ok;
11237         cbz(Rlo_mn, ok); {
11238           stop("broken Montgomery multiply");
11239         } bind(ok);
11240       }
11241 #endif
11242       // We have very carefully set things up so that
11243       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11244       // the lower half of Rm * Rn because we know the result already:
11245       // it must be -t0.  t0 + (-t0) must generate a carry iff
11246       // t0 != 0.  So, rather than do a mul and an adds we just set
11247       // the carry flag iff t0 is nonzero.
11248       //
11249       // mul(Rlo_mn, Rm, Rn);
11250       // adds(zr, t0, Rlo_mn);
11251       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11252       adcs(t0, t1, Rhi_mn);
11253       adc(t1, t2, zr);
11254       mov(t2, zr);
11255     }
11256 
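    // (t2:t1:t0) += (Rhi:Rlo): accumulate a 128-bit product into the
    // triple-precision accumulator.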
11257     void acc(Register Rhi, Register Rlo,
11258              Register t0, Register t1, Register t2) {
11259       adds(t0, t0, Rlo);
11260       adcs(t1, t1, Rhi);
11261       adc(t2, t2, zr);
11262     }
11263 
11264   public:
11265     /**
11266      * Fast Montgomery multiplication.  The derivation of the
11267      * algorithm is in A Cryptographic Library for the Motorola
11268      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11269      *
11270      * Arguments:
11271      *
11272      * Inputs for multiplication:
11273      *   c_rarg0   - int array elements a
11274      *   c_rarg1   - int array elements b
11275      *   c_rarg2   - int array elements n (the modulus)
11276      *   c_rarg3   - int length
11277      *   c_rarg4   - int inv
11278      *   c_rarg5   - int array elements m (the result)
11279      *
11280      * Inputs for squaring:
11281      *   c_rarg0   - int array elements a
11282      *   c_rarg1   - int array elements n (the modulus)
11283      *   c_rarg2   - int length
11284      *   c_rarg3   - int inv
11285      *   c_rarg4   - int array elements m (the result)
11286      *
11287      */
11288     address generate_multiply() {
11289       Label argh, nothing;
11290       bind(argh);
11291       stop("MontgomeryMultiply total_allocation must be <= 8192");
11292 
11293       align(CodeEntryAlignment);
11294       address entry = pc();
11295 
11296       cbzw(Rlen, nothing);
11297 
11298       enter();
11299 
11300       // Make room.
11301       cmpw(Rlen, 512);
11302       br(Assembler::HI, argh);
11303       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11304       andr(sp, Ra, -2 * wordSize);
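      // Rlen counts jints here; we allocate Rlen * 4 * sizeof (jint) == Rlen * 16
      // bytes below sp, so the cmpw above bounds total_allocation (see the stop
      // message at argh) by 512 * 16 == 8192.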
11305 
11306       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11307 
11308       {
11309         // Copy input args, reversing as we go.  We use Ra as a
11310         // temporary variable.
11311         reverse(Ra, Pa_base, Rlen, t0, t1);
11312         if (!_squaring)
11313           reverse(Ra, Pb_base, Rlen, t0, t1);
11314         reverse(Ra, Pn_base, Rlen, t0, t1);
11315       }
11316 
      // Push all callee-saved registers and also Pm_base, which we'll need
      // at the end.
11319       save_regs();
11320 
11321 #ifndef PRODUCT
11322       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11323       {
11324         ldr(Rn, Address(Pn_base, 0));
11325         mul(Rlo_mn, Rn, inv);
11326         subs(zr, Rlo_mn, -1);
11327         Label ok;
11328         br(EQ, ok); {
11329           stop("broken inverse in Montgomery multiply");
11330         } bind(ok);
11331       }
11332 #endif
11333 
11334       mov(Pm_base, Ra);
11335 
11336       mov(t0, zr);
11337       mov(t1, zr);
11338       mov(t2, zr);
11339 
11340       block_comment("for (int i = 0; i < len; i++) {");
11341       mov(Ri, zr); {
11342         Label loop, end;
11343         cmpw(Ri, Rlen);
11344         br(Assembler::GE, end);
11345 
11346         bind(loop);
11347         pre1(Ri);
11348 
11349         block_comment("  for (j = i; j; j--) {"); {
11350           movw(Rj, Ri);
11351           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11352         } block_comment("  } // j");
11353 
11354         post1();
11355         addw(Ri, Ri, 1);
11356         cmpw(Ri, Rlen);
11357         br(Assembler::LT, loop);
11358         bind(end);
11359         block_comment("} // i");
11360       }
11361 
11362       block_comment("for (int i = len; i < 2*len; i++) {");
11363       mov(Ri, Rlen); {
11364         Label loop, end;
11365         cmpw(Ri, Rlen, Assembler::LSL, 1);
11366         br(Assembler::GE, end);
11367 
11368         bind(loop);
11369         pre2(Ri, Rlen);
11370 
11371         block_comment("  for (j = len*2-i-1; j; j--) {"); {
11372           lslw(Rj, Rlen, 1);
11373           subw(Rj, Rj, Ri);
11374           subw(Rj, Rj, 1);
11375           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11376         } block_comment("  } // j");
11377 
11378         post2(Ri, Rlen);
11379         addw(Ri, Ri, 1);
11380         cmpw(Ri, Rlen, Assembler::LSL, 1);
11381         br(Assembler::LT, loop);
11382         bind(end);
11383       }
11384       block_comment("} // i");
11385 
11386       normalize(Rlen);
11387 
11388       mov(Ra, Pm_base);  // Save Pm_base in Ra
11389       restore_regs();  // Restore caller's Pm_base
11390 
11391       // Copy our result into caller's Pm_base
11392       reverse(Pm_base, Ra, Rlen, t0, t1);
11393 
11394       leave();
11395       bind(nothing);
11396       ret(lr);
11397 
11398       return entry;
11399     }
11400     // In C, approximately:
11401 
11402     // void
11403     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11404     //                     julong Pn_base[], julong Pm_base[],
11405     //                     julong inv, int len) {
11406     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11407     //   julong *Pa, *Pb, *Pn, *Pm;
11408     //   julong Ra, Rb, Rn, Rm;
11409 
11410     //   int i;
11411 
11412     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11413 
11414     //   for (i = 0; i < len; i++) {
11415     //     int j;
11416 
11417     //     Pa = Pa_base;
11418     //     Pb = Pb_base + i;
11419     //     Pm = Pm_base;
11420     //     Pn = Pn_base + i;
11421 
11422     //     Ra = *Pa;
11423     //     Rb = *Pb;
11424     //     Rm = *Pm;
11425     //     Rn = *Pn;
11426 
11427     //     int iters = i;
11428     //     for (j = 0; iters--; j++) {
11429     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11430     //       MACC(Ra, Rb, t0, t1, t2);
11431     //       Ra = *++Pa;
11432     //       Rb = *--Pb;
11433     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11434     //       MACC(Rm, Rn, t0, t1, t2);
11435     //       Rm = *++Pm;
11436     //       Rn = *--Pn;
11437     //     }
11438 
11439     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11440     //     MACC(Ra, Rb, t0, t1, t2);
11441     //     *Pm = Rm = t0 * inv;
11442     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11443     //     MACC(Rm, Rn, t0, t1, t2);
11444 
11445     //     assert(t0 == 0, "broken Montgomery multiply");
11446 
11447     //     t0 = t1; t1 = t2; t2 = 0;
11448     //   }
11449 
11450     //   for (i = len; i < 2*len; i++) {
11451     //     int j;
11452 
11453     //     Pa = Pa_base + i-len;
11454     //     Pb = Pb_base + len;
11455     //     Pm = Pm_base + i-len;
11456     //     Pn = Pn_base + len;
11457 
11458     //     Ra = *++Pa;
11459     //     Rb = *--Pb;
11460     //     Rm = *++Pm;
11461     //     Rn = *--Pn;
11462 
11463     //     int iters = len*2-i-1;
11464     //     for (j = i-len+1; iters--; j++) {
11465     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11466     //       MACC(Ra, Rb, t0, t1, t2);
11467     //       Ra = *++Pa;
11468     //       Rb = *--Pb;
11469     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11470     //       MACC(Rm, Rn, t0, t1, t2);
11471     //       Rm = *++Pm;
11472     //       Rn = *--Pn;
11473     //     }
11474 
11475     //     Pm_base[i-len] = t0;
11476     //     t0 = t1; t1 = t2; t2 = 0;
11477     //   }
11478 
11479     //   while (t0)
11480     //     t0 = sub(Pm_base, Pn_base, t0, len);
11481     // }
11482 
11483     /**
11484      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
11485      * multiplies than Montgomery multiplication so it should be up to
11486      * 25% faster.  However, its loop control is more complex and it
11487      * may actually run slower on some machines.
11488      *
11489      * Arguments:
11490      *
11491      * Inputs:
11492      *   c_rarg0   - int array elements a
11493      *   c_rarg1   - int array elements n (the modulus)
11494      *   c_rarg2   - int length
11495      *   c_rarg3   - int inv
11496      *   c_rarg4   - int array elements m (the result)
11497      *
11498      */
11499     address generate_square() {
11500       Label argh;
11501       bind(argh);
11502       stop("MontgomeryMultiply total_allocation must be <= 8192");
11503 
11504       align(CodeEntryAlignment);
11505       address entry = pc();
11506 
11507       enter();
11508 
11509       // Make room.
11510       cmpw(Rlen, 512);
11511       br(Assembler::HI, argh);
11512       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11513       andr(sp, Ra, -2 * wordSize);
11514 
11515       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11516 
11517       {
11518         // Copy input args, reversing as we go.  We use Ra as a
11519         // temporary variable.
11520         reverse(Ra, Pa_base, Rlen, t0, t1);
11521         reverse(Ra, Pn_base, Rlen, t0, t1);
11522       }
11523 
      // Push all callee-saved registers and also Pm_base, which we'll need
      // at the end.
11526       save_regs();
11527 
11528       mov(Pm_base, Ra);
11529 
11530       mov(t0, zr);
11531       mov(t1, zr);
11532       mov(t2, zr);
11533 
11534       block_comment("for (int i = 0; i < len; i++) {");
11535       mov(Ri, zr); {
11536         Label loop, end;
11537         bind(loop);
11538         cmp(Ri, Rlen);
11539         br(Assembler::GE, end);
11540 
11541         pre1(Ri);
11542 
11543         block_comment("for (j = (i+1)/2; j; j--) {"); {
11544           add(Rj, Ri, 1);
11545           lsr(Rj, Rj, 1);
11546           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11547         } block_comment("  } // j");
11548 
11549         last_squaring(Ri);
11550 
11551         block_comment("  for (j = i/2; j; j--) {"); {
11552           lsr(Rj, Ri, 1);
11553           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11554         } block_comment("  } // j");
11555 
11556         post1_squaring();
11557         add(Ri, Ri, 1);
11558         cmp(Ri, Rlen);
11559         br(Assembler::LT, loop);
11560 
11561         bind(end);
11562         block_comment("} // i");
11563       }
11564 
11565       block_comment("for (int i = len; i < 2*len; i++) {");
11566       mov(Ri, Rlen); {
11567         Label loop, end;
11568         bind(loop);
11569         cmp(Ri, Rlen, Assembler::LSL, 1);
11570         br(Assembler::GE, end);
11571 
11572         pre2(Ri, Rlen);
11573 
11574         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11575           lsl(Rj, Rlen, 1);
11576           sub(Rj, Rj, Ri);
11577           sub(Rj, Rj, 1);
11578           lsr(Rj, Rj, 1);
11579           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11580         } block_comment("  } // j");
11581 
11582         last_squaring(Ri);
11583 
11584         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11585           lsl(Rj, Rlen, 1);
11586           sub(Rj, Rj, Ri);
11587           lsr(Rj, Rj, 1);
11588           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11589         } block_comment("  } // j");
11590 
11591         post2(Ri, Rlen);
11592         add(Ri, Ri, 1);
11593         cmp(Ri, Rlen, Assembler::LSL, 1);
11594 
11595         br(Assembler::LT, loop);
11596         bind(end);
11597         block_comment("} // i");
11598       }
11599 
11600       normalize(Rlen);
11601 
11602       mov(Ra, Pm_base);  // Save Pm_base in Ra
11603       restore_regs();  // Restore caller's Pm_base
11604 
11605       // Copy our result into caller's Pm_base
11606       reverse(Pm_base, Ra, Rlen, t0, t1);
11607 
11608       leave();
11609       ret(lr);
11610 
11611       return entry;
11612     }
11613     // In C, approximately:
11614 
11615     // void
11616     // montgomery_square(julong Pa_base[], julong Pn_base[],
11617     //                   julong Pm_base[], julong inv, int len) {
11618     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11619     //   julong *Pa, *Pb, *Pn, *Pm;
11620     //   julong Ra, Rb, Rn, Rm;
11621 
11622     //   int i;
11623 
11624     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11625 
11626     //   for (i = 0; i < len; i++) {
11627     //     int j;
11628 
11629     //     Pa = Pa_base;
11630     //     Pb = Pa_base + i;
11631     //     Pm = Pm_base;
11632     //     Pn = Pn_base + i;
11633 
11634     //     Ra = *Pa;
11635     //     Rb = *Pb;
11636     //     Rm = *Pm;
11637     //     Rn = *Pn;
11638 
11639     //     int iters = (i+1)/2;
11640     //     for (j = 0; iters--; j++) {
11641     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11642     //       MACC2(Ra, Rb, t0, t1, t2);
11643     //       Ra = *++Pa;
11644     //       Rb = *--Pb;
11645     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11646     //       MACC(Rm, Rn, t0, t1, t2);
11647     //       Rm = *++Pm;
11648     //       Rn = *--Pn;
11649     //     }
11650     //     if ((i & 1) == 0) {
11651     //       assert(Ra == Pa_base[j], "must be");
11652     //       MACC(Ra, Ra, t0, t1, t2);
11653     //     }
11654     //     iters = i/2;
11655     //     assert(iters == i-j, "must be");
11656     //     for (; iters--; j++) {
11657     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11658     //       MACC(Rm, Rn, t0, t1, t2);
11659     //       Rm = *++Pm;
11660     //       Rn = *--Pn;
11661     //     }
11662 
11663     //     *Pm = Rm = t0 * inv;
11664     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11665     //     MACC(Rm, Rn, t0, t1, t2);
11666 
11667     //     assert(t0 == 0, "broken Montgomery multiply");
11668 
11669     //     t0 = t1; t1 = t2; t2 = 0;
11670     //   }
11671 
11672     //   for (i = len; i < 2*len; i++) {
11673     //     int start = i-len+1;
11674     //     int end = start + (len - start)/2;
11675     //     int j;
11676 
11677     //     Pa = Pa_base + i-len;
11678     //     Pb = Pa_base + len;
11679     //     Pm = Pm_base + i-len;
11680     //     Pn = Pn_base + len;
11681 
11682     //     Ra = *++Pa;
11683     //     Rb = *--Pb;
11684     //     Rm = *++Pm;
11685     //     Rn = *--Pn;
11686 
11687     //     int iters = (2*len-i-1)/2;
11688     //     assert(iters == end-start, "must be");
11689     //     for (j = start; iters--; j++) {
11690     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11691     //       MACC2(Ra, Rb, t0, t1, t2);
11692     //       Ra = *++Pa;
11693     //       Rb = *--Pb;
11694     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11695     //       MACC(Rm, Rn, t0, t1, t2);
11696     //       Rm = *++Pm;
11697     //       Rn = *--Pn;
11698     //     }
11699     //     if ((i & 1) == 0) {
11700     //       assert(Ra == Pa_base[j], "must be");
11701     //       MACC(Ra, Ra, t0, t1, t2);
11702     //     }
11703     //     iters =  (2*len-i)/2;
11704     //     assert(iters == len-j, "must be");
11705     //     for (; iters--; j++) {
11706     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11707     //       MACC(Rm, Rn, t0, t1, t2);
11708     //       Rm = *++Pm;
11709     //       Rn = *--Pn;
11710     //     }
11711     //     Pm_base[i-len] = t0;
11712     //     t0 = t1; t1 = t2; t2 = 0;
11713     //   }
11714 
11715     //   while (t0)
11716     //     t0 = sub(Pm_base, Pn_base, t0, len);
11717     // }
11718   };
11719 
11720   // Initialization
11721   void generate_preuniverse_stubs() {
11722     // preuniverse stubs are not needed for aarch64
11723   }
11724 
11725   void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points
11727 
    // entry points that exist on all platforms. Note: This is code
11729     // that could be shared among different platforms - however the
11730     // benefit seems to be smaller than the disadvantage of having a
11731     // much more complicated generator structure. See also comment in
11732     // stubRoutines.hpp.
11733 
11734     StubRoutines::_forward_exception_entry = generate_forward_exception();
11735 
11736     StubRoutines::_call_stub_entry =
11737       generate_call_stub(StubRoutines::_call_stub_return_address);
11738 
11739     // is referenced by megamorphic call
11740     StubRoutines::_catch_exception_entry = generate_catch_exception();
11741 
11742     // Initialize table for copy memory (arraycopy) check.
11743     if (UnsafeMemoryAccess::_table == nullptr) {
11744       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11745     }
11746 
11747     if (UseCRC32Intrinsics) {
11748       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11749     }
11750 
11751     if (UseCRC32CIntrinsics) {
11752       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11753     }
11754 
11755     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11756       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11757     }
11758 
11759     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11760       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11761     }
11762 
11763     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11764         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11765       StubRoutines::_hf2f = generate_float16ToFloat();
11766       StubRoutines::_f2hf = generate_floatToFloat16();
11767     }
11768   }
11769 
11770   void generate_continuation_stubs() {
11771     // Continuation stubs:
11772     StubRoutines::_cont_thaw          = generate_cont_thaw();
11773     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11774     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11775     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11776   }
11777 
11778   void generate_final_stubs() {
11779     // support for verify_oop (must happen after universe_init)
11780     if (VerifyOops) {
11781       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11782     }
11783 
11784     // arraycopy stubs used by compilers
11785     generate_arraycopy_stubs();
11786 
11787     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11788 
11789     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11790 
11791     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11792     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11793 
11794 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11795 
11796     generate_atomic_entry_points();
11797 
11798 #endif // LINUX
11799 
11800 #ifdef COMPILER2
11801     if (UseSecondarySupersTable) {
11802       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11803       if (! InlineSecondarySupersTest) {
11804         generate_lookup_secondary_supers_table_stub();
11805       }
11806     }
11807 #endif
11808 
11809     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11810 
    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11812   }
11813 
11814   void generate_compiler_stubs() {
11815 #if COMPILER2_OR_JVMCI
11816 
11817     if (UseSVE == 0) {
11818       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11819     }
11820 
11821     // array equals stub for large arrays.
11822     if (!UseSimpleArrayEquals) {
11823       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11824     }
11825 
11826     // arrays_hascode stub for large arrays.
11827     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11828     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11829     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11830     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11831     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11832 
11833     // byte_array_inflate stub for large arrays.
11834     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11835 
11836     // countPositives stub for large arrays.
11837     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11838 
11839     generate_compare_long_strings();
11840 
11841     generate_string_indexof_stubs();
11842 
11843 #ifdef COMPILER2
11844     if (UseMultiplyToLenIntrinsic) {
11845       StubRoutines::_multiplyToLen = generate_multiplyToLen();
11846     }
11847 
11848     if (UseSquareToLenIntrinsic) {
11849       StubRoutines::_squareToLen = generate_squareToLen();
11850     }
11851 
11852     if (UseMulAddIntrinsic) {
11853       StubRoutines::_mulAdd = generate_mulAdd();
11854     }
11855 
11856     if (UseSIMDForBigIntegerShiftIntrinsics) {
11857       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11858       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
11859     }
11860 
11861     if (UseMontgomeryMultiplyIntrinsic) {
11862       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
11863       StubCodeMark mark(this, stub_id);
11864       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11865       StubRoutines::_montgomeryMultiply = g.generate_multiply();
11866     }
11867 
11868     if (UseMontgomerySquareIntrinsic) {
11869       StubId stub_id = StubId::stubgen_montgomerySquare_id;
11870       StubCodeMark mark(this, stub_id);
11871       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11872       // We use generate_multiply() rather than generate_square()
11873       // because it's faster for the sizes of modulus we care about.
11874       StubRoutines::_montgomerySquare = g.generate_multiply();
11875     }
11876 
11877 #endif // COMPILER2
11878 
11879     if (UseChaCha20Intrinsics) {
11880       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11881     }
11882 
11883     if (UseKyberIntrinsics) {
11884       StubRoutines::_kyberNtt = generate_kyberNtt();
11885       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11886       StubRoutines::_kyberNttMult = generate_kyberNttMult();
11887       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11888       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11889       StubRoutines::_kyber12To16 = generate_kyber12To16();
11890       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11891     }
11892 
11893     if (UseDilithiumIntrinsics) {
11894       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11895       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11896       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11897       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11898       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11899     }
11900 
11901     if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11904     }
11905 
11906     // data cache line writeback
11907     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11908     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11909 
11910     if (UseAESIntrinsics) {
11911       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11912       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11913       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11914       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11915       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11916     }
11917     if (UseGHASHIntrinsics) {
11918       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11919       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11920     }
11921     if (UseAESIntrinsics && UseGHASHIntrinsics) {
11922       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11923     }
11924 
11925     if (UseMD5Intrinsics) {
11926       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
11927       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
11928     }
11929     if (UseSHA1Intrinsics) {
11930       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
11931       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
11932     }
11933     if (UseSHA256Intrinsics) {
11934       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
11935       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
11936     }
11937     if (UseSHA512Intrinsics) {
11938       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
11939       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
11940     }
    if (UseSHA3Intrinsics) {
      StubRoutines::_double_keccak         = generate_double_keccak();
      if (UseSIMDForSHA3Intrinsic) {
        StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
        StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
      } else {
        StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
        StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
      }
    }
11952 
11953     if (UsePoly1305Intrinsics) {
11954       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11955     }
11956 
11957     // generate Adler32 intrinsics code
11958     if (UseAdler32Intrinsics) {
11959       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11960     }
11961 
11962 #endif // COMPILER2_OR_JVMCI
11963   }
11964 
11965  public:
11966   StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
11967     switch(blob_id) {
11968     case BlobId::stubgen_preuniverse_id:
11969       generate_preuniverse_stubs();
11970       break;
11971     case BlobId::stubgen_initial_id:
11972       generate_initial_stubs();
11973       break;
    case BlobId::stubgen_continuation_id:
11975       generate_continuation_stubs();
11976       break;
11977     case BlobId::stubgen_compiler_id:
11978       generate_compiler_stubs();
11979       break;
11980     case BlobId::stubgen_final_id:
11981       generate_final_stubs();
11982       break;
11983     default:
11984       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
11985       break;
11986     };
11987   }
11988 }; // end class declaration
11989 
11990 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
11991   StubGenerator g(code, blob_id);
11992 }
11993 
11994 
11995 #if defined (LINUX)
11996 
11997 // Define pointers to atomic stubs and initialize them to point to the
11998 // code in atomic_aarch64.S.
11999 
12000 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
12001   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12002     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
12003   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12004     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
12005 
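// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands (approximately) to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
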
12006 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12007 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12008 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12009 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12010 DEFAULT_ATOMIC_OP(xchg, 4, )
12011 DEFAULT_ATOMIC_OP(xchg, 8, )
12012 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12013 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12014 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12015 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12016 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12017 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12018 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12019 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12020 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12021 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12022 
12023 #undef DEFAULT_ATOMIC_OP
12024 
12025 #endif // LINUX