1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomicAccess.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
   87 #ifdef PRODUCT
   88 #define inc_counter_np(counter) ((void)0)
   89 #else
   90   void inc_counter_np_(uint& counter) {
   91     __ incrementw(ExternalAddress((address)&counter));
   92   }
   93 #define inc_counter_np(counter) \
   94   BLOCK_COMMENT("inc_counter " #counter); \
   95   inc_counter_np_(counter);
   96 #endif
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
  112   //
  113   // we save r30 (lr) as the return PC at the base of the frame and
   114   // link r29 (fp) below it as the frame pointer, then install sp (r31)
  115   // into fp.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  124   // we don't need to save r8 which C uses as an indirect result location
  125   // return register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
   130   // we don't need to save r16-r18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
   144   // -28 [ saved Floating-point Control Register ] <--- sp_after_call
   145   // -26 [ saved v15            ]
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
  173 
  174   // Call stub stack layout word offsets from fp
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
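         // All offsets above are in words relative to rfp; e.g. the saved thread slot
         // lives at rfp + thread_off * wordSize = rfp - 8 and the saved lr at rfp + 8.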
  199 
  200   address generate_call_stub(address& return_address) {
  201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  203            "adjust this code");
  204 
  205     StubId stub_id = StubId::stubgen_call_stub_id;
  206     StubCodeMark mark(this, stub_id);
  207     address start = __ pc();
  208 
  209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  210 
  211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  213     const Address result        (rfp, result_off         * wordSize);
  214     const Address result_type   (rfp, result_type_off    * wordSize);
  215     const Address method        (rfp, method_off         * wordSize);
  216     const Address entry_point   (rfp, entry_point_off    * wordSize);
  217     const Address parameter_size(rfp, parameter_size_off * wordSize);
  218 
  219     const Address thread        (rfp, thread_off         * wordSize);
  220 
  221     const Address d15_save      (rfp, d15_off * wordSize);
  222     const Address d13_save      (rfp, d13_off * wordSize);
  223     const Address d11_save      (rfp, d11_off * wordSize);
  224     const Address d9_save       (rfp, d9_off * wordSize);
  225 
  226     const Address r28_save      (rfp, r28_off * wordSize);
  227     const Address r26_save      (rfp, r26_off * wordSize);
  228     const Address r24_save      (rfp, r24_off * wordSize);
  229     const Address r22_save      (rfp, r22_off * wordSize);
  230     const Address r20_save      (rfp, r20_off * wordSize);
  231 
  232     // stub code
  233 
  234     address aarch64_entry = __ pc();
  235 
  236     // set up frame and move sp to end of save area
  237     __ enter();
  238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
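           // sp_after_call_off is negative, so this leaves sp at rfp - 28 words,
           // i.e. at the bottom of the register save area laid out above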
  239 
  240     // save register parameters and Java scratch/global registers
  241     // n.b. we save thread even though it gets installed in
  242     // rthread because we want to sanity check rthread later
  243     __ str(c_rarg7,  thread);
  244     __ strw(c_rarg6, parameter_size);
  245     __ stp(c_rarg4, c_rarg5,  entry_point);
  246     __ stp(c_rarg2, c_rarg3,  result_type);
  247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  248 
  249     __ stp(r20, r19,   r20_save);
  250     __ stp(r22, r21,   r22_save);
  251     __ stp(r24, r23,   r24_save);
  252     __ stp(r26, r25,   r26_save);
  253     __ stp(r28, r27,   r28_save);
  254 
  255     __ stpd(v9,  v8,   d9_save);
  256     __ stpd(v11, v10,  d11_save);
  257     __ stpd(v13, v12,  d13_save);
  258     __ stpd(v15, v14,  d15_save);
  259 
  260     __ get_fpcr(rscratch1);
  261     __ str(rscratch1, fpcr_save);
  262     // Set FPCR to the state we need. We do want Round to Nearest. We
  263     // don't want non-IEEE rounding modes or floating-point traps.
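           // (in the FPCR, DN is bit 25, FZ bit 24, RMode bits 23:22, and the
           // trap-enable bits IOE..IXE occupy bits 12:8)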
  264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  266     __ set_fpcr(rscratch1);
  267 
  268     // install Java thread in global register now we have saved
  269     // whatever value it held
  270     __ mov(rthread, c_rarg7);
  271     // And method
  272     __ mov(rmethod, c_rarg3);
  273 
  274     // set up the heapbase register
  275     __ reinit_heapbase();
  276 
  277 #ifdef ASSERT
  278     // make sure we have no pending exceptions
  279     {
  280       Label L;
  281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  282       __ cmp(rscratch1, (u1)NULL_WORD);
  283       __ br(Assembler::EQ, L);
  284       __ stop("StubRoutines::call_stub: entered with pending exception");
  285       __ BIND(L);
  286     }
  287 #endif
  288     // pass parameters if any
  289     __ mov(esp, sp);
  290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  291     __ andr(sp, rscratch1, -2 * wordSize);
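           // the AArch64 ABI requires sp to stay 16-byte aligned, hence the round
           // down to a multiple of 2 * wordSize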
  292 
  293     BLOCK_COMMENT("pass parameters if any");
  294     Label parameters_done;
  295     // parameter count is still in c_rarg6
  296     // and parameter pointer identifying param 1 is in c_rarg5
  297     __ cbzw(c_rarg6, parameters_done);
  298 
  299     address loop = __ pc();
  300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  301     __ subsw(c_rarg6, c_rarg6, 1);
  302     __ push(rscratch1);
  303     __ br(Assembler::GT, loop);
  304 
  305     __ BIND(parameters_done);
  306 
   307     // call Java entry -- passing Method*, and current sp
  308     //      rmethod: Method*
  309     //      r19_sender_sp: sender sp
  310     BLOCK_COMMENT("call Java function");
  311     __ mov(r19_sender_sp, sp);
  312     __ blr(c_rarg4);
  313 
  314     // we do this here because the notify will already have been done
  315     // if we get to the next instruction via an exception
  316     //
  317     // n.b. adding this instruction here affects the calculation of
  318     // whether or not a routine returns to the call stub (used when
  319     // doing stack walks) since the normal test is to check the return
  320     // pc against the address saved below. so we may need to allow for
  321     // this extra instruction in the check.
  322 
  323     // save current address for use by exception handling code
  324 
  325     return_address = __ pc();
  326 
  327     // store result depending on type (everything that is not
  328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  329     // n.b. this assumes Java returns an integral result in r0
  330     // and a floating result in j_farg0
  331     __ ldr(j_rarg2, result);
  332     Label is_long, is_float, is_double, exit;
  333     __ ldr(j_rarg1, result_type);
  334     __ cmp(j_rarg1, (u1)T_OBJECT);
  335     __ br(Assembler::EQ, is_long);
  336     __ cmp(j_rarg1, (u1)T_LONG);
  337     __ br(Assembler::EQ, is_long);
  338     __ cmp(j_rarg1, (u1)T_FLOAT);
  339     __ br(Assembler::EQ, is_float);
  340     __ cmp(j_rarg1, (u1)T_DOUBLE);
  341     __ br(Assembler::EQ, is_double);
  342 
  343     // handle T_INT case
  344     __ strw(r0, Address(j_rarg2));
  345 
  346     __ BIND(exit);
  347 
  348     // pop parameters
  349     __ sub(esp, rfp, -sp_after_call_off * wordSize);
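           // i.e. esp = sp_after_call again, dropping the arguments pushed for the Java call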
  350 
  351 #ifdef ASSERT
  352     // verify that threads correspond
  353     {
  354       Label L, S;
  355       __ ldr(rscratch1, thread);
  356       __ cmp(rthread, rscratch1);
  357       __ br(Assembler::NE, S);
  358       __ get_thread(rscratch1);
  359       __ cmp(rthread, rscratch1);
  360       __ br(Assembler::EQ, L);
  361       __ BIND(S);
  362       __ stop("StubRoutines::call_stub: threads must correspond");
  363       __ BIND(L);
  364     }
  365 #endif
  366 
  367     __ pop_cont_fastpath(rthread);
  368 
  369     // restore callee-save registers
  370     __ ldpd(v15, v14,  d15_save);
  371     __ ldpd(v13, v12,  d13_save);
  372     __ ldpd(v11, v10,  d11_save);
  373     __ ldpd(v9,  v8,   d9_save);
  374 
  375     __ ldp(r28, r27,   r28_save);
  376     __ ldp(r26, r25,   r26_save);
  377     __ ldp(r24, r23,   r24_save);
  378     __ ldp(r22, r21,   r22_save);
  379     __ ldp(r20, r19,   r20_save);
  380 
  381     // restore fpcr
  382     __ ldr(rscratch1,  fpcr_save);
  383     __ set_fpcr(rscratch1);
  384 
  385     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  386     __ ldrw(c_rarg2, result_type);
  387     __ ldr(c_rarg3,  method);
  388     __ ldp(c_rarg4, c_rarg5,  entry_point);
  389     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  390 
  391     // leave frame and return to caller
  392     __ leave();
  393     __ ret(lr);
  394 
  395     // handle return types different from T_INT
  396 
  397     __ BIND(is_long);
  398     __ str(r0, Address(j_rarg2, 0));
  399     __ br(Assembler::AL, exit);
  400 
  401     __ BIND(is_float);
  402     __ strs(j_farg0, Address(j_rarg2, 0));
  403     __ br(Assembler::AL, exit);
  404 
  405     __ BIND(is_double);
  406     __ strd(j_farg0, Address(j_rarg2, 0));
  407     __ br(Assembler::AL, exit);
  408 
  409     return start;
  410   }
  411 
  412   // Return point for a Java call if there's an exception thrown in
  413   // Java code.  The exception is caught and transformed into a
  414   // pending exception stored in JavaThread that can be tested from
  415   // within the VM.
  416   //
  417   // Note: Usually the parameters are removed by the callee. In case
  418   // of an exception crossing an activation frame boundary, that is
   419   // not the case if the callee is compiled code => need to set up
   420   // sp.
  421   //
  422   // r0: exception oop
  423 
  424   address generate_catch_exception() {
  425     StubId stub_id = StubId::stubgen_catch_exception_id;
  426     StubCodeMark mark(this, stub_id);
  427     address start = __ pc();
  428 
  429     // same as in generate_call_stub():
  430     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  431     const Address thread        (rfp, thread_off         * wordSize);
  432 
  433 #ifdef ASSERT
  434     // verify that threads correspond
  435     {
  436       Label L, S;
  437       __ ldr(rscratch1, thread);
  438       __ cmp(rthread, rscratch1);
  439       __ br(Assembler::NE, S);
  440       __ get_thread(rscratch1);
  441       __ cmp(rthread, rscratch1);
  442       __ br(Assembler::EQ, L);
  443       __ bind(S);
  444       __ stop("StubRoutines::catch_exception: threads must correspond");
  445       __ bind(L);
  446     }
  447 #endif
  448 
  449     // set pending exception
  450     __ verify_oop(r0);
  451 
  452     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  453     __ mov(rscratch1, (address)__FILE__);
  454     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  455     __ movw(rscratch1, (int)__LINE__);
  456     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  457 
  458     // complete return to VM
  459     assert(StubRoutines::_call_stub_return_address != nullptr,
  460            "_call_stub_return_address must have been generated before");
  461     __ b(StubRoutines::_call_stub_return_address);
  462 
  463     return start;
  464   }
  465 
  466   // Continuation point for runtime calls returning with a pending
  467   // exception.  The pending exception check happened in the runtime
  468   // or native call stub.  The pending exception in Thread is
  469   // converted into a Java-level exception.
  470   //
  471   // Contract with Java-level exception handlers:
  472   // r0: exception
  473   // r3: throwing pc
  474   //
  475   // NOTE: At entry of this stub, exception-pc must be in LR !!
  476 
  477   // NOTE: this is always used as a jump target within generated code
   478   // so it just needs to be generated code with no prolog
  479 
  480   address generate_forward_exception() {
  481     StubId stub_id = StubId::stubgen_forward_exception_id;
  482     StubCodeMark mark(this, stub_id);
  483     address start = __ pc();
  484 
  485     // Upon entry, LR points to the return address returning into
  486     // Java (interpreted or compiled) code; i.e., the return address
  487     // becomes the throwing pc.
  488     //
  489     // Arguments pushed before the runtime call are still on the stack
  490     // but the exception handler will reset the stack pointer ->
  491     // ignore them.  A potential result in registers can be ignored as
  492     // well.
  493 
  494 #ifdef ASSERT
  495     // make sure this code is only executed if there is a pending exception
  496     {
  497       Label L;
  498       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  499       __ cbnz(rscratch1, L);
  500       __ stop("StubRoutines::forward exception: no pending exception (1)");
  501       __ bind(L);
  502     }
  503 #endif
  504 
  505     // compute exception handler into r19
  506 
  507     // call the VM to find the handler address associated with the
  508     // caller address. pass thread in r0 and caller pc (ret address)
  509     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  510     // the stack.
  511     __ mov(c_rarg1, lr);
  512     // lr will be trashed by the VM call so we move it to R19
  513     // (callee-saved) because we also need to pass it to the handler
  514     // returned by this call.
  515     __ mov(r19, lr);
  516     BLOCK_COMMENT("call exception_handler_for_return_address");
  517     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  518                          SharedRuntime::exception_handler_for_return_address),
  519                     rthread, c_rarg1);
  520     // Reinitialize the ptrue predicate register, in case the external runtime
  521     // call clobbers ptrue reg, as we may return to SVE compiled code.
  522     __ reinitialize_ptrue();
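           // the handler address comes back in r0; it is saved in r19 below and
           // branched to once r0/r3 have been set up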
  523 
  524     // we should not really care that lr is no longer the callee
  525     // address. we saved the value the handler needs in r19 so we can
  526     // just copy it to r3. however, the C2 handler will push its own
   527     // frame and then call into the VM, and the VM code asserts that
  528     // the PC for the frame above the handler belongs to a compiled
  529     // Java method. So, we restore lr here to satisfy that assert.
  530     __ mov(lr, r19);
  531     // setup r0 & r3 & clear pending exception
  532     __ mov(r3, r19);
  533     __ mov(r19, r0);
  534     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  535     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  536 
  537 #ifdef ASSERT
  538     // make sure exception is set
  539     {
  540       Label L;
  541       __ cbnz(r0, L);
  542       __ stop("StubRoutines::forward exception: no pending exception (2)");
  543       __ bind(L);
  544     }
  545 #endif
  546 
  547     // continue at exception handler
  548     // r0: exception
  549     // r3: throwing pc
  550     // r19: exception handler
  551     __ verify_oop(r0);
  552     __ br(r19);
  553 
  554     return start;
  555   }
  556 
  557   // Non-destructive plausibility checks for oops
  558   //
  559   // Arguments:
  560   //    r0: oop to verify
  561   //    rscratch1: error message
  562   //
  563   // Stack after saving c_rarg3:
  564   //    [tos + 0]: saved c_rarg3
  565   //    [tos + 1]: saved c_rarg2
  566   //    [tos + 2]: saved lr
  567   //    [tos + 3]: saved rscratch2
  568   //    [tos + 4]: saved r0
  569   //    [tos + 5]: saved rscratch1
  570   address generate_verify_oop() {
  571     StubId stub_id = StubId::stubgen_verify_oop_id;
  572     StubCodeMark mark(this, stub_id);
  573     address start = __ pc();
  574 
  575     Label exit, error;
  576 
  577     // save c_rarg2 and c_rarg3
  578     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  579 
  580     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  581     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  582     __ ldr(c_rarg3, Address(c_rarg2));
  583     __ add(c_rarg3, c_rarg3, 1);
  584     __ str(c_rarg3, Address(c_rarg2));
  585 
  586     // object is in r0
  587     // make sure object is 'reasonable'
  588     __ cbz(r0, exit); // if obj is null it is OK
  589 
  590     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  591     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
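           // check_oop branches to error if the oop fails the plausibility checks,
           // otherwise control falls through to exit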
  592 
  593     // return if everything seems ok
  594     __ bind(exit);
  595 
  596     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  597     __ ret(lr);
  598 
  599     // handle errors
  600     __ bind(error);
  601     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  602 
  603     __ push(RegSet::range(r0, r29), sp);
  604     // debug(char* msg, int64_t pc, int64_t regs[])
  605     __ mov(c_rarg0, rscratch1);      // pass address of error message
  606     __ mov(c_rarg1, lr);             // pass return address
  607     __ mov(c_rarg2, sp);             // pass address of regs on stack
  608 #ifndef PRODUCT
  609     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  610 #endif
  611     BLOCK_COMMENT("call MacroAssembler::debug");
  612     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  613     __ blr(rscratch1);
  614     __ hlt(0);
  615 
  616     return start;
  617   }
  618 
  619   // Generate indices for iota vector.
  620   address generate_iota_indices(StubId stub_id) {
  621     __ align(CodeEntryAlignment);
  622     StubCodeMark mark(this, stub_id);
  623     address start = __ pc();
  624     // B
  625     __ emit_data64(0x0706050403020100, relocInfo::none);
  626     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  627     // H
  628     __ emit_data64(0x0003000200010000, relocInfo::none);
  629     __ emit_data64(0x0007000600050004, relocInfo::none);
  630     // S
  631     __ emit_data64(0x0000000100000000, relocInfo::none);
  632     __ emit_data64(0x0000000300000002, relocInfo::none);
  633     // D
  634     __ emit_data64(0x0000000000000000, relocInfo::none);
  635     __ emit_data64(0x0000000000000001, relocInfo::none);
  636     // S - FP
  637     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  638     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  639     // D - FP
  640     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  641     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  642     return start;
  643   }
  644 
  645   // The inner part of zero_words().  This is the bulk operation,
  646   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  647   // caller is responsible for zeroing the last few words.
  648   //
  649   // Inputs:
  650   // r10: the HeapWord-aligned base address of an array to zero.
  651   // r11: the count in HeapWords, r11 > 0.
  652   //
  653   // Returns r10 and r11, adjusted for the caller to clear.
  654   // r10: the base address of the tail of words left to clear.
  655   // r11: the number of words in the tail.
  656   //      r11 < MacroAssembler::zero_words_block_size.
  657 
  658   address generate_zero_blocks() {
  659     Label done;
  660     Label base_aligned;
  661 
  662     Register base = r10, cnt = r11;
  663 
  664     __ align(CodeEntryAlignment);
  665     StubId stub_id = StubId::stubgen_zero_blocks_id;
  666     StubCodeMark mark(this, stub_id);
  667     address start = __ pc();
  668 
  669     if (UseBlockZeroing) {
  670       int zva_length = VM_Version::zva_length();
  671 
  672       // Ensure ZVA length can be divided by 16. This is required by
  673       // the subsequent operations.
  674       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  675 
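             // base is HeapWord (8-byte) aligned on entry; if bit 3 is set it is not
             // 16-byte aligned, so zero one leading word to make it so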
  676       __ tbz(base, 3, base_aligned);
  677       __ str(zr, Address(__ post(base, 8)));
  678       __ sub(cnt, cnt, 1);
  679       __ bind(base_aligned);
  680 
  681       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  682       // alignment.
  683       Label small;
  684       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
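             // low_limit is in bytes while cnt is in words, hence the low_limit >> 3 below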
  685       __ subs(rscratch1, cnt, low_limit >> 3);
  686       __ br(Assembler::LT, small);
  687       __ zero_dcache_blocks(base, cnt);
  688       __ bind(small);
  689     }
  690 
  691     {
  692       // Number of stp instructions we'll unroll
  693       const int unroll =
  694         MacroAssembler::zero_words_block_size / 2;
  695       // Clear the remaining blocks.
  696       Label loop;
  697       __ subs(cnt, cnt, unroll * 2);
  698       __ br(Assembler::LT, done);
  699       __ bind(loop);
  700       for (int i = 0; i < unroll; i++)
  701         __ stp(zr, zr, __ post(base, 16));
  702       __ subs(cnt, cnt, unroll * 2);
  703       __ br(Assembler::GE, loop);
  704       __ bind(done);
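             // restore cnt to the number of words still left for the caller to zero
             // (guaranteed < zero_words_block_size)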
  705       __ add(cnt, cnt, unroll * 2);
  706     }
  707 
  708     __ ret(lr);
  709 
  710     return start;
  711   }
  712 
  713 
  714   typedef enum {
  715     copy_forwards = 1,
  716     copy_backwards = -1
  717   } copy_direction;
  718 
  719   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  720   // for arraycopy stubs.
  721   class ArrayCopyBarrierSetHelper : StackObj {
  722     BarrierSetAssembler* _bs_asm;
  723     MacroAssembler* _masm;
  724     DecoratorSet _decorators;
  725     BasicType _type;
  726     Register _gct1;
  727     Register _gct2;
  728     Register _gct3;
  729     FloatRegister _gcvt1;
  730     FloatRegister _gcvt2;
  731     FloatRegister _gcvt3;
  732 
  733   public:
  734     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  735                               DecoratorSet decorators,
  736                               BasicType type,
  737                               Register gct1,
  738                               Register gct2,
  739                               Register gct3,
  740                               FloatRegister gcvt1,
  741                               FloatRegister gcvt2,
  742                               FloatRegister gcvt3)
  743       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  744         _masm(masm),
  745         _decorators(decorators),
  746         _type(type),
  747         _gct1(gct1),
  748         _gct2(gct2),
  749         _gct3(gct3),
  750         _gcvt1(gcvt1),
  751         _gcvt2(gcvt2),
  752         _gcvt3(gcvt3) {
  753     }
  754 
  755     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  756       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  757                             dst1, dst2, src,
  758                             _gct1, _gct2, _gcvt1);
  759     }
  760 
  761     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  762       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  763                              dst, src1, src2,
  764                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  765     }
  766 
  767     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  768       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  769                             dst1, dst2, src,
  770                             _gct1);
  771     }
  772 
  773     void copy_store_at_16(Address dst, Register src1, Register src2) {
  774       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  775                              dst, src1, src2,
  776                              _gct1, _gct2, _gct3);
  777     }
  778 
  779     void copy_load_at_8(Register dst, Address src) {
  780       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  781                             dst, noreg, src,
  782                             _gct1);
  783     }
  784 
  785     void copy_store_at_8(Address dst, Register src) {
  786       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  787                              dst, src, noreg,
  788                              _gct1, _gct2, _gct3);
  789     }
  790   };
  791 
  792   // Bulk copy of blocks of 8 words.
  793   //
  794   // count is a count of words.
  795   //
  796   // Precondition: count >= 8
  797   //
  798   // Postconditions:
  799   //
  800   // The least significant bit of count contains the remaining count
  801   // of words to copy.  The rest of count is trash.
  802   //
  803   // s and d are adjusted to point to the remaining words to copy
  804   //
  805   address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
  806     BasicType type;
  807     copy_direction direction;
  808 
  809     switch (stub_id) {
  810     case StubId::stubgen_copy_byte_f_id:
  811       direction = copy_forwards;
  812       type = T_BYTE;
  813       break;
  814     case StubId::stubgen_copy_byte_b_id:
  815       direction = copy_backwards;
  816       type = T_BYTE;
  817       break;
  818     case StubId::stubgen_copy_oop_f_id:
  819       direction = copy_forwards;
  820       type = T_OBJECT;
  821       break;
  822     case StubId::stubgen_copy_oop_b_id:
  823       direction = copy_backwards;
  824       type = T_OBJECT;
  825       break;
  826     case StubId::stubgen_copy_oop_uninit_f_id:
  827       direction = copy_forwards;
  828       type = T_OBJECT;
  829       break;
  830     case StubId::stubgen_copy_oop_uninit_b_id:
  831       direction = copy_backwards;
  832       type = T_OBJECT;
  833       break;
  834     default:
  835       ShouldNotReachHere();
  836     }
  837 
  838     int unit = wordSize * direction;
  839     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
  840 
  841     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  842       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  843     const Register stride = r14;
  844     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  845     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  846     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  847 
  848     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  849     assert_different_registers(s, d, count, rscratch1, rscratch2);
  850 
  851     Label again, drain;
  852 
  853     __ align(CodeEntryAlignment);
  854 
  855     StubCodeMark mark(this, stub_id);
  856 
  857     address start = __ pc();
  858 
  859     Label unaligned_copy_long;
  860     if (AvoidUnalignedAccesses) {
  861       __ tbnz(d, 3, unaligned_copy_long);
  862     }
  863 
  864     if (direction == copy_forwards) {
  865       __ sub(s, s, bias);
  866       __ sub(d, d, bias);
  867     }
  868 
  869 #ifdef ASSERT
  870     // Make sure we are never given < 8 words
  871     {
  872       Label L;
  873       __ cmp(count, (u1)8);
  874       __ br(Assembler::GE, L);
   875       __ stop("generate_copy_longs called with < 8 words");
  876       __ bind(L);
  877     }
  878 #endif
  879 
  880     // Fill 8 registers
  881     if (UseSIMDForMemoryOps) {
  882       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  883       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  884     } else {
  885       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  886       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  887       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  888       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  889     }
  890 
  891     __ subs(count, count, 16);
  892     __ br(Assembler::LO, drain);
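           // fewer than 8 unread words remain beyond the 8 just loaded, so skip the
           // main loop and go straight to the drain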
  893 
  894     int prefetch = PrefetchCopyIntervalInBytes;
  895     bool use_stride = false;
  896     if (direction == copy_backwards) {
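             // a prefetch offset more negative than -256 cannot be encoded as a prfm
             // immediate, so keep it in a register instead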
  897       use_stride = prefetch > 256;
  898       prefetch = -prefetch;
  899       if (use_stride) __ mov(stride, prefetch);
  900     }
  901 
  902     __ bind(again);
  903 
  904     if (PrefetchCopyIntervalInBytes > 0)
  905       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  906 
  907     if (UseSIMDForMemoryOps) {
  908       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  909       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  910       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  911       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  912     } else {
  913       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  914       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  915       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  916       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  917       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  918       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  919       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  920       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  921     }
  922 
  923     __ subs(count, count, 8);
  924     __ br(Assembler::HS, again);
  925 
  926     // Drain
  927     __ bind(drain);
  928     if (UseSIMDForMemoryOps) {
  929       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  930       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  931     } else {
  932       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  933       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  934       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936     }
  937 
  938     {
  939       Label L1, L2;
  940       __ tbz(count, exact_log2(4), L1);
  941       if (UseSIMDForMemoryOps) {
  942         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  943         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  944       } else {
  945         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  946         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  947         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  948         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  949       }
  950       __ bind(L1);
  951 
  952       if (direction == copy_forwards) {
  953         __ add(s, s, bias);
  954         __ add(d, d, bias);
  955       }
  956 
  957       __ tbz(count, 1, L2);
  958       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  959       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  960       __ bind(L2);
  961     }
  962 
  963     __ ret(lr);
  964 
  965     if (AvoidUnalignedAccesses) {
  966       Label drain, again;
  967       // Register order for storing. Order is different for backward copy.
  968 
  969       __ bind(unaligned_copy_long);
  970 
   971       // source address is even (16-byte) aligned, target only odd (8-byte) aligned
  972       //
  973       // when forward copying word pairs we read long pairs at offsets
  974       // {0, 2, 4, 6} (in long words). when backwards copying we read
  975       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  976       // address by -2 in the forwards case so we can compute the
  977       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  978       // or -1.
  979       //
  980       // when forward copying we need to store 1 word, 3 pairs and
  981       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
   982       // zero offset, we adjust the destination by -1, which means we
  983       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
  984       //
   985       // When backwards copying we need to store 1 word, 3 pairs and
  986       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
  987       // offsets {1, 3, 5, 7, 8} * unit.
  988 
  989       if (direction == copy_forwards) {
  990         __ sub(s, s, 16);
  991         __ sub(d, d, 8);
  992       }
  993 
  994       // Fill 8 registers
  995       //
  996       // for forwards copy s was offset by -16 from the original input
  997       // value of s so the register contents are at these offsets
   998       // relative to the 64 byte block addressed by that original input
  999       // and so on for each successive 64 byte block when s is updated
 1000       //
 1001       // t0 at offset 0,  t1 at offset 8
 1002       // t2 at offset 16, t3 at offset 24
 1003       // t4 at offset 32, t5 at offset 40
 1004       // t6 at offset 48, t7 at offset 56
 1005 
 1006       // for backwards copy s was not offset so the register contents
 1007       // are at these offsets into the preceding 64 byte block
 1008       // relative to that original input and so on for each successive
 1009       // preceding 64 byte block when s is updated. this explains the
 1010       // slightly counter-intuitive looking pattern of register usage
 1011       // in the stp instructions for backwards copy.
 1012       //
 1013       // t0 at offset -16, t1 at offset -8
 1014       // t2 at offset -32, t3 at offset -24
 1015       // t4 at offset -48, t5 at offset -40
 1016       // t6 at offset -64, t7 at offset -56
 1017 
 1018       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1019       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1020       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1021       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1022 
 1023       __ subs(count, count, 16);
 1024       __ br(Assembler::LO, drain);
 1025 
 1026       int prefetch = PrefetchCopyIntervalInBytes;
 1027       bool use_stride = false;
 1028       if (direction == copy_backwards) {
 1029         use_stride = prefetch > 256;
 1030         prefetch = -prefetch;
 1031         if (use_stride) __ mov(stride, prefetch);
 1032       }
 1033 
 1034       __ bind(again);
 1035 
 1036       if (PrefetchCopyIntervalInBytes > 0)
 1037         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1038 
 1039       if (direction == copy_forwards) {
 1040         // allowing for the offset of -8 the store instructions place
  1041         // registers into the target 64 byte block at the following
 1042         // offsets
 1043         //
 1044         // t0 at offset 0
 1045         // t1 at offset 8,  t2 at offset 16
 1046         // t3 at offset 24, t4 at offset 32
 1047         // t5 at offset 40, t6 at offset 48
 1048         // t7 at offset 56
 1049 
 1050         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1051         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1052         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1053         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1054         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1055         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1056         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1057         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1058         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1059       } else {
 1060         // d was not offset when we started so the registers are
  1061         // written into the 64 byte block preceding d with the following
 1062         // offsets
 1063         //
 1064         // t1 at offset -8
 1065         // t3 at offset -24, t0 at offset -16
  1066         // t5 at offset -40, t2 at offset -32
 1067         // t7 at offset -56, t4 at offset -48
 1068         //                   t6 at offset -64
 1069         //
 1070         // note that this matches the offsets previously noted for the
 1071         // loads
 1072 
 1073         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1074         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1075         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1076         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1077         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1078         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1079         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1080         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1081         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1082       }
 1083 
 1084       __ subs(count, count, 8);
 1085       __ br(Assembler::HS, again);
 1086 
 1087       // Drain
 1088       //
 1089       // this uses the same pattern of offsets and register arguments
 1090       // as above
 1091       __ bind(drain);
 1092       if (direction == copy_forwards) {
 1093         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1094         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1095         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1096         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1097         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1098       } else {
 1099         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1100         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1101         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1102         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1103         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1104       }
 1105       // now we need to copy any remaining part block which may
  1106       // include a 4 word subblock and/or a 2 word subblock.
 1107       // bits 2 and 1 in the count are the tell-tale for whether we
 1108       // have each such subblock
 1109       {
 1110         Label L1, L2;
 1111         __ tbz(count, exact_log2(4), L1);
 1112         // this is the same as above but copying only 4 longs hence
 1113         // with only one intervening stp between the str instructions
 1114         // but note that the offsets and registers still follow the
 1115         // same pattern
 1116         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1117         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1118         if (direction == copy_forwards) {
 1119           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1120           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1121           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1122         } else {
 1123           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1124           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1125           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1126         }
 1127         __ bind(L1);
 1128 
 1129         __ tbz(count, 1, L2);
 1130         // this is the same as above but copying only 2 longs hence
 1131         // there is no intervening stp between the str instructions
 1132         // but note that the offset and register patterns are still
 1133         // the same
 1134         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1135         if (direction == copy_forwards) {
 1136           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1137           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1141         }
 1142         __ bind(L2);
 1143 
 1144         // for forwards copy we need to re-adjust the offsets we
  1145         // applied so that s and d follow the last words written
 1146 
 1147         if (direction == copy_forwards) {
 1148           __ add(s, s, 16);
 1149           __ add(d, d, 8);
 1150         }
 1151 
 1152       }
 1153 
 1154       __ ret(lr);
 1155     }
 1156 
 1157     return start;
 1158   }
 1159 
 1160   // Small copy: less than 16 bytes.
 1161   //
 1162   // NB: Ignores all of the bits of count which represent more than 15
 1163   // bytes, so a caller doesn't have to mask them.
 1164 
 1165   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1166     bool is_backwards = step < 0;
 1167     size_t granularity = g_uabs(step);
 1168     int direction = is_backwards ? -1 : 1;
 1169 
 1170     Label Lword, Lint, Lshort, Lbyte;
 1171 
 1172     assert(granularity
 1173            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1174 
 1175     const Register t0 = r3;
 1176     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1177     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1178 
 1179     // ??? I don't know if this bit-test-and-branch is the right thing
 1180     // to do.  It does a lot of jumping, resulting in several
 1181     // mispredicted branches.  It might make more sense to do this
 1182     // with something like Duff's device with a single computed branch.
 1183 
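           // count is in units of granularity bytes; bit (k - log2(granularity)) of
           // count says whether a 2^k-byte chunk remains to be copied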
 1184     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1185     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1186     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1187     __ bind(Lword);
 1188 
 1189     if (granularity <= sizeof (jint)) {
 1190       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1191       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1192       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1193       __ bind(Lint);
 1194     }
 1195 
 1196     if (granularity <= sizeof (jshort)) {
 1197       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1198       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1199       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1200       __ bind(Lshort);
 1201     }
 1202 
 1203     if (granularity <= sizeof (jbyte)) {
 1204       __ tbz(count, 0, Lbyte);
 1205       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1206       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1207       __ bind(Lbyte);
 1208     }
 1209   }
 1210 
 1211   // All-singing all-dancing memory copy.
 1212   //
 1213   // Copy count units of memory from s to d.  The size of a unit is
 1214   // step, which can be positive or negative depending on the direction
 1215   // of copy.  If is_aligned is false, we align the source address.
 1216   //
 1217 
 1218   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1219                    Register s, Register d, Register count, int step) {
 1220     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1221     bool is_backwards = step < 0;
 1222     unsigned int granularity = g_uabs(step);
 1223     const Register t0 = r3, t1 = r4;
 1224 
  1225     // <= 80 (or 96 for SIMD) bytes are copied inline. Direction doesn't matter because we always
 1226     // load all the data before writing anything
 1227     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1228     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1229     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1230     const Register send = r17, dend = r16;
 1231     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1232     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1233     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1234 
 1235     if (PrefetchCopyIntervalInBytes > 0)
 1236       __ prfm(Address(s, 0), PLDL1KEEP);
 1237     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1238     __ br(Assembler::HI, copy_big);
 1239 
 1240     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1241     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
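           // send/dend point just past the last source/destination element; the
           // overlapping accesses below use negative offsets from them to cover the tail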
 1242 
 1243     __ cmp(count, u1(16/granularity));
 1244     __ br(Assembler::LS, copy16);
 1245 
 1246     __ cmp(count, u1(64/granularity));
 1247     __ br(Assembler::HI, copy80);
 1248 
 1249     __ cmp(count, u1(32/granularity));
 1250     __ br(Assembler::LS, copy32);
 1251 
 1252     // 33..64 bytes
 1253     if (UseSIMDForMemoryOps) {
 1254       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1255       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1256       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1257       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1258     } else {
 1259       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1260       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1261       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1262       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1263 
 1264       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1265       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1266       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1267       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1268     }
 1269     __ b(finish);
 1270 
 1271     // 17..32 bytes
 1272     __ bind(copy32);
 1273     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1274     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1275 
 1276     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1277     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1278     __ b(finish);
 1279 
 1280     // 65..80/96 bytes
  1281     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1282     __ bind(copy80);
 1283     if (UseSIMDForMemoryOps) {
 1284       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1285       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1286       // Unaligned pointers can be an issue for copying.
  1287       // The issue is more likely to occur when the granularity of the data
  1288       // is less than 4 (sizeof(jint)) bytes. Pointers for arrays of jint are at least
 1289       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1290       // The most performance drop has been seen for the range 65-80 bytes.
 1291       // For such cases using the pair of ldp/stp instead of the third pair of
 1292       // ldpq/stpq fixes the performance issue.
 1293       if (granularity < sizeof (jint)) {
 1294         Label copy96;
 1295         __ cmp(count, u1(80/granularity));
 1296         __ br(Assembler::HI, copy96);
 1297         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1298 
 1299         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1300         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1301 
 1302         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1303         __ b(finish);
 1304 
 1305         __ bind(copy96);
 1306       }
 1307       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1308 
 1309       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1310       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1311 
 1312       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1313     } else {
 1314       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1315       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1316       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1317       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1318       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1319 
 1320       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1321       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1322       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1323       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1324       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1325     }
 1326     __ b(finish);
 1327 
 1328     // 0..16 bytes
 1329     __ bind(copy16);
 1330     __ cmp(count, u1(8/granularity));
 1331     __ br(Assembler::LO, copy8);
 1332 
 1333     // 8..16 bytes
 1334     bs.copy_load_at_8(t0, Address(s, 0));
 1335     bs.copy_load_at_8(t1, Address(send, -8));
 1336     bs.copy_store_at_8(Address(d, 0), t0);
 1337     bs.copy_store_at_8(Address(dend, -8), t1);
 1338     __ b(finish);
 1339 
 1340     if (granularity < 8) {
 1341       // 4..7 bytes
 1342       __ bind(copy8);
 1343       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1344       __ ldrw(t0, Address(s, 0));
 1345       __ ldrw(t1, Address(send, -4));
 1346       __ strw(t0, Address(d, 0));
 1347       __ strw(t1, Address(dend, -4));
 1348       __ b(finish);
 1349       if (granularity < 4) {
 1350         // 0..3 bytes
 1351         __ bind(copy4);
 1352         __ cbz(count, finish); // get rid of 0 case
 1353         if (granularity == 2) {
 1354           __ ldrh(t0, Address(s, 0));
 1355           __ strh(t0, Address(d, 0));
 1356         } else { // granularity == 1
 1357           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1358           // the first and last byte.
 1359           // Handle the 3 byte case by loading and storing base + count/2
 1360           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
  1361           // This does mean that in the 1 byte case we load/store the same
 1362           // byte 3 times.
 1363           __ lsr(count, count, 1);
 1364           __ ldrb(t0, Address(s, 0));
 1365           __ ldrb(t1, Address(send, -1));
 1366           __ ldrb(t2, Address(s, count));
 1367           __ strb(t0, Address(d, 0));
 1368           __ strb(t1, Address(dend, -1));
 1369           __ strb(t2, Address(d, count));
 1370         }
 1371         __ b(finish);
 1372       }
 1373     }
 1374 
 1375     __ bind(copy_big);
 1376     if (is_backwards) {
 1377       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1378       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1379     }
 1380 
  1381     // Now that we've got the small case out of the way we can align the
 1382     // source address on a 2-word boundary.
 1383 
 1384     // Here we will materialize a count in r15, which is used by copy_memory_small
 1385     // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
 1386     // Up until here, we have used t9, which aliases r15, but from here on, that register
 1387     // can not be used as a temp register, as it contains the count.
 1388 
 1389     Label aligned;
 1390 
 1391     if (is_aligned) {
 1392       // We may have to adjust by 1 word to get s 2-word-aligned.
 1393       __ tbz(s, exact_log2(wordSize), aligned);
 1394       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1395       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1396       __ sub(count, count, wordSize/granularity);
 1397     } else {
 1398       if (is_backwards) {
 1399         __ andr(r15, s, 2 * wordSize - 1);
 1400       } else {
 1401         __ neg(r15, s);
 1402         __ andr(r15, r15, 2 * wordSize - 1);
 1403       }
 1404       // r15 is the byte adjustment needed to align s.
 1405       __ cbz(r15, aligned);
 1406       int shift = exact_log2(granularity);
 1407       if (shift > 0) {
 1408         __ lsr(r15, r15, shift);
 1409       }
 1410       __ sub(count, count, r15);
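      // Illustrative example: a forward jshort copy whose source address ends
      // in 0x6 gives r15 = (-s) & 15 = 10 bytes = 5 elements; those 5 elements
      // are copied by copy_memory_small below, leaving s 16-byte aligned.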
 1411 
 1412 #if 0
 1413       // ?? This code is only correct for a disjoint copy.  It may or
 1414       // may not make sense to use it in that case.
 1415 
 1416       // Copy the first pair; s and d may not be aligned.
 1417       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1418       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1419 
 1420       // Align s and d, adjust count
 1421       if (is_backwards) {
 1422         __ sub(s, s, r15);
 1423         __ sub(d, d, r15);
 1424       } else {
 1425         __ add(s, s, r15);
 1426         __ add(d, d, r15);
 1427       }
 1428 #else
 1429       copy_memory_small(decorators, type, s, d, r15, step);
 1430 #endif
 1431     }
 1432 
 1433     __ bind(aligned);
 1434 
 1435     // s is now 2-word-aligned.
 1436 
    // We have a count of units and some trailing bytes. Adjust the
    // count and do a bulk copy of words. If the shift is zero, use a
    // plain register move instead to benefit from zero-latency moves.
 1440     int shift = exact_log2(wordSize/granularity);
 1441     if (shift > 0) {
 1442       __ lsr(r15, count, shift);
 1443     } else {
 1444       __ mov(r15, count);
 1445     }
 1446     if (direction == copy_forwards) {
 1447       if (type != T_OBJECT) {
 1448         __ bl(StubRoutines::aarch64::copy_byte_f());
 1449       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1450         __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
 1451       } else {
 1452         __ bl(StubRoutines::aarch64::copy_oop_f());
 1453       }
 1454     } else {
 1455       if (type != T_OBJECT) {
 1456         __ bl(StubRoutines::aarch64::copy_byte_b());
 1457       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1458         __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
 1459       } else {
 1460         __ bl(StubRoutines::aarch64::copy_oop_b());
 1461       }
 1462     }
 1463 
 1464     // And the tail.
 1465     copy_memory_small(decorators, type, s, d, count, step);
 1466 
 1467     if (granularity >= 8) __ bind(copy8);
 1468     if (granularity >= 4) __ bind(copy4);
 1469     __ bind(finish);
 1470   }
 1471 
 1472 
 1473   void clobber_registers() {
 1474 #ifdef ASSERT
 1475     RegSet clobbered
 1476       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1477     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1478     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1479     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1480       __ mov(*it, rscratch1);
 1481     }
 1482 #endif
 1483 
 1484   }
 1485 
  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count; clobbers temp, rscratch1 and rscratch2.
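  // Roughly equivalent to:
  //   for (i = 0; i < count; i++)
  //     verify_oop(a[i]);   // decoding the narrow oop first if necessary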
 1488   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1489     Label loop, end;
 1490     __ mov(rscratch1, a);
 1491     __ mov(rscratch2, zr);
 1492     __ bind(loop);
 1493     __ cmp(rscratch2, count);
 1494     __ br(Assembler::HS, end);
 1495     if (size == wordSize) {
 1496       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1497       __ verify_oop(temp);
 1498     } else {
 1499       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1500       __ decode_heap_oop(temp); // calls verify_oop
 1501     }
 1502     __ add(rscratch2, rscratch2, 1);
 1503     __ b(loop);
 1504     __ bind(end);
 1505   }
 1506 
 1507   // Arguments:
 1508   //   stub_id - is used to name the stub and identify all details of
 1509   //             how to perform the copy.
 1510   //
  //   nopush_entry - is assigned to the stub's post push entry point
  //                  unless it is null
 1513   //
 1514   // Inputs:
 1515   //   c_rarg0   - source array address
 1516   //   c_rarg1   - destination array address
 1517   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1518   //
 1519   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1520   // the hardware handle it.  The two dwords within qwords that span
 1521   // cache line boundaries will still be loaded and stored atomically.
 1522   //
 1523   // Side Effects: nopush_entry is set to the (post push) entry point
 1524   //               so it can be used by the corresponding conjoint
 1525   //               copy method
 1526   //
 1527   address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
 1528     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1529     RegSet saved_reg = RegSet::of(s, d, count);
 1530     int size;
 1531     bool aligned;
 1532     bool is_oop;
 1533     bool dest_uninitialized;
 1534     switch (stub_id) {
 1535     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1536       size = sizeof(jbyte);
 1537       aligned = false;
 1538       is_oop = false;
 1539       dest_uninitialized = false;
 1540       break;
 1541     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1542       size = sizeof(jbyte);
 1543       aligned = true;
 1544       is_oop = false;
 1545       dest_uninitialized = false;
 1546       break;
 1547     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1548       size = sizeof(jshort);
 1549       aligned = false;
 1550       is_oop = false;
 1551       dest_uninitialized = false;
 1552       break;
 1553     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1554       size = sizeof(jshort);
 1555       aligned = true;
 1556       is_oop = false;
 1557       dest_uninitialized = false;
 1558       break;
 1559     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1560       size = sizeof(jint);
 1561       aligned = false;
 1562       is_oop = false;
 1563       dest_uninitialized = false;
 1564       break;
 1565     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1566       size = sizeof(jint);
 1567       aligned = true;
 1568       is_oop = false;
 1569       dest_uninitialized = false;
 1570       break;
 1571     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1572       // since this is always aligned we can (should!) use the same
 1573       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1574       ShouldNotReachHere();
 1575       break;
 1576     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1577       size = sizeof(jlong);
 1578       aligned = true;
 1579       is_oop = false;
 1580       dest_uninitialized = false;
 1581       break;
 1582     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1583       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1584       aligned = !UseCompressedOops;
 1585       is_oop = true;
 1586       dest_uninitialized = false;
 1587       break;
 1588     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1589       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1590       aligned = !UseCompressedOops;
 1591       is_oop = true;
 1592       dest_uninitialized = false;
 1593       break;
 1594     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1595       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1596       aligned = !UseCompressedOops;
 1597       is_oop = true;
 1598       dest_uninitialized = true;
 1599       break;
 1600     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1601       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1602       aligned = !UseCompressedOops;
 1603       is_oop = true;
 1604       dest_uninitialized = true;
 1605       break;
 1606     default:
 1607       ShouldNotReachHere();
 1608       break;
 1609     }
 1610 
 1611     __ align(CodeEntryAlignment);
 1612     StubCodeMark mark(this, stub_id);
 1613     address start = __ pc();
 1614     __ enter();
 1615 
 1616     if (nopush_entry != nullptr) {
 1617       *nopush_entry = __ pc();
 1618       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1619       BLOCK_COMMENT("Entry:");
 1620     }
 1621 
 1622     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1623     if (dest_uninitialized) {
 1624       decorators |= IS_DEST_UNINITIALIZED;
 1625     }
 1626     if (aligned) {
 1627       decorators |= ARRAYCOPY_ALIGNED;
 1628     }
 1629 
 1630     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1631     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1632 
 1633     if (is_oop) {
 1634       // save regs before copy_memory
 1635       __ push(RegSet::of(d, count), sp);
 1636     }
 1637     {
 1638       // UnsafeMemoryAccess page error: continue after unsafe access
 1639       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1640       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1641       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1642     }
 1643 
 1644     if (is_oop) {
 1645       __ pop(RegSet::of(d, count), sp);
 1646       if (VerifyOops)
 1647         verify_oop_array(size, d, count, r16);
 1648     }
 1649 
 1650     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1651 
 1652     __ leave();
 1653     __ mov(r0, zr); // return 0
 1654     __ ret(lr);
 1655     return start;
 1656   }
 1657 
 1658   // Arguments:
 1659   //   stub_id - is used to name the stub and identify all details of
 1660   //             how to perform the copy.
 1661   //
  //   nooverlap_target - identifies the (post push) entry for the
 1663   //             corresponding disjoint copy routine which can be
 1664   //             jumped to if the ranges do not actually overlap
 1665   //
  //   nopush_entry - is assigned to the stub's post push entry point
  //                  unless it is null
  //
 1670   // Inputs:
 1671   //   c_rarg0   - source array address
 1672   //   c_rarg1   - destination array address
 1673   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1674   //
 1675   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1676   // the hardware handle it.  The two dwords within qwords that span
 1677   // cache line boundaries will still be loaded and stored atomically.
 1678   //
 1679   // Side Effects:
  //   nopush_entry is set to the (post push) entry point so it can be
  //   used by the generic and unsafe copy stubs
 1682   //
 1683   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
 1684     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1685     RegSet saved_regs = RegSet::of(s, d, count);
 1686     int size;
 1687     bool aligned;
 1688     bool is_oop;
 1689     bool dest_uninitialized;
 1690     switch (stub_id) {
 1691     case StubId::stubgen_jbyte_arraycopy_id:
 1692       size = sizeof(jbyte);
 1693       aligned = false;
 1694       is_oop = false;
 1695       dest_uninitialized = false;
 1696       break;
 1697     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 1698       size = sizeof(jbyte);
 1699       aligned = true;
 1700       is_oop = false;
 1701       dest_uninitialized = false;
 1702       break;
 1703     case StubId::stubgen_jshort_arraycopy_id:
 1704       size = sizeof(jshort);
 1705       aligned = false;
 1706       is_oop = false;
 1707       dest_uninitialized = false;
 1708       break;
 1709     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 1710       size = sizeof(jshort);
 1711       aligned = true;
 1712       is_oop = false;
 1713       dest_uninitialized = false;
 1714       break;
 1715     case StubId::stubgen_jint_arraycopy_id:
 1716       size = sizeof(jint);
 1717       aligned = false;
 1718       is_oop = false;
 1719       dest_uninitialized = false;
 1720       break;
 1721     case StubId::stubgen_arrayof_jint_arraycopy_id:
 1722       size = sizeof(jint);
 1723       aligned = true;
 1724       is_oop = false;
 1725       dest_uninitialized = false;
 1726       break;
 1727     case StubId::stubgen_jlong_arraycopy_id:
 1728       // since this is always aligned we can (should!) use the same
      // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
 1730       ShouldNotReachHere();
 1731       break;
 1732     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 1733       size = sizeof(jlong);
 1734       aligned = true;
 1735       is_oop = false;
 1736       dest_uninitialized = false;
 1737       break;
 1738     case StubId::stubgen_oop_arraycopy_id:
 1739       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1740       aligned = !UseCompressedOops;
 1741       is_oop = true;
 1742       dest_uninitialized = false;
 1743       break;
 1744     case StubId::stubgen_arrayof_oop_arraycopy_id:
 1745       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1746       aligned = !UseCompressedOops;
 1747       is_oop = true;
 1748       dest_uninitialized = false;
 1749       break;
 1750     case StubId::stubgen_oop_arraycopy_uninit_id:
 1751       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1752       aligned = !UseCompressedOops;
 1753       is_oop = true;
 1754       dest_uninitialized = true;
 1755       break;
 1756     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 1757       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1758       aligned = !UseCompressedOops;
 1759       is_oop = true;
 1760       dest_uninitialized = true;
 1761       break;
 1762     default:
 1763       ShouldNotReachHere();
 1764     }
 1765 
 1766     StubCodeMark mark(this, stub_id);
 1767     address start = __ pc();
 1768     __ enter();
 1769 
 1770     if (nopush_entry != nullptr) {
 1771       *nopush_entry = __ pc();
 1772       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1773       BLOCK_COMMENT("Entry:");
 1774     }
 1775 
 1776     // use fwd copy when (d-s) above_equal (count*size)
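    // For example, with s = 0x1000, d = 0x1010 and 0x20 bytes to move,
    // d - s = 0x10 < 0x20, so we fall through to the overlapping (backward)
    // copy; with d = 0x1040 the difference is 0x40 >= 0x20 and we tail-call
    // the disjoint stub. If d precedes s the unsigned subtraction wraps to a
    // large value, so the forward copy is (correctly) chosen as well.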
 1777     Label L_overlapping;
 1778     __ sub(rscratch1, d, s);
 1779     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1780     __ br(Assembler::LO, L_overlapping);
 1781     __ b(RuntimeAddress(nooverlap_target));
 1782     __ bind(L_overlapping);
 1783 
 1784     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1785     if (dest_uninitialized) {
 1786       decorators |= IS_DEST_UNINITIALIZED;
 1787     }
 1788     if (aligned) {
 1789       decorators |= ARRAYCOPY_ALIGNED;
 1790     }
 1791 
 1792     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1793     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1794 
 1795     if (is_oop) {
 1796       // save regs before copy_memory
 1797       __ push(RegSet::of(d, count), sp);
 1798     }
 1799     {
 1800       // UnsafeMemoryAccess page error: continue after unsafe access
 1801       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1802       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1803       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1804     }
 1805     if (is_oop) {
 1806       __ pop(RegSet::of(d, count), sp);
 1807       if (VerifyOops)
 1808         verify_oop_array(size, d, count, r16);
 1809     }
 1810     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1811     __ leave();
 1812     __ mov(r0, zr); // return 0
 1813     __ ret(lr);
 1814     return start;
 1815   }
 1816 
 1817   // Helper for generating a dynamic type check.
 1818   // Smashes rscratch1, rscratch2.
 1819   void generate_type_check(Register sub_klass,
 1820                            Register super_check_offset,
 1821                            Register super_klass,
 1822                            Register temp1,
 1823                            Register temp2,
 1824                            Register result,
 1825                            Label& L_success) {
 1826     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1827 
 1828     BLOCK_COMMENT("type_check:");
 1829 
 1830     Label L_miss;
 1831 
 1832     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1833                                      super_check_offset);
 1834     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1835 
 1836     // Fall through on failure!
 1837     __ BIND(L_miss);
 1838   }
 1839 
 1840   //
 1841   //  Generate checkcasting array copy stub
 1842   //
 1843   //  Input:
 1844   //    c_rarg0   - source array address
 1845   //    c_rarg1   - destination array address
 1846   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1847   //    c_rarg3   - size_t ckoff (super_check_offset)
 1848   //    c_rarg4   - oop ckval (super_klass)
 1849   //
 1850   //  Output:
 1851   //    r0 ==  0  -  success
 1852   //    r0 == -1^K - failure, where K is partial transfer count
 1853   //
 1854   address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
 1855     bool dest_uninitialized;
 1856     switch (stub_id) {
 1857     case StubId::stubgen_checkcast_arraycopy_id:
 1858       dest_uninitialized = false;
 1859       break;
 1860     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 1861       dest_uninitialized = true;
 1862       break;
 1863     default:
 1864       ShouldNotReachHere();
 1865     }
 1866 
 1867     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1868 
 1869     // Input registers (after setup_arg_regs)
 1870     const Register from        = c_rarg0;   // source array address
 1871     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
 1873     const Register ckoff       = c_rarg3;   // super_check_offset
 1874     const Register ckval       = c_rarg4;   // super_klass
 1875 
 1876     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1877 
 1878     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1879     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
 1881     const Register start_to    = r20;       // destination array start address
 1882     const Register r19_klass   = r19;       // oop._klass
 1883 
 1884     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1885     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1886 
 1887     //---------------------------------------------------------------
 1888     // Assembler stub will be used for this call to arraycopy
 1889     // if the two arrays are subtypes of Object[] but the
 1890     // destination array type is not equal to or a supertype
 1891     // of the source type.  Each element must be separately
 1892     // checked.
 1893 
 1894     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1895                                copied_oop, r19_klass, count_save);
 1896 
 1897     __ align(CodeEntryAlignment);
 1898     StubCodeMark mark(this, stub_id);
 1899     address start = __ pc();
 1900 
 1901     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1902 
 1903 #ifdef ASSERT
 1904     // caller guarantees that the arrays really are different
 1905     // otherwise, we would have to make conjoint checks
 1906     { Label L;
 1907       __ b(L);                  // conjoint check not yet implemented
 1908       __ stop("checkcast_copy within a single array");
 1909       __ bind(L);
 1910     }
 1911 #endif //ASSERT
 1912 
 1913     // Caller of this entry point must set up the argument registers.
 1914     if (nopush_entry != nullptr) {
 1915       *nopush_entry = __ pc();
 1916       BLOCK_COMMENT("Entry:");
 1917     }
 1918 
 1919      // Empty array:  Nothing to do.
 1920     __ cbz(count, L_done);
 1921     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1922 
 1923 #ifdef ASSERT
 1924     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1925     // The ckoff and ckval must be mutually consistent,
 1926     // even though caller generates both.
 1927     { Label L;
 1928       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1929       __ ldrw(start_to, Address(ckval, sco_offset));
 1930       __ cmpw(ckoff, start_to);
 1931       __ br(Assembler::EQ, L);
 1932       __ stop("super_check_offset inconsistent");
 1933       __ bind(L);
 1934     }
 1935 #endif //ASSERT
 1936 
 1937     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1938     bool is_oop = true;
 1939     int element_size = UseCompressedOops ? 4 : 8;
 1940     if (dest_uninitialized) {
 1941       decorators |= IS_DEST_UNINITIALIZED;
 1942     }
 1943 
 1944     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1945     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1946 
 1947     // save the original count
 1948     __ mov(count_save, count);
 1949 
 1950     // Copy from low to high addresses
 1951     __ mov(start_to, to);              // Save destination array start address
 1952     __ b(L_load_element);
 1953 
 1954     // ======== begin loop ========
 1955     // (Loop is rotated; its entry is L_load_element.)
 1956     // Loop control:
 1957     //   for (; count != 0; count--) {
 1958     //     copied_oop = load_heap_oop(from++);
 1959     //     ... generate_type_check ...;
 1960     //     store_heap_oop(to++, copied_oop);
 1961     //   }
 1962     __ align(OptoLoopAlignment);
 1963 
 1964     __ BIND(L_store_element);
 1965     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1966                       __ post(to, element_size), copied_oop, noreg,
 1967                       gct1, gct2, gct3);
 1968     __ sub(count, count, 1);
 1969     __ cbz(count, L_do_card_marks);
 1970 
 1971     // ======== loop entry is here ========
 1972     __ BIND(L_load_element);
 1973     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1974                      copied_oop, noreg, __ post(from, element_size),
 1975                      gct1);
 1976     __ cbz(copied_oop, L_store_element);
 1977 
 1978     __ load_klass(r19_klass, copied_oop);// query the object klass
 1979 
 1980     BLOCK_COMMENT("type_check:");
 1981     generate_type_check(/*sub_klass*/r19_klass,
 1982                         /*super_check_offset*/ckoff,
 1983                         /*super_klass*/ckval,
 1984                         /*r_array_base*/gct1,
 1985                         /*temp2*/gct2,
 1986                         /*result*/r10, L_store_element);
 1987 
 1988     // Fall through on failure!
 1989 
 1990     // ======== end loop ========
 1991 
 1992     // It was a real error; we must depend on the caller to finish the job.
 1993     // Register count = remaining oops, count_orig = total oops.
 1994     // Emit GC store barriers for the oops we have copied and report
 1995     // their number to the caller.
 1996 
 1997     __ subs(count, count_save, count);     // K = partially copied oop count
 1998     __ eon(count, count, zr);              // report (-1^K) to caller
 1999     __ br(Assembler::EQ, L_done_pop);
 2000 
 2001     __ BIND(L_do_card_marks);
 2002     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
 2003 
 2004     __ bind(L_done_pop);
 2005     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2006     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2007 
 2008     __ bind(L_done);
 2009     __ mov(r0, count);
 2010     __ leave();
 2011     __ ret(lr);
 2012 
 2013     return start;
 2014   }
 2015 
 2016   // Perform range checks on the proposed arraycopy.
 2017   // Kills temp, but nothing else.
 2018   // Also, clean the sign bits of src_pos and dst_pos.
 2019   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2020                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
 2022                               Register dst_pos, // destination position (c_rarg3)
 2023                               Register length,
 2024                               Register temp,
 2025                               Label& L_failed) {
 2026     BLOCK_COMMENT("arraycopy_range_checks:");
 2027 
 2028     assert_different_registers(rscratch1, temp);
 2029 
 2030     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2031     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2032     __ addw(temp, length, src_pos);
 2033     __ cmpw(temp, rscratch1);
 2034     __ br(Assembler::HI, L_failed);
 2035 
 2036     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2037     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2038     __ addw(temp, length, dst_pos);
 2039     __ cmpw(temp, rscratch1);
 2040     __ br(Assembler::HI, L_failed);
 2041 
 2042     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2043     __ movw(src_pos, src_pos);
 2044     __ movw(dst_pos, dst_pos);
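    // (movw writes the 32-bit view of each register, which zeroes bits 63:32.)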
 2045 
 2046     BLOCK_COMMENT("arraycopy_range_checks done");
 2047   }
 2048 
 2049   // These stubs get called from some dumb test routine.
 2050   // I'll write them properly when they're called from
 2051   // something that's actually doing something.
 2052   static void fake_arraycopy_stub(address src, address dst, int count) {
 2053     assert(count == 0, "huh?");
 2054   }
 2055 
 2056 
 2057   //
 2058   //  Generate 'unsafe' array copy stub
 2059   //  Though just as safe as the other stubs, it takes an unscaled
 2060   //  size_t argument instead of an element count.
 2061   //
 2062   //  Input:
 2063   //    c_rarg0   - source array address
 2064   //    c_rarg1   - destination array address
 2065   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2066   //
 2067   // Examines the alignment of the operands and dispatches
 2068   // to a long, int, short, or byte copy loop.
 2069   //
 2070   address generate_unsafe_copy(address byte_copy_entry,
 2071                                address short_copy_entry,
 2072                                address int_copy_entry,
 2073                                address long_copy_entry) {
 2074     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2075 
 2076     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2077     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2078 
 2079     __ align(CodeEntryAlignment);
 2080     StubCodeMark mark(this, stub_id);
 2081     address start = __ pc();
 2082     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2083 
 2084     // bump this on entry, not on exit:
 2085     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2086 
 2087     __ orr(rscratch1, s, d);
 2088     __ orr(rscratch1, rscratch1, count);
 2089 
 2090     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2091     __ cbz(rscratch1, L_long_aligned);
 2092     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2093     __ cbz(rscratch1, L_int_aligned);
 2094     __ tbz(rscratch1, 0, L_short_aligned);
 2095     __ b(RuntimeAddress(byte_copy_entry));
 2096 
 2097     __ BIND(L_short_aligned);
 2098     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2099     __ b(RuntimeAddress(short_copy_entry));
 2100     __ BIND(L_int_aligned);
 2101     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2102     __ b(RuntimeAddress(int_copy_entry));
 2103     __ BIND(L_long_aligned);
 2104     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2105     __ b(RuntimeAddress(long_copy_entry));
 2106 
 2107     return start;
 2108   }
 2109 
 2110   //
 2111   //  Generate generic array copy stubs
 2112   //
 2113   //  Input:
 2114   //    c_rarg0    -  src oop
 2115   //    c_rarg1    -  src_pos (32-bits)
 2116   //    c_rarg2    -  dst oop
 2117   //    c_rarg3    -  dst_pos (32-bits)
 2118   //    c_rarg4    -  element count (32-bits)
 2119   //
 2120   //  Output:
 2121   //    r0 ==  0  -  success
 2122   //    r0 == -1^K - failure, where K is partial transfer count
 2123   //
 2124   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2125                                 address int_copy_entry, address oop_copy_entry,
 2126                                 address long_copy_entry, address checkcast_copy_entry) {
 2127     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2128 
 2129     Label L_failed, L_objArray;
 2130     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2131 
 2132     // Input registers
 2133     const Register src        = c_rarg0;  // source array oop
 2134     const Register src_pos    = c_rarg1;  // source position
 2135     const Register dst        = c_rarg2;  // destination array oop
 2136     const Register dst_pos    = c_rarg3;  // destination position
 2137     const Register length     = c_rarg4;
 2138 
 2139 
 2140     // Registers used as temps
 2141     const Register dst_klass  = c_rarg5;
 2142 
 2143     __ align(CodeEntryAlignment);
 2144 
 2145     StubCodeMark mark(this, stub_id);
 2146 
 2147     address start = __ pc();
 2148 
 2149     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2150 
 2151     // bump this on entry, not on exit:
 2152     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2153 
 2154     //-----------------------------------------------------------------------
 2155     // Assembler stub will be used for this call to arraycopy
 2156     // if the following conditions are met:
 2157     //
 2158     // (1) src and dst must not be null.
 2159     // (2) src_pos must not be negative.
 2160     // (3) dst_pos must not be negative.
 2161     // (4) length  must not be negative.
 2162     // (5) src klass and dst klass should be the same and not null.
 2163     // (6) src and dst should be arrays.
 2164     // (7) src_pos + length must not exceed length of src.
 2165     // (8) dst_pos + length must not exceed length of dst.
 2166     //
 2167 
 2168     //  if (src == nullptr) return -1;
 2169     __ cbz(src, L_failed);
 2170 
 2171     //  if (src_pos < 0) return -1;
 2172     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2173 
 2174     //  if (dst == nullptr) return -1;
 2175     __ cbz(dst, L_failed);
 2176 
 2177     //  if (dst_pos < 0) return -1;
 2178     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2179 
 2180     // registers used as temp
 2181     const Register scratch_length    = r16; // elements count to copy
 2182     const Register scratch_src_klass = r17; // array klass
 2183     const Register lh                = r15; // layout helper
 2184 
 2185     //  if (length < 0) return -1;
 2186     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2187     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2188 
 2189     __ load_klass(scratch_src_klass, src);
 2190 #ifdef ASSERT
 2191     //  assert(src->klass() != nullptr);
 2192     {
 2193       BLOCK_COMMENT("assert klasses not null {");
 2194       Label L1, L2;
 2195       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2196       __ bind(L1);
 2197       __ stop("broken null klass");
 2198       __ bind(L2);
 2199       __ load_klass(rscratch1, dst);
 2200       __ cbz(rscratch1, L1);     // this would be broken also
 2201       BLOCK_COMMENT("} assert klasses not null done");
 2202     }
 2203 #endif
 2204 
 2205     // Load layout helper (32-bits)
 2206     //
 2207     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2208     // 32        30    24            16              8     2                 0
 2209     //
 2210     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2211     //
 2212 
 2213     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2214 
 2215     // Handle objArrays completely differently...
 2216     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2217     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2218     __ movw(rscratch1, objArray_lh);
 2219     __ eorw(rscratch2, lh, rscratch1);
 2220     __ cbzw(rscratch2, L_objArray);
 2221 
 2222     //  if (src->klass() != dst->klass()) return -1;
 2223     __ load_klass(rscratch2, dst);
 2224     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2225     __ cbnz(rscratch2, L_failed);
 2226 
 2227     //  if (!src->is_Array()) return -1;
 2228     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2229 
 2230     // At this point, it is known to be a typeArray (array_tag 0x3).
 2231 #ifdef ASSERT
 2232     {
 2233       BLOCK_COMMENT("assert primitive array {");
 2234       Label L;
 2235       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2236       __ cmpw(lh, rscratch2);
 2237       __ br(Assembler::GE, L);
 2238       __ stop("must be a primitive array");
 2239       __ bind(L);
 2240       BLOCK_COMMENT("} assert primitive array done");
 2241     }
 2242 #endif
 2243 
 2244     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2245                            rscratch2, L_failed);
 2246 
 2247     // TypeArrayKlass
 2248     //
 2249     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2250     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2251     //
 2252 
 2253     const Register rscratch1_offset = rscratch1;    // array offset
 2254     const Register r15_elsize = lh; // element size
 2255 
 2256     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2257            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2258     __ add(src, src, rscratch1_offset);           // src array offset
 2259     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2260     BLOCK_COMMENT("choose copy loop based on element size");
 2261 
 2262     // next registers should be set before the jump to corresponding stub
 2263     const Register from     = c_rarg0;  // source array address
 2264     const Register to       = c_rarg1;  // destination array address
 2265     const Register count    = c_rarg2;  // elements count
 2266 
    // 'from', 'to', 'count' registers should be set in this order
    // since they alias 'src', 'src_pos' and 'dst' respectively.
 2269 
 2270     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2271 
 2272     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2273     // size in bytes).  We do a simple bitwise binary search.
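    // The two low bits of the layout helper are the low bits of
    // log2(element size): 00 -> byte, 01 -> short, 10 -> int, 11 -> long;
    // bit 1 is tested first, then bit 0.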
 2274   __ BIND(L_copy_bytes);
 2275     __ tbnz(r15_elsize, 1, L_copy_ints);
 2276     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2277     __ lea(from, Address(src, src_pos));// src_addr
 2278     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2279     __ movw(count, scratch_length); // length
 2280     __ b(RuntimeAddress(byte_copy_entry));
 2281 
 2282   __ BIND(L_copy_shorts);
 2283     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2284     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2285     __ movw(count, scratch_length); // length
 2286     __ b(RuntimeAddress(short_copy_entry));
 2287 
 2288   __ BIND(L_copy_ints);
 2289     __ tbnz(r15_elsize, 0, L_copy_longs);
 2290     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2291     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2292     __ movw(count, scratch_length); // length
 2293     __ b(RuntimeAddress(int_copy_entry));
 2294 
 2295   __ BIND(L_copy_longs);
 2296 #ifdef ASSERT
 2297     {
 2298       BLOCK_COMMENT("assert long copy {");
 2299       Label L;
 2300       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2301       __ cmpw(r15_elsize, LogBytesPerLong);
 2302       __ br(Assembler::EQ, L);
 2303       __ stop("must be long copy, but elsize is wrong");
 2304       __ bind(L);
 2305       BLOCK_COMMENT("} assert long copy done");
 2306     }
 2307 #endif
 2308     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2309     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2310     __ movw(count, scratch_length); // length
 2311     __ b(RuntimeAddress(long_copy_entry));
 2312 
 2313     // ObjArrayKlass
 2314   __ BIND(L_objArray);
 2315     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2316 
 2317     Label L_plain_copy, L_checkcast_copy;
 2318     //  test array classes for subtyping
 2319     __ load_klass(r15, dst);
 2320     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2321     __ br(Assembler::NE, L_checkcast_copy);
 2322 
 2323     // Identically typed arrays can be copied without element-wise checks.
 2324     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2325                            rscratch2, L_failed);
 2326 
 2327     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2328     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2329     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2330     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2331     __ movw(count, scratch_length); // length
 2332   __ BIND(L_plain_copy);
 2333     __ b(RuntimeAddress(oop_copy_entry));
 2334 
 2335   __ BIND(L_checkcast_copy);
 2336     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2337     {
 2338       // Before looking at dst.length, make sure dst is also an objArray.
 2339       __ ldrw(rscratch1, Address(r15, lh_offset));
 2340       __ movw(rscratch2, objArray_lh);
 2341       __ eorw(rscratch1, rscratch1, rscratch2);
 2342       __ cbnzw(rscratch1, L_failed);
 2343 
 2344       // It is safe to examine both src.length and dst.length.
 2345       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2346                              r15, L_failed);
 2347 
 2348       __ load_klass(dst_klass, dst); // reload
 2349 
 2350       // Marshal the base address arguments now, freeing registers.
 2351       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2352       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2353       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2354       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2355       __ movw(count, length);           // length (reloaded)
 2356       Register sco_temp = c_rarg3;      // this register is free now
 2357       assert_different_registers(from, to, count, sco_temp,
 2358                                  dst_klass, scratch_src_klass);
 2359       // assert_clean_int(count, sco_temp);
 2360 
 2361       // Generate the type check.
 2362       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2363       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2364 
 2365       // Smashes rscratch1, rscratch2
 2366       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2367                           L_plain_copy);
 2368 
 2369       // Fetch destination element klass from the ObjArrayKlass header.
 2370       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2371       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2372       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2373 
 2374       // the checkcast_copy loop needs two extra arguments:
 2375       assert(c_rarg3 == sco_temp, "#3 already in place");
 2376       // Set up arguments for checkcast_copy_entry.
 2377       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2378       __ b(RuntimeAddress(checkcast_copy_entry));
 2379     }
 2380 
 2381   __ BIND(L_failed);
 2382     __ mov(r0, -1);
 2383     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2384     __ ret(lr);
 2385 
 2386     return start;
 2387   }
 2388 
 2389   //
 2390   // Generate stub for array fill. If "aligned" is true, the
 2391   // "to" address is assumed to be heapword aligned.
 2392   //
 2393   // Arguments for generated stub:
 2394   //   to:    c_rarg0
 2395   //   value: c_rarg1
 2396   //   count: c_rarg2 treated as signed
 2397   //
 2398   address generate_fill(StubId stub_id) {
 2399     BasicType t;
 2400     bool aligned;
 2401 
 2402     switch (stub_id) {
 2403     case StubId::stubgen_jbyte_fill_id:
 2404       t = T_BYTE;
 2405       aligned = false;
 2406       break;
 2407     case StubId::stubgen_jshort_fill_id:
 2408       t = T_SHORT;
 2409       aligned = false;
 2410       break;
 2411     case StubId::stubgen_jint_fill_id:
 2412       t = T_INT;
 2413       aligned = false;
 2414       break;
 2415     case StubId::stubgen_arrayof_jbyte_fill_id:
 2416       t = T_BYTE;
 2417       aligned = true;
 2418       break;
 2419     case StubId::stubgen_arrayof_jshort_fill_id:
 2420       t = T_SHORT;
 2421       aligned = true;
 2422       break;
 2423     case StubId::stubgen_arrayof_jint_fill_id:
 2424       t = T_INT;
 2425       aligned = true;
 2426       break;
 2427     default:
 2428       ShouldNotReachHere();
 2429     };
 2430 
 2431     __ align(CodeEntryAlignment);
 2432     StubCodeMark mark(this, stub_id);
 2433     address start = __ pc();
 2434 
 2435     BLOCK_COMMENT("Entry:");
 2436 
 2437     const Register to        = c_rarg0;  // source array address
 2438     const Register value     = c_rarg1;  // value
 2439     const Register count     = c_rarg2;  // elements count
 2440 
 2441     const Register bz_base = r10;        // base for block_zero routine
 2442     const Register cnt_words = r11;      // temp register
 2443 
 2444     __ enter();
 2445 
 2446     Label L_fill_elements, L_exit1;
 2447 
 2448     int shift = -1;
 2449     switch (t) {
 2450       case T_BYTE:
 2451         shift = 0;
 2452         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2453         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2454         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2455         __ br(Assembler::LO, L_fill_elements);
 2456         break;
 2457       case T_SHORT:
 2458         shift = 1;
 2459         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2460         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2461         __ br(Assembler::LO, L_fill_elements);
 2462         break;
 2463       case T_INT:
 2464         shift = 2;
 2465         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2466         __ br(Assembler::LO, L_fill_elements);
 2467         break;
 2468       default: ShouldNotReachHere();
 2469     }
 2470 
 2471     // Align source address at 8 bytes address boundary.
 2472     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2473     if (!aligned) {
 2474       switch (t) {
 2475         case T_BYTE:
 2476           // One byte misalignment happens only for byte arrays.
 2477           __ tbz(to, 0, L_skip_align1);
 2478           __ strb(value, Address(__ post(to, 1)));
 2479           __ subw(count, count, 1);
 2480           __ bind(L_skip_align1);
 2481           // Fallthrough
 2482         case T_SHORT:
          // Two-byte misalignment happens only for byte and short (char) arrays.
 2484           __ tbz(to, 1, L_skip_align2);
 2485           __ strh(value, Address(__ post(to, 2)));
 2486           __ subw(count, count, 2 >> shift);
 2487           __ bind(L_skip_align2);
 2488           // Fallthrough
 2489         case T_INT:
          // Align to 8 bytes; we know we are 4-byte aligned to start.
 2491           __ tbz(to, 2, L_skip_align4);
 2492           __ strw(value, Address(__ post(to, 4)));
 2493           __ subw(count, count, 4 >> shift);
 2494           __ bind(L_skip_align4);
 2495           break;
 2496         default: ShouldNotReachHere();
 2497       }
 2498     }
 2499 
 2500     //
 2501     //  Fill large chunks
 2502     //
 2503     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2504     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2505     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2506     if (UseBlockZeroing) {
 2507       Label non_block_zeroing, rest;
 2508       // If the fill value is zero we can use the fast zero_words().
 2509       __ cbnz(value, non_block_zeroing);
 2510       __ mov(bz_base, to);
 2511       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2512       address tpc = __ zero_words(bz_base, cnt_words);
 2513       if (tpc == nullptr) {
 2514         fatal("CodeCache is full at generate_fill");
 2515       }
 2516       __ b(rest);
 2517       __ bind(non_block_zeroing);
 2518       __ fill_words(to, cnt_words, value);
 2519       __ bind(rest);
 2520     } else {
 2521       __ fill_words(to, cnt_words, value);
 2522     }
 2523 
 2524     // Remaining count is less than 8 bytes. Fill it by a single store.
 2525     // Note that the total length is no less than 8 bytes.
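    // For example, if 5 tail bytes remain in a byte fill, the single 8-byte
    // store below rewrites the last 8 bytes of the range, harmlessly
    // re-covering 3 bytes that the word fill already produced.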
 2526     if (t == T_BYTE || t == T_SHORT) {
 2527       Label L_exit1;
 2528       __ cbzw(count, L_exit1);
 2529       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2530       __ str(value, Address(to, -8));    // overwrite some elements
 2531       __ bind(L_exit1);
 2532       __ leave();
 2533       __ ret(lr);
 2534     }
 2535 
    // Handle fills of less than 8 bytes.
 2537     Label L_fill_2, L_fill_4, L_exit2;
 2538     __ bind(L_fill_elements);
 2539     switch (t) {
 2540       case T_BYTE:
 2541         __ tbz(count, 0, L_fill_2);
 2542         __ strb(value, Address(__ post(to, 1)));
 2543         __ bind(L_fill_2);
 2544         __ tbz(count, 1, L_fill_4);
 2545         __ strh(value, Address(__ post(to, 2)));
 2546         __ bind(L_fill_4);
 2547         __ tbz(count, 2, L_exit2);
 2548         __ strw(value, Address(to));
 2549         break;
 2550       case T_SHORT:
 2551         __ tbz(count, 0, L_fill_4);
 2552         __ strh(value, Address(__ post(to, 2)));
 2553         __ bind(L_fill_4);
 2554         __ tbz(count, 1, L_exit2);
 2555         __ strw(value, Address(to));
 2556         break;
 2557       case T_INT:
 2558         __ cbzw(count, L_exit2);
 2559         __ strw(value, Address(to));
 2560         break;
 2561       default: ShouldNotReachHere();
 2562     }
 2563     __ bind(L_exit2);
 2564     __ leave();
 2565     __ ret(lr);
 2566     return start;
 2567   }
 2568 
 2569   address generate_unsafecopy_common_error_exit() {
    address start_pc = __ pc();
    __ leave();
    __ mov(r0, 0);
    __ ret(lr);
    return start_pc;
 2575   }
 2576 
 2577   //
 2578   //  Generate 'unsafe' set memory stub
 2579   //  Though just as safe as the other stubs, it takes an unscaled
 2580   //  size_t (# bytes) argument instead of an element count.
 2581   //
 2582   //  This fill operation is atomicity preserving: as long as the
 2583   //  address supplied is sufficiently aligned, all writes of up to 64
 2584   //  bits in size are single-copy atomic.
 2585   //
 2586   //  Input:
 2587   //    c_rarg0   - destination array address
 2588   //    c_rarg1   - byte count (size_t)
 2589   //    c_rarg2   - byte value
 2590   //
 2591   address generate_unsafe_setmemory() {
 2592     __ align(CodeEntryAlignment);
 2593     StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
 2594     address start = __ pc();
 2595 
 2596     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 2597     Label tail;
 2598 
 2599     UnsafeMemoryAccessMark umam(this, true, false);
 2600 
 2601     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2602 
 2603     __ dup(v0, __ T16B, value);
 2604 
 2605     if (AvoidUnalignedAccesses) {
 2606       __ cmp(count, (u1)16);
 2607       __ br(__ LO, tail);
 2608 
 2609       __ mov(rscratch1, 16);
 2610       __ andr(rscratch2, dest, 15);
 2611       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 2612       __ strq(v0, Address(dest));
 2613       __ sub(count, count, rscratch1);
 2614       __ add(dest, dest, rscratch1);
 2615     }
 2616 
 2617     __ subs(count, count, (u1)64);
 2618     __ br(__ LO, tail);
 2619     {
 2620       Label again;
 2621       __ bind(again);
 2622       __ stpq(v0, v0, Address(dest));
 2623       __ stpq(v0, v0, Address(dest, 32));
 2624 
 2625       __ subs(count, count, 64);
 2626       __ add(dest, dest, 64);
 2627       __ br(__ HS, again);
 2628     }
 2629 
 2630     __ bind(tail);
 2631     // The count of bytes is off by 64, but we don't need to correct
 2632     // it because we're only going to use the least-significant few
 2633     // count bits from here on.
 2634     // __ add(count, count, 64);
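    // For example, if 36 bytes are genuinely left to fill when we reach the
    // tail, count holds 36 - 64 = -28 here, and -28 & 63 == 36, so the bit
    // tests below store exactly 32 + 4 bytes.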
 2635 
 2636     {
 2637       Label dont;
 2638       __ tbz(count, exact_log2(32), dont);
 2639       __ stpq(v0, v0, __ post(dest, 32));
 2640       __ bind(dont);
 2641     }
 2642     {
 2643       Label dont;
 2644       __ tbz(count, exact_log2(16), dont);
 2645       __ strq(v0, __ post(dest, 16));
 2646       __ bind(dont);
 2647     }
 2648     {
 2649       Label dont;
 2650       __ tbz(count, exact_log2(8), dont);
 2651       __ strd(v0, __ post(dest, 8));
 2652       __ bind(dont);
 2653     }
 2654 
 2655     Label finished;
 2656     __ tst(count, 7);
 2657     __ br(__ EQ, finished);
 2658 
 2659     {
 2660       Label dont;
 2661       __ tbz(count, exact_log2(4), dont);
 2662       __ strs(v0, __ post(dest, 4));
 2663       __ bind(dont);
 2664     }
 2665     {
 2666       Label dont;
 2667       __ tbz(count, exact_log2(2), dont);
 2668       __ bfi(value, value, 8, 8);
 2669       __ strh(value, __ post(dest, 2));
 2670       __ bind(dont);
 2671     }
 2672     {
 2673       Label dont;
 2674       __ tbz(count, exact_log2(1), dont);
 2675       __ strb(value, Address(dest));
 2676       __ bind(dont);
 2677     }
 2678 
 2679     __ bind(finished);
 2680     __ leave();
 2681     __ ret(lr);
 2682 
 2683     return start;
 2684   }
 2685 
 2686   address generate_data_cache_writeback() {
 2687     const Register line        = c_rarg0;  // address of line to write back
 2688 
 2689     __ align(CodeEntryAlignment);
 2690 
 2691     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 2692     StubCodeMark mark(this, stub_id);
 2693 
 2694     address start = __ pc();
 2695     __ enter();
 2696     __ cache_wb(Address(line, 0));
 2697     __ leave();
 2698     __ ret(lr);
 2699 
 2700     return start;
 2701   }
 2702 
 2703   address generate_data_cache_writeback_sync() {
 2704     const Register is_pre     = c_rarg0;  // pre or post sync
 2705 
 2706     __ align(CodeEntryAlignment);
 2707 
 2708     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 2709     StubCodeMark mark(this, stub_id);
 2710 
    // pre wbsync is a no-op
    // post wbsync is implemented as a memory barrier (the AArch64 analogue
    // of an x86 sfence)
 2713 
 2714     Label skip;
 2715     address start = __ pc();
 2716     __ enter();
 2717     __ cbnz(is_pre, skip);
 2718     __ cache_wbsync(false);
 2719     __ bind(skip);
 2720     __ leave();
 2721     __ ret(lr);
 2722 
 2723     return start;
 2724   }
 2725 
 2726   void generate_arraycopy_stubs() {
 2727     // Some copy stubs publish a normal entry and then a 2nd 'fallback'
 2728     // entry immediately following their stack push. This can be used
 2729     // as a post-push branch target for compatible stubs when they
 2730     // identify a special case that can be handled by the fallback
    // stub, e.g. a disjoint copy stub may be used as a special-case
 2732     // fallback for its compatible conjoint copy stub.
 2733     //
    // A nopush entry is always returned in the following local and
 2735     // then published by assigning to the appropriate entry field in
 2736     // class StubRoutines. The entry value is then passed to the
 2737     // generator for the compatible stub. That means the entry must be
 2738     // listed when saving to/restoring from the AOT cache, ensuring
 2739     // that the inter-stub jumps are noted at AOT-cache save and
 2740     // relocated at AOT cache load.
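    //
    // For example, the jbyte conjoint stub below is generated with
    // _jbyte_disjoint_arraycopy_nopush as its nooverlap_target, so when the
    // ranges turn out not to overlap it branches to the disjoint stub's
    // post-push entry, reusing the frame it has already pushed.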
 2741     address nopush_entry;
 2742 
 2743     // generate the common exit first so later stubs can rely on it if
 2744     // they want an UnsafeMemoryAccess exit non-local to the stub
 2745     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 2746     // register the stub as the default exit with class UnsafeMemoryAccess
 2747     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 2748 
    // generate and publish aarch64-specific bulk copy routines first
 2750     // so we can call them from other copy stubs
 2751     StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2752     StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2753 
 2754     StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2755     StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2756 
 2757     StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2758     StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2759 
 2760     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2761 
 2762     //*** jbyte
 2763     // Always need aligned and unaligned versions
 2764     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2765     // disjoint nopush entry is needed by conjoint copy
 2766     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2767     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
 2768     // conjoint nopush entry is needed by generic/unsafe copy
 2769     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
 2770     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2771     // disjoint arrayof nopush entry is needed by conjoint copy
 2772     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2773     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
 2774 
 2775     //*** jshort
 2776     // Always need aligned and unaligned versions
 2777     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
 2778     // disjoint nopush entry is needed by conjoint copy
 2779     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
 2780     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
 2781     // conjoint nopush entry is used by generic/unsafe copy
 2782     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
 2783     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
 2784     // disjoint arrayof nopush entry is needed by conjoint copy
 2785     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
 2786     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
 2787 
 2788     //*** jint
 2789     // Aligned versions
 2790     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
 2791     // disjoint arrayof nopush entry is needed by conjoint copy
 2792     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
 2793     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
 2794     // On 64-bit we need both aligned and unaligned versions of jint arraycopy.
 2795     // jint_arraycopy_nopush always points to the unaligned version
 2796     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
 2797     // disjoint nopush entry is needed by conjoint copy
 2798     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
 2799     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
 2800     // conjoint nopush entry is needed by generic/unsafe copy
 2801     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
 2802 
 2803     //*** jlong
 2804     // It is always aligned
 2805     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
 2806     // disjoint arrayof nopush entry is needed by conjoint copy
 2807     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
 2808     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
 2809     // conjoint nopush entry is needed by generic/unsafe copy
 2810     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
 2811     // disjoint normal/nopush and conjoint normal entries are not
 2812     // generated since the arrayof versions are the same
 2813     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2814     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
 2815     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2816 
 2817     //*** oops
 2818     {
 2819       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2820         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
 2821       // disjoint arrayof nopush entry is needed by conjoint copy
 2822       StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
 2823       StubRoutines::_arrayof_oop_arraycopy
 2824         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
 2825       // conjoint arrayof nopush entry is needed by generic/unsafe copy
 2826       StubRoutines::_oop_arraycopy_nopush = nopush_entry;
 2827       // Aligned versions without pre-barriers
 2828       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2829         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
 2830       // disjoint arrayof+uninit nopush entry is needed by conjoint copy
 2831       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
 2832       // note that we don't need a returned nopush entry because the
 2833       // generic/unsafe copy does not cater for uninit arrays.
 2834       StubRoutines::_arrayof_oop_arraycopy_uninit
 2835         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
 2836     }
 2837 
 2838     // for oop copies reuse arrayof entries for non-arrayof cases
 2839     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2840     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
 2841     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2842     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2843     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
 2844     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2845 
 2846     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
 2847     // checkcast nopush entry is needed by generic copy
 2848     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
 2849     // note that we don't need a returned nopush entry because the
 2850     // generic copy does not cater for uninit arrays.
 2851     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 2852 
 2853     // unsafe arraycopy may fall back on conjoint stubs
 2854     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2855                                                               StubRoutines::_jshort_arraycopy_nopush,
 2856                                                               StubRoutines::_jint_arraycopy_nopush,
 2857                                                               StubRoutines::_jlong_arraycopy_nopush);
 2858 
 2859     // generic arraycopy may fall back on conjoint stubs
 2860     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2861                                                                StubRoutines::_jshort_arraycopy_nopush,
 2862                                                                StubRoutines::_jint_arraycopy_nopush,
 2863                                                                StubRoutines::_oop_arraycopy_nopush,
 2864                                                                StubRoutines::_jlong_arraycopy_nopush,
 2865                                                                StubRoutines::_checkcast_arraycopy_nopush);
 2866 
 2867     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 2868     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 2869     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 2870     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 2871     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 2872     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 2873   }
 2874 
 2875   void generate_math_stubs() { Unimplemented(); }
 2876 
 2877   // Arguments:
 2878   //
 2879   // Inputs:
 2880   //   c_rarg0   - source byte array address
 2881   //   c_rarg1   - destination byte array address
 2882   //   c_rarg2   - sessionKe (key) in little endian int array
 2883   //
 2884   address generate_aescrypt_encryptBlock() {
 2885     __ align(CodeEntryAlignment);
 2886     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 2887     StubCodeMark mark(this, stub_id);
 2888 
 2889     const Register from        = c_rarg0;  // source array address
 2890     const Register to          = c_rarg1;  // destination array address
 2891     const Register key         = c_rarg2;  // key array address
 2892     const Register keylen      = rscratch1;
 2893 
 2894     address start = __ pc();
 2895     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2896 
 2897     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2898 
 2899     __ aesenc_loadkeys(key, keylen);
 2900     __ aesecb_encrypt(from, to, keylen);
 2901 
 2902     __ mov(r0, 0);
 2903 
 2904     __ leave();
 2905     __ ret(lr);
 2906 
 2907     return start;
 2908   }
 2909 
 2910   // Arguments:
 2911   //
 2912   // Inputs:
 2913   //   c_rarg0   - source byte array address
 2914   //   c_rarg1   - destination byte array address
 2915   //   c_rarg2   - sessionKd (key) in little endian int array
 2916   //
 2917   address generate_aescrypt_decryptBlock() {
 2918     assert(UseAES, "need AES cryptographic extension support");
 2919     __ align(CodeEntryAlignment);
 2920     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 2921     StubCodeMark mark(this, stub_id);
 2922     Label L_doLast;
 2923 
 2924     const Register from        = c_rarg0;  // source array address
 2925     const Register to          = c_rarg1;  // destination array address
 2926     const Register key         = c_rarg2;  // key array address
 2927     const Register keylen      = rscratch1;
 2928 
 2929     address start = __ pc();
 2930     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2931 
 2932     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2933 
 2934     __ aesecb_decrypt(from, to, key, keylen);
 2935 
 2936     __ mov(r0, 0);
 2937 
 2938     __ leave();
 2939     __ ret(lr);
 2940 
 2941     return start;
 2942   }
 2943 
 2944   // Arguments:
 2945   //
 2946   // Inputs:
 2947   //   c_rarg0   - source byte array address
 2948   //   c_rarg1   - destination byte array address
 2949   //   c_rarg2   - sessionKe (key) in little endian int array
 2950   //   c_rarg3   - r vector byte array address
 2951   //   c_rarg4   - input length
 2952   //
 2953   // Output:
 2954   //   r0        - input length
 2955   //
 2956   address generate_cipherBlockChaining_encryptAESCrypt() {
 2957     assert(UseAES, "need AES cryptographic extension support");
 2958     __ align(CodeEntryAlignment);
 2959     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 2960     StubCodeMark mark(this, stub_id);
 2961 
 2962     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2963 
 2964     const Register from        = c_rarg0;  // source array address
 2965     const Register to          = c_rarg1;  // destination array address
 2966     const Register key         = c_rarg2;  // key array address
 2967     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 2968                                            // and left with the results of the last encryption block
 2969     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2970     const Register keylen      = rscratch1;
 2971 
 2972     address start = __ pc();
 2973 
 2974       __ enter();
 2975 
 2976       __ movw(rscratch2, len_reg);
 2977 
 2978       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2979 
 2980       __ ld1(v0, __ T16B, rvec);
 2981 
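            // The expanded key schedule holds 4 * (rounds + 1) ints: 44 for
            // AES-128 (10 rounds), 52 for AES-192 (12 rounds) and 60 for
            // AES-256 (14 rounds), so CC (keylen < 52) selects the 128-bit
            // schedule, EQ the 192-bit one, and AES-256 falls through to
            // load the full set of round keys.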
 2982       __ cmpw(keylen, 52);
 2983       __ br(Assembler::CC, L_loadkeys_44);
 2984       __ br(Assembler::EQ, L_loadkeys_52);
 2985 
 2986       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2987       __ rev32(v17, __ T16B, v17);
 2988       __ rev32(v18, __ T16B, v18);
 2989     __ BIND(L_loadkeys_52);
 2990       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2991       __ rev32(v19, __ T16B, v19);
 2992       __ rev32(v20, __ T16B, v20);
 2993     __ BIND(L_loadkeys_44);
 2994       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2995       __ rev32(v21, __ T16B, v21);
 2996       __ rev32(v22, __ T16B, v22);
 2997       __ rev32(v23, __ T16B, v23);
 2998       __ rev32(v24, __ T16B, v24);
 2999       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3000       __ rev32(v25, __ T16B, v25);
 3001       __ rev32(v26, __ T16B, v26);
 3002       __ rev32(v27, __ T16B, v27);
 3003       __ rev32(v28, __ T16B, v28);
 3004       __ ld1(v29, v30, v31, __ T16B, key);
 3005       __ rev32(v29, __ T16B, v29);
 3006       __ rev32(v30, __ T16B, v30);
 3007       __ rev32(v31, __ T16B, v31);
 3008 
 3009     __ BIND(L_aes_loop);
 3010       __ ld1(v1, __ T16B, __ post(from, 16));
 3011       __ eor(v0, __ T16B, v0, v1);
 3012 
 3013       __ br(Assembler::CC, L_rounds_44);
 3014       __ br(Assembler::EQ, L_rounds_52);
 3015 
 3016       __ aese(v0, v17); __ aesmc(v0, v0);
 3017       __ aese(v0, v18); __ aesmc(v0, v0);
 3018     __ BIND(L_rounds_52);
 3019       __ aese(v0, v19); __ aesmc(v0, v0);
 3020       __ aese(v0, v20); __ aesmc(v0, v0);
 3021     __ BIND(L_rounds_44);
 3022       __ aese(v0, v21); __ aesmc(v0, v0);
 3023       __ aese(v0, v22); __ aesmc(v0, v0);
 3024       __ aese(v0, v23); __ aesmc(v0, v0);
 3025       __ aese(v0, v24); __ aesmc(v0, v0);
 3026       __ aese(v0, v25); __ aesmc(v0, v0);
 3027       __ aese(v0, v26); __ aesmc(v0, v0);
 3028       __ aese(v0, v27); __ aesmc(v0, v0);
 3029       __ aese(v0, v28); __ aesmc(v0, v0);
 3030       __ aese(v0, v29); __ aesmc(v0, v0);
 3031       __ aese(v0, v30);
 3032       __ eor(v0, __ T16B, v0, v31);
 3033 
 3034       __ st1(v0, __ T16B, __ post(to, 16));
 3035 
 3036       __ subw(len_reg, len_reg, 16);
 3037       __ cbnzw(len_reg, L_aes_loop);
 3038 
 3039       __ st1(v0, __ T16B, rvec);
 3040 
 3041       __ mov(r0, rscratch2);
 3042 
 3043       __ leave();
 3044       __ ret(lr);
 3045 
 3046       return start;
 3047   }
 3048 
 3049   // Arguments:
 3050   //
 3051   // Inputs:
 3052   //   c_rarg0   - source byte array address
 3053   //   c_rarg1   - destination byte array address
 3054   //   c_rarg2   - sessionKd (key) in little endian int array
 3055   //   c_rarg3   - r vector byte array address
 3056   //   c_rarg4   - input length
 3057   //
 3058   // Output:
 3059   //   r0        - input length
 3060   //
 3061   address generate_cipherBlockChaining_decryptAESCrypt() {
 3062     assert(UseAES, "need AES cryptographic extension support");
 3063     __ align(CodeEntryAlignment);
 3064     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3065     StubCodeMark mark(this, stub_id);
 3066 
 3067     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3068 
 3069     const Register from        = c_rarg0;  // source array address
 3070     const Register to          = c_rarg1;  // destination array address
 3071     const Register key         = c_rarg2;  // key array address
 3072     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 3073                                            // and left with the results of the last encryption block
 3074     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3075     const Register keylen      = rscratch1;
 3076 
 3077     address start = __ pc();
 3078 
 3079       __ enter();
 3080 
 3081       __ movw(rscratch2, len_reg);
 3082 
 3083       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3084 
 3085       __ ld1(v2, __ T16B, rvec);
 3086 
 3087       __ ld1(v31, __ T16B, __ post(key, 16));
 3088       __ rev32(v31, __ T16B, v31);
 3089 
 3090       __ cmpw(keylen, 52);
 3091       __ br(Assembler::CC, L_loadkeys_44);
 3092       __ br(Assembler::EQ, L_loadkeys_52);
 3093 
 3094       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3095       __ rev32(v17, __ T16B, v17);
 3096       __ rev32(v18, __ T16B, v18);
 3097     __ BIND(L_loadkeys_52);
 3098       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3099       __ rev32(v19, __ T16B, v19);
 3100       __ rev32(v20, __ T16B, v20);
 3101     __ BIND(L_loadkeys_44);
 3102       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3103       __ rev32(v21, __ T16B, v21);
 3104       __ rev32(v22, __ T16B, v22);
 3105       __ rev32(v23, __ T16B, v23);
 3106       __ rev32(v24, __ T16B, v24);
 3107       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3108       __ rev32(v25, __ T16B, v25);
 3109       __ rev32(v26, __ T16B, v26);
 3110       __ rev32(v27, __ T16B, v27);
 3111       __ rev32(v28, __ T16B, v28);
 3112       __ ld1(v29, v30, __ T16B, key);
 3113       __ rev32(v29, __ T16B, v29);
 3114       __ rev32(v30, __ T16B, v30);
 3115 
 3116     __ BIND(L_aes_loop);
 3117       __ ld1(v0, __ T16B, __ post(from, 16));
 3118       __ orr(v1, __ T16B, v0, v0);
 3119 
 3120       __ br(Assembler::CC, L_rounds_44);
 3121       __ br(Assembler::EQ, L_rounds_52);
 3122 
 3123       __ aesd(v0, v17); __ aesimc(v0, v0);
 3124       __ aesd(v0, v18); __ aesimc(v0, v0);
 3125     __ BIND(L_rounds_52);
 3126       __ aesd(v0, v19); __ aesimc(v0, v0);
 3127       __ aesd(v0, v20); __ aesimc(v0, v0);
 3128     __ BIND(L_rounds_44);
 3129       __ aesd(v0, v21); __ aesimc(v0, v0);
 3130       __ aesd(v0, v22); __ aesimc(v0, v0);
 3131       __ aesd(v0, v23); __ aesimc(v0, v0);
 3132       __ aesd(v0, v24); __ aesimc(v0, v0);
 3133       __ aesd(v0, v25); __ aesimc(v0, v0);
 3134       __ aesd(v0, v26); __ aesimc(v0, v0);
 3135       __ aesd(v0, v27); __ aesimc(v0, v0);
 3136       __ aesd(v0, v28); __ aesimc(v0, v0);
 3137       __ aesd(v0, v29); __ aesimc(v0, v0);
 3138       __ aesd(v0, v30);
 3139       __ eor(v0, __ T16B, v0, v31);
 3140       __ eor(v0, __ T16B, v0, v2);
 3141 
 3142       __ st1(v0, __ T16B, __ post(to, 16));
 3143       __ orr(v2, __ T16B, v1, v1);
 3144 
 3145       __ subw(len_reg, len_reg, 16);
 3146       __ cbnzw(len_reg, L_aes_loop);
 3147 
 3148       __ st1(v2, __ T16B, rvec);
 3149 
 3150       __ mov(r0, rscratch2);
 3151 
 3152       __ leave();
 3153       __ ret(lr);
 3154 
 3155     return start;
 3156   }
 3157 
 3158   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3159   // Inputs: in (128 bits) is preserved.
 3160   // The least-significant 64-bit word is in the upper dword of each vector.
 3161   // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
 3162   // Output: result
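        // Worked example of the carry propagation (values shown as
        // { MSD, LSD }, with the LSD physically in the upper dword):
        // adding inc = { 0, 1 } to in = { 0x0123, 0xffffffffffffffff }
        // wraps the LSD to 0; cm(HI) then leaves all-ones in that lane,
        // ext swaps it into the MSD position, and subtracting -1 bumps
        // the MSD, giving { 0x0124, 0 }.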
 3163   void be_add_128_64(FloatRegister result, FloatRegister in,
 3164                      FloatRegister inc, FloatRegister tmp) {
 3165     assert_different_registers(result, tmp, inc);
 3166 
 3167     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3168                                            // input
 3169     __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
 3170     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3171                                            // MSD == 0 (must be!) to LSD
 3172     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3173   }
 3174 
 3175   // CTR AES crypt.
 3176   // Arguments:
 3177   //
 3178   // Inputs:
 3179   //   c_rarg0   - source byte array address
 3180   //   c_rarg1   - destination byte array address
 3181   //   c_rarg2   - sessionKe (key) in little endian int array
 3182   //   c_rarg3   - counter vector byte array address
 3183   //   c_rarg4   - input length
 3184   //   c_rarg5   - saved encryptedCounter start
 3185   //   c_rarg6   - saved used length
 3186   //
 3187   // Output:
 3188   //   r0       - input length
 3189   //
 3190   address generate_counterMode_AESCrypt() {
 3191     const Register in = c_rarg0;
 3192     const Register out = c_rarg1;
 3193     const Register key = c_rarg2;
 3194     const Register counter = c_rarg3;
 3195     const Register saved_len = c_rarg4, len = r10;
 3196     const Register saved_encrypted_ctr = c_rarg5;
 3197     const Register used_ptr = c_rarg6, used = r12;
 3198 
 3199     const Register offset = r7;
 3200     const Register keylen = r11;
 3201 
 3202     const unsigned char block_size = 16;
 3203     const int bulk_width = 4;
 3204     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3205     // performance with larger data sizes, but it also means that the
 3206     // fast path isn't used until you have at least 8 blocks, and up
 3207     // to 127 bytes of data will be executed on the slow path. For
 3208     // to 127 bytes of data will be processed on the slow path. For
 3209     // blocks seems like a sensible compromise.
 3210 
 3211     // Algorithm:
 3212     //
 3213     //    if (len == 0) {
 3214     //        goto DONE;
 3215     //    }
 3216     //    int result = len;
 3217     //    do {
 3218     //        if (used >= blockSize) {
 3219     //            if (len >= bulk_width * blockSize) {
 3220     //                CTR_large_block();
 3221     //                if (len == 0)
 3222     //                    goto DONE;
 3223     //            }
 3224     //            for (;;) {
 3225     //                16ByteVector v0 = counter;
 3226     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3227     //                used = 0;
 3228     //                if (len < blockSize)
 3229     //                    break;    /* goto NEXT */
 3230     //                16ByteVector v1 = load16Bytes(in, offset);
 3231     //                v1 = v1 ^ encryptedCounter;
 3232     //                store16Bytes(v1, out, offset);
 3233     //                used = blockSize;
 3234     //                offset += blockSize;
 3235     //                len -= blockSize;
 3236     //                if (len == 0)
 3237     //                    goto DONE;
 3238     //            }
 3239     //        }
 3240     //      NEXT:
 3241     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3242     //        len--;
 3243     //    } while (len != 0);
 3244     //  DONE:
 3245     //    return result;
 3246     //
 3247     // CTR_large_block()
 3248     //    Wide bulk encryption of whole blocks.
 3249 
 3250     __ align(CodeEntryAlignment);
 3251     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3252     StubCodeMark mark(this, stub_id);
 3253     const address start = __ pc();
 3254     __ enter();
 3255 
 3256     Label DONE, CTR_large_block, large_block_return;
 3257     __ ldrw(used, Address(used_ptr));
 3258     __ cbzw(saved_len, DONE);
 3259 
 3260     __ mov(len, saved_len);
 3261     __ mov(offset, 0);
 3262 
 3263     // Compute #rounds for AES based on the length of the key array
 3264     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3265 
 3266     __ aesenc_loadkeys(key, keylen);
 3267 
 3268     {
 3269       Label L_CTR_loop, NEXT;
 3270 
 3271       __ bind(L_CTR_loop);
 3272 
 3273       __ cmp(used, block_size);
 3274       __ br(__ LO, NEXT);
 3275 
 3276       // Maybe we have a lot of data
 3277       __ subsw(rscratch1, len, bulk_width * block_size);
 3278       __ br(__ HS, CTR_large_block);
 3279       __ BIND(large_block_return);
 3280       __ cbzw(len, DONE);
 3281 
 3282       // Setup the counter
 3283       __ movi(v4, __ T4S, 0);
 3284       __ movi(v5, __ T4S, 1);
 3285       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3286 
 3287       // 128-bit big-endian increment
 3288       __ ld1(v0, __ T16B, counter);
 3289       __ rev64(v16, __ T16B, v0);
 3290       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3291       __ rev64(v16, __ T16B, v16);
 3292       __ st1(v16, __ T16B, counter);
 3293       // Previous counter value is in v0
 3294       // v4 contains { 0, 1 }
 3295 
 3296       {
 3297         // We have fewer than bulk_width blocks of data left. Encrypt
 3298         // them one by one until there is less than a full block
 3299         // remaining, being careful to save both the encrypted counter
 3300         // and the counter.
 3301 
 3302         Label inner_loop;
 3303         __ bind(inner_loop);
 3304         // Counter to encrypt is in v0
 3305         __ aesecb_encrypt(noreg, noreg, keylen);
 3306         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3307 
 3308         // Do we have a remaining full block?
 3309 
 3310         __ mov(used, 0);
 3311         __ cmp(len, block_size);
 3312         __ br(__ LO, NEXT);
 3313 
 3314         // Yes, we have a full block
 3315         __ ldrq(v1, Address(in, offset));
 3316         __ eor(v1, __ T16B, v1, v0);
 3317         __ strq(v1, Address(out, offset));
 3318         __ mov(used, block_size);
 3319         __ add(offset, offset, block_size);
 3320 
 3321         __ subw(len, len, block_size);
 3322         __ cbzw(len, DONE);
 3323 
 3324         // Increment the counter, store it back
 3325         __ orr(v0, __ T16B, v16, v16);
 3326         __ rev64(v16, __ T16B, v16);
 3327         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3328         __ rev64(v16, __ T16B, v16);
 3329         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3330 
 3331         __ b(inner_loop);
 3332       }
 3333 
 3334       __ BIND(NEXT);
 3335 
 3336       // Encrypt a single byte, and loop.
 3337       // We expect this to be a rare event.
 3338       __ ldrb(rscratch1, Address(in, offset));
 3339       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3340       __ eor(rscratch1, rscratch1, rscratch2);
 3341       __ strb(rscratch1, Address(out, offset));
 3342       __ add(offset, offset, 1);
 3343       __ add(used, used, 1);
 3344       __ subw(len, len, 1);
 3345       __ cbnzw(len, L_CTR_loop);
 3346     }
 3347 
 3348     __ bind(DONE);
 3349     __ strw(used, Address(used_ptr));
 3350     __ mov(r0, saved_len);
 3351 
 3352     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3353     __ ret(lr);
 3354 
 3355     // Bulk encryption
 3356 
 3357     __ BIND(CTR_large_block);
 3358     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3359 
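          // The low 64 bits of v8..v15 are callee-saved under the AAPCS64 and
          // the bulk loop below clobbers them, so they are spilled here
          // (v12..v15 only when bulk_width == 8) and restored before rejoining
          // the slow path.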
 3360     if (bulk_width == 8) {
 3361       __ sub(sp, sp, 4 * 16);
 3362       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3363     }
 3364     __ sub(sp, sp, 4 * 16);
 3365     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3366     RegSet saved_regs = (RegSet::of(in, out, offset)
 3367                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3368     __ push(saved_regs, sp);
 3369     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3370     __ add(in, in, offset);
 3371     __ add(out, out, offset);
 3372 
 3373     // Keys should already be loaded into the correct registers
 3374 
 3375     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3376     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3377 
 3378     // AES/CTR loop
 3379     {
 3380       Label L_CTR_loop;
 3381       __ BIND(L_CTR_loop);
 3382 
 3383       // Setup the counters
 3384       __ movi(v8, __ T4S, 0);
 3385       __ movi(v9, __ T4S, 1);
 3386       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3387 
 3388       for (int i = 0; i < bulk_width; i++) {
 3389         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3390         __ rev64(v0_ofs, __ T16B, v16);
 3391         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3392       }
 3393 
 3394       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3395 
 3396       // Encrypt the counters
 3397       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3398 
 3399       if (bulk_width == 8) {
 3400         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3401       }
 3402 
 3403       // XOR the encrypted counters with the inputs
 3404       for (int i = 0; i < bulk_width; i++) {
 3405         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3406         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3407         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3408       }
 3409 
 3410       // Write the encrypted data
 3411       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3412       if (bulk_width == 8) {
 3413         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3414       }
 3415 
 3416       __ subw(len, len, 16 * bulk_width);
 3417       __ cbnzw(len, L_CTR_loop);
 3418     }
 3419 
 3420     // Save the counter back where it goes
 3421     __ rev64(v16, __ T16B, v16);
 3422     __ st1(v16, __ T16B, counter);
 3423 
 3424     __ pop(saved_regs, sp);
 3425 
 3426     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3427     if (bulk_width == 8) {
 3428       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3429     }
 3430 
 3431     __ andr(rscratch1, len, -16 * bulk_width);
 3432     __ sub(len, len, rscratch1);
 3433     __ add(offset, offset, rscratch1);
 3434     __ mov(used, 16);
 3435     __ strw(used, Address(used_ptr));
 3436     __ b(large_block_return);
 3437 
 3438     return start;
 3439   }
 3440 
 3441   // Vector AES Galois Counter Mode implementation. Parameters:
 3442   //
 3443   // in = c_rarg0
 3444   // len = c_rarg1
 3445   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3446   // out = c_rarg3
 3447   // key = c_rarg4
 3448   // state = c_rarg5 - GHASH.state
 3449   // subkeyHtbl = c_rarg6 - powers of H
 3450   // counter = c_rarg7 - 16 bytes of CTR
 3451   // return - number of processed bytes
 3452   address generate_galoisCounterMode_AESCrypt() {
 3453     Label ghash_polynomial; // local data generated after code
 3454 
 3455     __ align(CodeEntryAlignment);
 3456     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3457     StubCodeMark mark(this, stub_id);
 3458     address start = __ pc();
 3459     __ enter();
 3460 
 3461     const Register in = c_rarg0;
 3462     const Register len = c_rarg1;
 3463     const Register ct = c_rarg2;
 3464     const Register out = c_rarg3;
 3465     // and updated with the incremented counter in the end
 3466 
 3467     const Register key = c_rarg4;
 3468     const Register state = c_rarg5;
 3469 
 3470     const Register subkeyHtbl = c_rarg6;
 3471 
 3472     const Register counter = c_rarg7;
 3473 
 3474     const Register keylen = r10;
 3475     // Save state before entering routine
 3476     __ sub(sp, sp, 4 * 16);
 3477     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3478     __ sub(sp, sp, 4 * 16);
 3479     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3480 
 3481     // __ andr(len, len, -512);
 3482     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3483     __ str(len, __ pre(sp, -2 * wordSize));
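          // Only whole groups of 8 blocks (128 bytes) are handled here; the
          // truncated length is saved on the stack and returned in r0 as the
          // number of bytes actually processed, leaving any tail to the caller.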
 3484 
 3485     Label DONE;
 3486     __ cbz(len, DONE);
 3487 
 3488     // Compute #rounds for AES based on the length of the key array
 3489     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3490 
 3491     __ aesenc_loadkeys(key, keylen);
 3492     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3493     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3494 
 3495     // AES/CTR loop
 3496     {
 3497       Label L_CTR_loop;
 3498       __ BIND(L_CTR_loop);
 3499 
 3500       // Setup the counters
 3501       __ movi(v8, __ T4S, 0);
 3502       __ movi(v9, __ T4S, 1);
 3503       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3504 
 3505       assert(v0->encoding() < v8->encoding(), "counter registers v0..v7 must precede v8");
 3506       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3507         FloatRegister f = as_FloatRegister(i);
 3508         __ rev32(f, __ T16B, v16);
 3509         __ addv(v16, __ T4S, v16, v8);
 3510       }
 3511 
 3512       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3513 
 3514       // Encrypt the counters
 3515       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3516 
 3517       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3518 
 3519       // XOR the encrypted counters with the inputs
 3520       for (int i = 0; i < 8; i++) {
 3521         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3522         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3523         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3524       }
 3525       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3526       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3527 
 3528       __ subw(len, len, 16 * 8);
 3529       __ cbnzw(len, L_CTR_loop);
 3530     }
 3531 
 3532     __ rev32(v16, __ T16B, v16);
 3533     __ st1(v16, __ T16B, counter);
 3534 
 3535     __ ldr(len, Address(sp));
 3536     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3537 
 3538     // GHASH/CTR loop
 3539     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3540                                 len, /*unrolls*/4);
 3541 
 3542 #ifdef ASSERT
 3543     { Label L;
 3544       __ cmp(len, (unsigned char)0);
 3545       __ br(Assembler::EQ, L);
 3546       __ stop("stubGenerator: abort");
 3547       __ bind(L);
 3548     }
 3549 #endif
 3550 
 3551     __ bind(DONE);
 3552     // Return the number of bytes processed
 3553     __ ldr(r0, __ post(sp, 2 * wordSize));
 3554 
 3555     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3556     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3557 
 3558     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3559     __ ret(lr);
 3560 
 3561     // bind label and generate polynomial data
 3562     __ align(wordSize * 2);
 3563     __ bind(ghash_polynomial);
 3564     __ emit_int64(0x87);  // The low-order bits of the GHASH field
 3565                           // polynomial z^128 + z^7 + z^2 + z + 1
 3566                           // (i.e. 0x87), repeated in the low and high
 3567                           // halves of a 128-bit vector
 3568     __ emit_int64(0x87);
 3569 
 3570     return start;
 3571   }
 3572 
 3573   class Cached64Bytes {
 3574   private:
 3575     MacroAssembler *_masm;
 3576     Register _regs[8];
 3577 
 3578   public:
 3579     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 3580       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3581       auto it = rs.begin();
 3582       for (auto &r: _regs) {
 3583         r = *it;
 3584         ++it;
 3585       }
 3586     }
 3587 
 3588     void gen_loads(Register base) {
 3589       for (int i = 0; i < 8; i += 2) {
 3590         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3591       }
 3592     }
 3593 
 3594     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
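          // For example, extract_u32(dest, 5) yields bits 32..63 of _regs[2],
          // i.e. the sixth little-endian 32-bit word of the cached block.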
 3595     void extract_u32(Register dest, int i) {
 3596       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3597     }
 3598   };
 3599 
 3600   // Utility routines for md5.
 3601   // Clobbers r10 and r11.
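        // Each helper computes one MD5 round of the form
        //   r1 = r2 + rotl32(r1 + f(r2, r3, r4) + X[k] + t, s)
        // with the pass-specific boolean function f. md5_FF uses
        // F(b, c, d) = (b & c) | (~b & d), evaluated here as ((c ^ d) & b) ^ d.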
 3602   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3603               int k, int s, int t) {
 3604     Register rscratch3 = r10;
 3605     Register rscratch4 = r11;
 3606 
 3607     __ eorw(rscratch3, r3, r4);
 3608     __ movw(rscratch2, t);
 3609     __ andw(rscratch3, rscratch3, r2);
 3610     __ addw(rscratch4, r1, rscratch2);
 3611     reg_cache.extract_u32(rscratch1, k);
 3612     __ eorw(rscratch3, rscratch3, r4);
 3613     __ addw(rscratch4, rscratch4, rscratch1);
 3614     __ addw(rscratch3, rscratch3, rscratch4);
 3615     __ rorw(rscratch2, rscratch3, 32 - s);
 3616     __ addw(r1, rscratch2, r2);
 3617   }
 3618 
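        // md5_GG uses G(b, c, d) = (b & d) | (c & ~d); the two terms are
        // bitwise disjoint (one masked by d, the other by ~d), so the OR is
        // folded into the running additions below.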
 3619   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3620               int k, int s, int t) {
 3621     Register rscratch3 = r10;
 3622     Register rscratch4 = r11;
 3623 
 3624     reg_cache.extract_u32(rscratch1, k);
 3625     __ movw(rscratch2, t);
 3626     __ addw(rscratch4, r1, rscratch2);
 3627     __ addw(rscratch4, rscratch4, rscratch1);
 3628     __ bicw(rscratch2, r3, r4);
 3629     __ andw(rscratch3, r2, r4);
 3630     __ addw(rscratch2, rscratch2, rscratch4);
 3631     __ addw(rscratch2, rscratch2, rscratch3);
 3632     __ rorw(rscratch2, rscratch2, 32 - s);
 3633     __ addw(r1, rscratch2, r2);
 3634   }
 3635 
 3636   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3637               int k, int s, int t) {
 3638     Register rscratch3 = r10;
 3639     Register rscratch4 = r11;
 3640 
 3641     __ eorw(rscratch3, r3, r4);
 3642     __ movw(rscratch2, t);
 3643     __ addw(rscratch4, r1, rscratch2);
 3644     reg_cache.extract_u32(rscratch1, k);
 3645     __ eorw(rscratch3, rscratch3, r2);
 3646     __ addw(rscratch4, rscratch4, rscratch1);
 3647     __ addw(rscratch3, rscratch3, rscratch4);
 3648     __ rorw(rscratch2, rscratch3, 32 - s);
 3649     __ addw(r1, rscratch2, r2);
 3650   }
 3651 
 3652   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3653               int k, int s, int t) {
 3654     Register rscratch3 = r10;
 3655     Register rscratch4 = r11;
 3656 
 3657     __ movw(rscratch3, t);
 3658     __ ornw(rscratch2, r2, r4);
 3659     __ addw(rscratch4, r1, rscratch3);
 3660     reg_cache.extract_u32(rscratch1, k);
 3661     __ eorw(rscratch3, rscratch2, r3);
 3662     __ addw(rscratch4, rscratch4, rscratch1);
 3663     __ addw(rscratch3, rscratch3, rscratch4);
 3664     __ rorw(rscratch2, rscratch3, 32 - s);
 3665     __ addw(r1, rscratch2, r2);
 3666   }
 3667 
 3668   // Arguments:
 3669   //
 3670   // Inputs:
 3671   //   c_rarg0   - byte[]  source+offset
 3672   //   c_rarg1   - int[]   SHA.state
 3673   //   c_rarg2   - int     offset
 3674   //   c_rarg3   - int     limit
 3675   //
 3676   address generate_md5_implCompress(StubId stub_id) {
 3677     bool multi_block;
 3678     switch (stub_id) {
 3679     case StubId::stubgen_md5_implCompress_id:
 3680       multi_block = false;
 3681       break;
 3682     case StubId::stubgen_md5_implCompressMB_id:
 3683       multi_block = true;
 3684       break;
 3685     default:
 3686       ShouldNotReachHere();
 3687     }
 3688     __ align(CodeEntryAlignment);
 3689 
 3690     StubCodeMark mark(this, stub_id);
 3691     address start = __ pc();
 3692 
 3693     Register buf       = c_rarg0;
 3694     Register state     = c_rarg1;
 3695     Register ofs       = c_rarg2;
 3696     Register limit     = c_rarg3;
 3697     Register a         = r4;
 3698     Register b         = r5;
 3699     Register c         = r6;
 3700     Register d         = r7;
 3701     Register rscratch3 = r10;
 3702     Register rscratch4 = r11;
 3703 
 3704     Register state_regs[2] = { r12, r13 };
 3705     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3706     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3707 
 3708     __ push(saved_regs, sp);
 3709 
 3710     __ ldp(state_regs[0], state_regs[1], Address(state));
 3711     __ ubfx(a, state_regs[0],  0, 32);
 3712     __ ubfx(b, state_regs[0], 32, 32);
 3713     __ ubfx(c, state_regs[1],  0, 32);
 3714     __ ubfx(d, state_regs[1], 32, 32);
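          // a, b, c and d stay unpacked for the rounds; state_regs[] keeps the
          // original packed state (a | b << 32, c | d << 32) so it can be added
          // back in and re-packed with orr/LSL 32 after each 64-byte block.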
 3715 
 3716     Label md5_loop;
 3717     __ BIND(md5_loop);
 3718 
 3719     reg_cache.gen_loads(buf);
 3720 
 3721     // Round 1
 3722     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3723     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3724     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3725     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3726     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3727     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3728     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3729     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3730     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3731     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3732     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3733     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3734     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3735     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3736     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3737     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3738 
 3739     // Round 2
 3740     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3741     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3742     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3743     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3744     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3745     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3746     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3747     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3748     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3749     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3750     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3751     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3752     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3753     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3754     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3755     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3756 
 3757     // Round 3
 3758     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3759     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3760     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3761     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3762     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3763     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3764     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3765     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3766     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3767     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3768     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3769     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3770     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3771     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3772     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3773     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3774 
 3775     // Round 4
 3776     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3777     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3778     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3779     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3780     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3781     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3782     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3783     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3784     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3785     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3786     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3787     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3788     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3789     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3790     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3791     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3792 
 3793     __ addw(a, state_regs[0], a);
 3794     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3795     __ addw(b, rscratch2, b);
 3796     __ addw(c, state_regs[1], c);
 3797     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3798     __ addw(d, rscratch4, d);
 3799 
 3800     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3801     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3802 
 3803     if (multi_block) {
 3804       __ add(buf, buf, 64);
 3805       __ add(ofs, ofs, 64);
 3806       __ cmp(ofs, limit);
 3807       __ br(Assembler::LE, md5_loop);
 3808       __ mov(c_rarg0, ofs); // return ofs
 3809     }
 3810 
 3811     // write hash values back in the correct order
 3812     __ stp(state_regs[0], state_regs[1], Address(state));
 3813 
 3814     __ pop(saved_regs, sp);
 3815 
 3816     __ ret(lr);
 3817 
 3818     return start;
 3819   }
 3820 
 3821   // Arguments:
 3822   //
 3823   // Inputs:
 3824   //   c_rarg0   - byte[]  source+offset
 3825   //   c_rarg1   - int[]   SHA.state
 3826   //   c_rarg2   - int     offset
 3827   //   c_rarg3   - int     limit
 3828   //
 3829   address generate_sha1_implCompress(StubId stub_id) {
 3830     bool multi_block;
 3831     switch (stub_id) {
 3832     case StubId::stubgen_sha1_implCompress_id:
 3833       multi_block = false;
 3834       break;
 3835     case StubId::stubgen_sha1_implCompressMB_id:
 3836       multi_block = true;
 3837       break;
 3838     default:
 3839       ShouldNotReachHere();
 3840     }
 3841 
 3842     __ align(CodeEntryAlignment);
 3843 
 3844     StubCodeMark mark(this, stub_id);
 3845     address start = __ pc();
 3846 
 3847     Register buf   = c_rarg0;
 3848     Register state = c_rarg1;
 3849     Register ofs   = c_rarg2;
 3850     Register limit = c_rarg3;
 3851 
 3852     Label keys;
 3853     Label sha1_loop;
 3854 
 3855     // load the keys into v0..v3
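          // ld4r replicates each of the four 32-bit round constants at 'keys'
          // across all lanes, so v0..v3 hold K0..K3 (used for rounds 0-19,
          // 20-39, 40-59 and 60-79 respectively)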
 3856     __ adr(rscratch1, keys);
 3857     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3858     // load the 5-word state into v6, v7
 3859     __ ldrq(v6, Address(state, 0));
 3860     __ ldrs(v7, Address(state, 16));
 3861 
 3862 
 3863     __ BIND(sha1_loop);
 3864     // load 64 bytes of data into v16..v19
 3865     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3866     __ rev32(v16, __ T16B, v16);
 3867     __ rev32(v17, __ T16B, v17);
 3868     __ rev32(v18, __ T16B, v18);
 3869     __ rev32(v19, __ T16B, v19);
 3870 
 3871     // do the sha1
 3872     __ addv(v4, __ T4S, v16, v0);
 3873     __ orr(v20, __ T16B, v6, v6);
 3874 
 3875     FloatRegister d0 = v16;
 3876     FloatRegister d1 = v17;
 3877     FloatRegister d2 = v18;
 3878     FloatRegister d3 = v19;
 3879 
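          // The 80 SHA-1 rounds run as 20 groups of four. Each iteration also
          // pre-computes W + K for the *next* group (hence the key boundaries
          // at 4/9/14 rather than 5/10/15), extends the message schedule with
          // sha1su0/sha1su1 while round < 16, and selects the round function:
          // sha1c (Ch) for rounds 0-19, sha1p (parity) for 20-39 and 60-79,
          // and sha1m (Maj) for 40-59.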
 3880     for (int round = 0; round < 20; round++) {
 3881       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3882       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3883       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3884       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3885       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3886 
 3887       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3888       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3889       __ sha1h(tmp2, __ T4S, v20);
 3890       if (round < 5)
 3891         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3892       else if (round < 10 || round >= 15)
 3893         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3894       else
 3895         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3896       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3897 
 3898       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3899     }
 3900 
 3901     __ addv(v7, __ T2S, v7, v21);
 3902     __ addv(v6, __ T4S, v6, v20);
 3903 
 3904     if (multi_block) {
 3905       __ add(ofs, ofs, 64);
 3906       __ cmp(ofs, limit);
 3907       __ br(Assembler::LE, sha1_loop);
 3908       __ mov(c_rarg0, ofs); // return ofs
 3909     }
 3910 
 3911     __ strq(v6, Address(state, 0));
 3912     __ strs(v7, Address(state, 16));
 3913 
 3914     __ ret(lr);
 3915 
 3916     __ bind(keys);
 3917     __ emit_int32(0x5a827999);
 3918     __ emit_int32(0x6ed9eba1);
 3919     __ emit_int32(0x8f1bbcdc);
 3920     __ emit_int32(0xca62c1d6);
 3921 
 3922     return start;
 3923   }
 3924 
 3925 
 3926   // Arguments:
 3927   //
 3928   // Inputs:
 3929   //   c_rarg0   - byte[]  source+offset
 3930   //   c_rarg1   - int[]   SHA.state
 3931   //   c_rarg2   - int     offset
 3932   //   c_rarg3   - int     limit
 3933   //
 3934   address generate_sha256_implCompress(StubId stub_id) {
 3935     bool multi_block;
 3936     switch (stub_id) {
 3937     case StubId::stubgen_sha256_implCompress_id:
 3938       multi_block = false;
 3939       break;
 3940     case StubId::stubgen_sha256_implCompressMB_id:
 3941       multi_block = true;
 3942       break;
 3943     default:
 3944       ShouldNotReachHere();
 3945     }
 3946 
 3947     static const uint32_t round_consts[64] = {
 3948       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3949       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3950       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3951       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3952       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3953       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3954       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3955       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3956       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3957       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3958       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3959       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3960       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3961       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3962       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3963       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3964     };
 3965 
 3966     __ align(CodeEntryAlignment);
 3967 
 3968     StubCodeMark mark(this, stub_id);
 3969     address start = __ pc();
 3970 
 3971     Register buf   = c_rarg0;
 3972     Register state = c_rarg1;
 3973     Register ofs   = c_rarg2;
 3974     Register limit = c_rarg3;
 3975 
 3976     Label sha256_loop;
 3977 
 3978     __ stpd(v8, v9, __ pre(sp, -32));
 3979     __ stpd(v10, v11, Address(sp, 16));
 3980 
 3981     // dga == v0
 3982     // dgb == v1
 3983     // dg0 == v2
 3984     // dg1 == v3
 3985     // dg2 == v4
 3986     // t0 == v6
 3987     // t1 == v7
 3988 
 3989     // load the 64 round constants into v16..v31 (four per vector)
 3990     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3991     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3992     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3993     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3994     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3995 
 3996     // load the 8-word (256-bit) state
 3997     __ ldpq(v0, v1, state);
 3998 
 3999     __ BIND(sha256_loop);
 4000     // load 64 bytes of data into v8..v11
 4001     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 4002     __ rev32(v8, __ T16B, v8);
 4003     __ rev32(v9, __ T16B, v9);
 4004     __ rev32(v10, __ T16B, v10);
 4005     __ rev32(v11, __ T16B, v11);
 4006 
 4007     __ addv(v6, __ T4S, v8, v16);
 4008     __ orr(v2, __ T16B, v0, v0);
 4009     __ orr(v3, __ T16B, v1, v1);
 4010 
 4011     FloatRegister d0 = v8;
 4012     FloatRegister d1 = v9;
 4013     FloatRegister d2 = v10;
 4014     FloatRegister d3 = v11;
 4015 
 4016 
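          // The 64 SHA-256 rounds run as 16 groups of four. W + K for the
          // first group was formed above (v6); each iteration prepares the
          // next group's W + K from the constants in v17..v31 and, while
          // round < 12, extends the message schedule with sha256su0/sha256su1.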
 4017     for (int round = 0; round < 16; round++) {
 4018       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 4019       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 4020       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 4021       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 4022 
 4023       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 4024        __ orr(v4, __ T16B, v2, v2);
 4025       if (round < 15)
 4026         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 4027       __ sha256h(v2, __ T4S, v3, tmp2);
 4028       __ sha256h2(v3, __ T4S, v4, tmp2);
 4029       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 4030 
 4031       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4032     }
 4033 
 4034     __ addv(v0, __ T4S, v0, v2);
 4035     __ addv(v1, __ T4S, v1, v3);
 4036 
 4037     if (multi_block) {
 4038       __ add(ofs, ofs, 64);
 4039       __ cmp(ofs, limit);
 4040       __ br(Assembler::LE, sha256_loop);
 4041       __ mov(c_rarg0, ofs); // return ofs
 4042     }
 4043 
 4044     __ ldpd(v10, v11, Address(sp, 16));
 4045     __ ldpd(v8, v9, __ post(sp, 32));
 4046 
 4047     __ stpq(v0, v1, state);
 4048 
 4049     __ ret(lr);
 4050 
 4051     return start;
 4052   }
 4053 
 4054   // Double rounds for sha512.
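        // Each call handles two of the 80 rounds, so the compression loop
        // issues 40 of them. vrc0 supplies this step's pair of round constants
        // while the next pair is prefetched into vrc1 (dr < 36; the first four
        // pairs are loaded before the loop), and sha512su0/sha512su1 extend the
        // 16-word message schedule while dr < 32.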
 4055   void sha512_dround(int dr,
 4056                      FloatRegister vi0, FloatRegister vi1,
 4057                      FloatRegister vi2, FloatRegister vi3,
 4058                      FloatRegister vi4, FloatRegister vrc0,
 4059                      FloatRegister vrc1, FloatRegister vin0,
 4060                      FloatRegister vin1, FloatRegister vin2,
 4061                      FloatRegister vin3, FloatRegister vin4) {
 4062       if (dr < 36) {
 4063         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4064       }
 4065       __ addv(v5, __ T2D, vrc0, vin0);
 4066       __ ext(v6, __ T16B, vi2, vi3, 8);
 4067       __ ext(v5, __ T16B, v5, v5, 8);
 4068       __ ext(v7, __ T16B, vi1, vi2, 8);
 4069       __ addv(vi3, __ T2D, vi3, v5);
 4070       if (dr < 32) {
 4071         __ ext(v5, __ T16B, vin3, vin4, 8);
 4072         __ sha512su0(vin0, __ T2D, vin1);
 4073       }
 4074       __ sha512h(vi3, __ T2D, v6, v7);
 4075       if (dr < 32) {
 4076         __ sha512su1(vin0, __ T2D, vin2, v5);
 4077       }
 4078       __ addv(vi4, __ T2D, vi1, vi3);
 4079       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4080   }
 4081 
 4082   // Arguments:
 4083   //
 4084   // Inputs:
 4085   //   c_rarg0   - byte[]  source+offset
 4086   //   c_rarg1   - int[]   SHA.state
 4087   //   c_rarg2   - int     offset
 4088   //   c_rarg3   - int     limit
 4089   //
 4090   address generate_sha512_implCompress(StubId stub_id) {
 4091     bool multi_block;
 4092     switch (stub_id) {
 4093     case StubId::stubgen_sha512_implCompress_id:
 4094       multi_block = false;
 4095       break;
 4096     case StubId::stubgen_sha512_implCompressMB_id:
 4097       multi_block = true;
 4098       break;
 4099     default:
 4100       ShouldNotReachHere();
 4101     }
 4102 
 4103     static const uint64_t round_consts[80] = {
 4104       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 4105       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 4106       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 4107       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 4108       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 4109       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 4110       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 4111       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 4112       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 4113       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 4114       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 4115       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 4116       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 4117       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 4118       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 4119       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 4120       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 4121       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 4122       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 4123       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 4124       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 4125       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 4126       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 4127       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 4128       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 4129       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 4130       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 4131     };
 4132 
 4133     __ align(CodeEntryAlignment);
 4134 
 4135     StubCodeMark mark(this, stub_id);
 4136     address start = __ pc();
 4137 
 4138     Register buf   = c_rarg0;
 4139     Register state = c_rarg1;
 4140     Register ofs   = c_rarg2;
 4141     Register limit = c_rarg3;
 4142 
 4143     __ stpd(v8, v9, __ pre(sp, -64));
 4144     __ stpd(v10, v11, Address(sp, 16));
 4145     __ stpd(v12, v13, Address(sp, 32));
 4146     __ stpd(v14, v15, Address(sp, 48));
 4147 
 4148     Label sha512_loop;
 4149 
 4150     // load state
 4151     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4152 
 4153     // load first 4 round constants
 4154     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4155     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4156 
 4157     __ BIND(sha512_loop);
 4158     // load 128B of data into v12..v19
 4159     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4160     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4161     __ rev64(v12, __ T16B, v12);
 4162     __ rev64(v13, __ T16B, v13);
 4163     __ rev64(v14, __ T16B, v14);
 4164     __ rev64(v15, __ T16B, v15);
 4165     __ rev64(v16, __ T16B, v16);
 4166     __ rev64(v17, __ T16B, v17);
 4167     __ rev64(v18, __ T16B, v18);
 4168     __ rev64(v19, __ T16B, v19);
 4169 
 4170     __ mov(rscratch2, rscratch1);
 4171 
 4172     __ mov(v0, __ T16B, v8);
 4173     __ mov(v1, __ T16B, v9);
 4174     __ mov(v2, __ T16B, v10);
 4175     __ mov(v3, __ T16B, v11);
 4176 
 4177     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4178     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4179     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4180     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4181     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4182     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4183     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4184     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4185     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4186     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4187     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4188     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4189     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4190     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4191     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4192     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4193     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4194     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4195     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4196     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4197     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4198     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4199     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4200     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4201     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4202     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4203     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4204     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4205     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4206     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4207     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4208     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4209     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4210     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4211     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4212     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4213     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4214     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4215     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4216     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4217 
 4218     __ addv(v8, __ T2D, v8, v0);
 4219     __ addv(v9, __ T2D, v9, v1);
 4220     __ addv(v10, __ T2D, v10, v2);
 4221     __ addv(v11, __ T2D, v11, v3);
 4222 
 4223     if (multi_block) {
 4224       __ add(ofs, ofs, 128);
 4225       __ cmp(ofs, limit);
 4226       __ br(Assembler::LE, sha512_loop);
 4227       __ mov(c_rarg0, ofs); // return ofs
 4228     }
 4229 
 4230     __ st1(v8, v9, v10, v11, __ T2D, state);
 4231 
 4232     __ ldpd(v14, v15, Address(sp, 48));
 4233     __ ldpd(v12, v13, Address(sp, 32));
 4234     __ ldpd(v10, v11, Address(sp, 16));
 4235     __ ldpd(v8, v9, __ post(sp, 64));
 4236 
 4237     __ ret(lr);
 4238 
 4239     return start;
 4240   }
 4241 
 4242   // Execute one round of keccak of two computations in parallel.
 4243   // One of the states should be loaded into the lower halves of
 4244   // the vector registers v0-v24, the other should be loaded into
 4245   // the upper halves of those registers. The ld1r instruction loads
 4246   // the round constant into both halves of register v31.
 4247   // Intermediate results c0...c5 and d0...d5 are computed
 4248   // in registers v25...v30.
 4249   // All vector instructions that are used operate on both register
 4250   // halves in parallel.
 4251   // If only a single computation is needed, one can load just the lower halves.
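        //
        // In outline, one round computes, for the 5x5 lane matrix a[x,y]:
        //   theta:  c[x] = a[x,0] ^ ... ^ a[x,4];
        //           d[x] = c[x-1] ^ rol(c[x+1], 1);  a[x,y] ^= d[x]
        //           (the eor3 / rax1 / xar sequences below)
        //   rho/pi: each lane is rotated and moved to its new position as
        //           part of the same xar steps
        //   chi:    a[x,y] ^= ~a[x+1,y] & a[x+2,y]   (the bcax sequences)
        //   iota:   a[0,0] ^= round constant         (the final eor with v31)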
 4252   void keccak_round(Register rscratch1) {
 4253   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
 4254   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
 4255   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4256   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4257   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4258   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4259   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4260   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4261   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4262   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4263 
 4264   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4265   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4266   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4267   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4268   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4269 
 4270   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4271   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
 4272   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4273   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4274   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4275   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4276   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4277   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4278   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4279   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4280   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4281   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4282   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4283   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4284   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4285   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4286   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4287   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4288   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4289   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4290   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4291   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4292   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4293   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4294   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4295 
 4296   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22)
 4297   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4298   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4299   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4300   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4301 
 4302   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4303 
 4304   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4305   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4306   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4307   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4308   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4309 
 4310   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4311   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4312   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4313   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4314   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4315 
 4316   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4317   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4318   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4319   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4320   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4321 
 4322   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4323   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4324   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4325   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4326   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4327 
 4328   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4329   }
 4330 
 4331   // Arguments:
 4332   //
 4333   // Inputs:
 4334   //   c_rarg0   - byte[]  source+offset
 4335   //   c_rarg1   - byte[]  SHA.state
 4336   //   c_rarg2   - int     block_size
 4337   //   c_rarg3   - int     offset
 4338   //   c_rarg4   - int     limit
 4339   //
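        //   block_size is the sponge rate in bytes and selects the variant:
        //   72 (SHA3-512), 104 (SHA3-384), 136 (SHA3-256/SHAKE256),
        //   144 (SHA3-224) or 168 (SHAKE128); the bit tests below dispatch
        //   on these values when absorbing the input block.
        //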
 4340   address generate_sha3_implCompress(StubId stub_id) {
 4341     bool multi_block;
 4342     switch (stub_id) {
 4343     case StubId::stubgen_sha3_implCompress_id:
 4344       multi_block = false;
 4345       break;
 4346     case StubId::stubgen_sha3_implCompressMB_id:
 4347       multi_block = true;
 4348       break;
 4349     default:
 4350       ShouldNotReachHere();
 4351     }
 4352 
 4353     static const uint64_t round_consts[24] = {
 4354       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4355       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4356       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4357       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4358       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4359       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4360       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4361       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4362     };
 4363 
 4364     __ align(CodeEntryAlignment);
 4365 
 4366     StubCodeMark mark(this, stub_id);
 4367     address start = __ pc();
 4368 
 4369     Register buf           = c_rarg0;
 4370     Register state         = c_rarg1;
 4371     Register block_size    = c_rarg2;
 4372     Register ofs           = c_rarg3;
 4373     Register limit         = c_rarg4;
 4374 
 4375     Label sha3_loop, rounds24_loop;
 4376     Label sha3_512_or_sha3_384, shake128;
 4377 
 4378     __ stpd(v8, v9, __ pre(sp, -64));
 4379     __ stpd(v10, v11, Address(sp, 16));
 4380     __ stpd(v12, v13, Address(sp, 32));
 4381     __ stpd(v14, v15, Address(sp, 48));
 4382 
 4383     // load state
 4384     __ add(rscratch1, state, 32);
 4385     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4386     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4387     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4388     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4389     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4390     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4391     __ ld1(v24, __ T1D, rscratch1);
 4392 
 4393     __ BIND(sha3_loop);
 4394 
 4395     // 24 keccak rounds
 4396     __ movw(rscratch2, 24);
 4397 
 4398     // load round_constants base
 4399     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4400 
 4401     // load input
 4402     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4403     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4404     __ eor(v0, __ T8B, v0, v25);
 4405     __ eor(v1, __ T8B, v1, v26);
 4406     __ eor(v2, __ T8B, v2, v27);
 4407     __ eor(v3, __ T8B, v3, v28);
 4408     __ eor(v4, __ T8B, v4, v29);
 4409     __ eor(v5, __ T8B, v5, v30);
 4410     __ eor(v6, __ T8B, v6, v31);
 4411 
 4412     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4413     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4414 
 4415     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4416     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4417     __ eor(v7, __ T8B, v7, v25);
 4418     __ eor(v8, __ T8B, v8, v26);
 4419     __ eor(v9, __ T8B, v9, v27);
 4420     __ eor(v10, __ T8B, v10, v28);
 4421     __ eor(v11, __ T8B, v11, v29);
 4422     __ eor(v12, __ T8B, v12, v30);
 4423     __ eor(v13, __ T8B, v13, v31);
 4424 
 4425     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4426     __ eor(v14, __ T8B, v14, v25);
 4427     __ eor(v15, __ T8B, v15, v26);
 4428     __ eor(v16, __ T8B, v16, v27);
 4429 
 4430     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4431     __ andw(c_rarg5, block_size, 48);
 4432     __ cbzw(c_rarg5, rounds24_loop);
 4433 
 4434     __ tbnz(block_size, 5, shake128);
 4435     // block_size == 144, bit5 == 0, SHA3-224
 4436     __ ldrd(v28, __ post(buf, 8));
 4437     __ eor(v17, __ T8B, v17, v28);
 4438     __ b(rounds24_loop);
 4439 
 4440     __ BIND(shake128);
 4441     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4442     __ eor(v17, __ T8B, v17, v28);
 4443     __ eor(v18, __ T8B, v18, v29);
 4444     __ eor(v19, __ T8B, v19, v30);
 4445     __ eor(v20, __ T8B, v20, v31);
 4446     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4447 
 4448     __ BIND(sha3_512_or_sha3_384);
 4449     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4450     __ eor(v7, __ T8B, v7, v25);
 4451     __ eor(v8, __ T8B, v8, v26);
 4452     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4453 
 4454     // SHA3-384
 4455     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4456     __ eor(v9,  __ T8B, v9,  v27);
 4457     __ eor(v10, __ T8B, v10, v28);
 4458     __ eor(v11, __ T8B, v11, v29);
 4459     __ eor(v12, __ T8B, v12, v30);
 4460 
 4461     __ BIND(rounds24_loop);
 4462     __ subw(rscratch2, rscratch2, 1);
 4463 
 4464     keccak_round(rscratch1);
 4465 
 4466     __ cbnzw(rscratch2, rounds24_loop);
 4467 
 4468     if (multi_block) {
 4469       __ add(ofs, ofs, block_size);
 4470       __ cmp(ofs, limit);
 4471       __ br(Assembler::LE, sha3_loop);
 4472       __ mov(c_rarg0, ofs); // return ofs
 4473     }
 4474 
 4475     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4476     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4477     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4478     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4479     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4480     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4481     __ st1(v24, __ T1D, state);
 4482 
 4483     // restore callee-saved registers
 4484     __ ldpd(v14, v15, Address(sp, 48));
 4485     __ ldpd(v12, v13, Address(sp, 32));
 4486     __ ldpd(v10, v11, Address(sp, 16));
 4487     __ ldpd(v8, v9, __ post(sp, 64));
 4488 
 4489     __ ret(lr);
 4490 
 4491     return start;
 4492   }
 4493 
 4494   // Inputs:
 4495   //   c_rarg0   - long[]  state0
 4496   //   c_rarg1   - long[]  state1
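        //
        // The two 25-lane states are processed in parallel: state0 occupies
        // lane 0 (the lower halves) and state1 lane 1 (the upper halves) of
        // v0..v24, so each keccak_round() call below advances both
        // computations at once, as described above keccak_round().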
 4497   address generate_double_keccak() {
 4498     static const uint64_t round_consts[24] = {
 4499       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4500       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4501       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4502       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4503       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4504       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4505       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4506       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4507     };
 4508 
 4509     // Implements the double_keccak() method of the
 4510     // sun.security.provider.SHA3Parallel class
 4511     __ align(CodeEntryAlignment);
 4512     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4513     address start = __ pc();
 4514     __ enter();
 4515 
 4516     Register state0        = c_rarg0;
 4517     Register state1        = c_rarg1;
 4518 
 4519     Label rounds24_loop;
 4520 
 4521     // save callee-saved registers
 4522     __ stpd(v8, v9, __ pre(sp, -64));
 4523     __ stpd(v10, v11, Address(sp, 16));
 4524     __ stpd(v12, v13, Address(sp, 32));
 4525     __ stpd(v14, v15, Address(sp, 48));
 4526 
 4527     // load states
 4528     __ add(rscratch1, state0, 32);
 4529     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4530     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4531     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4532     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4533     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4534     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4535     __ ld1(v24, __ D, 0, rscratch1);
 4536     __ add(rscratch1, state1, 32);
 4537     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4538     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4539     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4540     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4541     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4542     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4543     __ ld1(v24, __ D, 1, rscratch1);
 4544 
 4545     // 24 keccak rounds
 4546     __ movw(rscratch2, 24);
 4547 
 4548     // load round_constants base
 4549     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4550 
 4551     __ BIND(rounds24_loop);
 4552     __ subw(rscratch2, rscratch2, 1);
 4553     keccak_round(rscratch1);
 4554     __ cbnzw(rscratch2, rounds24_loop);
 4555 
 4556     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4557     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4558     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4559     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4560     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4561     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4562     __ st1(v24, __ D, 0, state0);
 4563     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4564     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4565     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4566     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4567     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4568     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4569     __ st1(v24, __ D, 1, state1);
 4570 
 4571     // restore callee-saved vector registers
 4572     __ ldpd(v14, v15, Address(sp, 48));
 4573     __ ldpd(v12, v13, Address(sp, 32));
 4574     __ ldpd(v10, v11, Address(sp, 16));
 4575     __ ldpd(v8, v9, __ post(sp, 64));
 4576 
 4577     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4578     __ mov(r0, zr); // return 0
 4579     __ ret(lr);
 4580 
 4581     return start;
 4582   }
 4583 
 4584   // ChaCha20 block function.  This version parallelizes the 32-bit
 4585   // state elements on each of 16 vectors, producing 4 blocks of
 4586   // keystream at a time.
 4587   //
 4588   // state (int[16]) = c_rarg0
 4589   // keystream (byte[256]) = c_rarg1
 4590   // return - number of bytes of produced keystream (always 256)
 4591   //
 4592   // This implementation takes each 32-bit integer from the state
 4593   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4594   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4595   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4596   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4597   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4598   // operations represented by one QUARTERROUND function, we instead stack all
 4599   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4600   // and then do the same for the second set of 4 quarter rounds.  This removes
 4601   // some latency that would otherwise be incurred by waiting for an add to
 4602   // complete before performing an xor (which depends on the result of the
 4603   // add), etc. An adjustment happens between the first and second groups of 4
 4604   // quarter rounds, but this is done only in the inputs to the macro functions
 4605   // that generate the assembly instructions - these adjustments themselves are
 4606   // not part of the resulting assembly.
 4607   // The 4 registers v0-v3 are used during the quarter round operations as
 4608   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4609   // registers become the vectors involved in adding the start state back onto
 4610   // the post-QR working state.  After the adds are complete, each of the 16
 4611   // vectors write their first lane back to the keystream buffer, followed
 4612   // by the second lane from all vectors and so on.
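        //
        // For reference, a single ChaCha20 quarter round on state words
        // a, b, c and d is (RFC 7539, section 2.1):
        //   a += b; d ^= a; d <<<= 16;
        //   c += d; b ^= c; b <<<= 12;
        //   a += b; d ^= a; d <<<= 8;
        //   c += d; b ^= c; b <<<= 7;
        // Each cc20_qr_* call below applies one of these steps to four
        // (a, b, c, d) quadruples at once.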
 4613   address generate_chacha20Block_blockpar() {
 4614     Label L_twoRounds, L_cc20_const;
 4615     __ align(CodeEntryAlignment);
 4616     StubId stub_id = StubId::stubgen_chacha20Block_id;
 4617     StubCodeMark mark(this, stub_id);
 4618     address start = __ pc();
 4619     __ enter();
 4620 
 4621     int i, j;
 4622     const Register state = c_rarg0;
 4623     const Register keystream = c_rarg1;
 4624     const Register loopCtr = r10;
 4625     const Register tmpAddr = r11;
 4626     const FloatRegister ctrAddOverlay = v28;
 4627     const FloatRegister lrot8Tbl = v29;
 4628 
 4629     // Organize SIMD registers in an array that facilitates
 4630     // putting repetitive opcodes into loop structures.  It is
 4631     // important that each grouping of 4 registers is monotonically
 4632     // increasing to support the requirements of multi-register
 4633     // instructions (e.g. ld4r, st4, etc.)
 4634     const FloatRegister workSt[16] = {
 4635          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4636         v20, v21, v22, v23, v24, v25, v26, v27
 4637     };
 4638 
 4639     // Pull in constant data.  The first 16 bytes are the add overlay
 4640     // which is applied to the vector holding the counter (state[12]).
 4641     // The second 16 bytes is the index register for the 8-bit left
 4642     // rotation tbl instruction.
 4643     __ adr(tmpAddr, L_cc20_const);
 4644     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4645 
 4646     // Load from memory and interlace across 16 SIMD registers,
 4647     // with each word from memory broadcast to all lanes of
 4648     // each successive SIMD register.
 4649     //      Addr(0) -> All lanes in workSt[i]
 4650     //      Addr(4) -> All lanes workSt[i + 1], etc.
 4651     __ mov(tmpAddr, state);
 4652     for (i = 0; i < 16; i += 4) {
 4653       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4654           __ post(tmpAddr, 16));
 4655     }
 4656     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4657 
 4658     // Before entering the loop, create 5 4-register arrays.  These
 4659     // will hold the 4 registers that represent the a/b/c/d fields
 4660     // in the quarter round operation.  For instance the "b" field
 4661     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4662     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4663     // since it is part of a diagonal organization.  The aSet and scratch
 4664     // register sets are defined at declaration time because they do not change
 4665     // organization at any point during the 20-round processing.
 4666     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4667     FloatRegister bSet[4];
 4668     FloatRegister cSet[4];
 4669     FloatRegister dSet[4];
 4670     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4671 
 4672     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4673     __ mov(loopCtr, 10);
 4674     __ BIND(L_twoRounds);
 4675 
 4676     // Set to columnar organization and do the following 4 quarter-rounds:
 4677     // QUARTERROUND(0, 4, 8, 12)
 4678     // QUARTERROUND(1, 5, 9, 13)
 4679     // QUARTERROUND(2, 6, 10, 14)
 4680     // QUARTERROUND(3, 7, 11, 15)
 4681     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4682     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4683     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4684 
 4685     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4686     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4687     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4688 
 4689     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4690     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4691     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4692 
 4693     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4694     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4695     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4696 
 4697     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4698     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4699     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4700 
 4701     // Set to diagonal organization and do the next 4 quarter-rounds:
 4702     // QUARTERROUND(0, 5, 10, 15)
 4703     // QUARTERROUND(1, 6, 11, 12)
 4704     // QUARTERROUND(2, 7, 8, 13)
 4705     // QUARTERROUND(3, 4, 9, 14)
 4706     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4707     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4708     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4709 
 4710     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4711     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4712     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4713 
 4714     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4715     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4716     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4717 
 4718     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4719     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4720     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4721 
 4722     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4723     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4724     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4725 
 4726     // Decrement and iterate
 4727     __ sub(loopCtr, loopCtr, 1);
 4728     __ cbnz(loopCtr, L_twoRounds);
 4729 
 4730     __ mov(tmpAddr, state);
 4731 
 4732     // Add the starting state back to the post-loop keystream
 4733     // state.  We read/interlace the state array from memory into
 4734     // 4 registers similar to what we did in the beginning.  Then
 4735     // add the counter overlay onto workSt[12] at the end.
 4736     for (i = 0; i < 16; i += 4) {
 4737       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4738       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4739       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4740       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4741       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4742     }
 4743     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4744 
 4745     // Write working state into the keystream buffer.  This is accomplished
 4746     // by taking the lane "i" from each of the four vectors and writing
 4747     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4748     // repeating with the next 4 vectors until all 16 vectors have been used.
 4749     // Then move to the next lane and repeat the process until all lanes have
 4750     // been written.
 4751     for (i = 0; i < 4; i++) {
 4752       for (j = 0; j < 16; j += 4) {
 4753         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4754             __ post(keystream, 16));
 4755       }
 4756     }
 4757 
 4758     __ mov(r0, 256);             // Return length of output keystream
 4759     __ leave();
 4760     __ ret(lr);
 4761 
 4762     // bind label and generate local constant data used by this stub
 4763     // The constant data is broken into two 128-bit segments to be loaded
 4764     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4765     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 4766     // The second 128-bits is a table constant used for 8-bit left rotations.
 4767     __ BIND(L_cc20_const);
 4768     __ emit_int64(0x0000000100000000UL);
 4769     __ emit_int64(0x0000000300000002UL);
 4770     __ emit_int64(0x0605040702010003UL);
 4771     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4772 
 4773     return start;
 4774   }
 4775 
 4776   // Helpers to schedule parallel operation bundles across vector
 4777   // register sequences of size 2, 4 or 8.
 4778 
 4779   // Implement various primitive computations across vector sequences
 4780 
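        //
        // For example, with VSeq<4> va(0), vb(4), vc(8), the call
        //   vs_addv(va, __ T8H, vb, vc)
        // expands to addv(v0, T8H, v4, v8) ... addv(v3, T8H, v7, v11),
        // i.e. one instruction per sequence element, giving four
        // independent computations that can proceed in parallel.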
 4781   template<int N>
 4782   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4783                const VSeq<N>& v1, const VSeq<N>& v2) {
 4784     // output must not be constant
 4785     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4786     // output cannot overwrite pending inputs
 4787     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4788     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4789     for (int i = 0; i < N; i++) {
 4790       __ addv(v[i], T, v1[i], v2[i]);
 4791     }
 4792   }
 4793 
 4794   template<int N>
 4795   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4796                const VSeq<N>& v1, const VSeq<N>& v2) {
 4797     // output must not be constant
 4798     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4799     // output cannot overwrite pending inputs
 4800     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4801     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4802     for (int i = 0; i < N; i++) {
 4803       __ subv(v[i], T, v1[i], v2[i]);
 4804     }
 4805   }
 4806 
 4807   template<int N>
 4808   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4809                const VSeq<N>& v1, const VSeq<N>& v2) {
 4810     // output must not be constant
 4811     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4812     // output cannot overwrite pending inputs
 4813     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4814     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4815     for (int i = 0; i < N; i++) {
 4816       __ mulv(v[i], T, v1[i], v2[i]);
 4817     }
 4818   }
 4819 
 4820   template<int N>
 4821   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4822     // output must not be constant
 4823     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4824     // output cannot overwrite pending inputs
 4825     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4826     for (int i = 0; i < N; i++) {
 4827       __ negr(v[i], T, v1[i]);
 4828     }
 4829   }
 4830 
 4831   template<int N>
 4832   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4833                const VSeq<N>& v1, int shift) {
 4834     // output must not be constant
 4835     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4836     // output cannot overwrite pending inputs
 4837     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4838     for (int i = 0; i < N; i++) {
 4839       __ sshr(v[i], T, v1[i], shift);
 4840     }
 4841   }
 4842 
 4843   template<int N>
 4844   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4845     // output must not be constant
 4846     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4847     // output cannot overwrite pending inputs
 4848     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4849     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4850     for (int i = 0; i < N; i++) {
 4851       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4852     }
 4853   }
 4854 
 4855   template<int N>
 4856   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4857     // output must not be constant
 4858     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4859     // output cannot overwrite pending inputs
 4860     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4861     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4862     for (int i = 0; i < N; i++) {
 4863       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4864     }
 4865   }
 4866 
 4867   template<int N>
 4868   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4869     // output must not be constant
 4870     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4871     // output cannot overwrite pending inputs
 4872     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4873     for (int i = 0; i < N; i++) {
 4874       __ notr(v[i], __ T16B, v1[i]);
 4875     }
 4876   }
 4877 
 4878   template<int N>
 4879   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4880     // output must not be constant
 4881     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4882     // output cannot overwrite pending inputs
 4883     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4884     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4885     for (int i = 0; i < N; i++) {
 4886       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4887     }
 4888   }
 4889 
 4890   template<int N>
 4891   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4892     // output must not be constant
 4893     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4894     // output cannot overwrite pending inputs
 4895     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4896     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4897     for (int i = 0; i < N; i++) {
 4898       __ mlsv(v[i], T, v1[i], v2[i]);
 4899     }
 4900   }
 4901 
 4902   // load N/2 successive pairs of quadword values from memory in order
 4903   // into N successive vector registers of the sequence via the
 4904   // address supplied in base.
 4905   template<int N>
 4906   void vs_ldpq(const VSeq<N>& v, Register base) {
 4907     for (int i = 0; i < N; i += 2) {
 4908       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4909     }
 4910   }
 4911 
 4912   // load N/2 successive pairs of quadword values from memory in order
 4913   // into N vector registers of the sequence via the address supplied
 4914   // in base using post-increment addressing
 4915   template<int N>
 4916   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4917     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4918     for (int i = 0; i < N; i += 2) {
 4919       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4920     }
 4921   }
 4922 
 4923   // store N successive vector registers of the sequence into N/2
 4924   // successive pairs of quadword memory locations via the address
 4925   // supplied in base using post-increment addressing
 4926   template<int N>
 4927   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4928     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4929     for (int i = 0; i < N; i += 2) {
 4930       __ stpq(v[i], v[i+1], __ post(base, 32));
 4931     }
 4932   }
 4933 
 4934   // load N/2 pairs of quadword values from memory de-interleaved into
 4935   // N vector registers 2 at a time via the address supplied in base
 4936   // using post-increment addressing.
 4937   template<int N>
 4938   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4939     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4940     for (int i = 0; i < N; i += 2) {
 4941       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4942     }
 4943   }
 4944 
 4945   // store N vector registers interleaved into N/2 pairs of quadword
 4946   // memory locations via the address supplied in base using
 4947   // post-increment addressing.
 4948   template<int N>
 4949   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4950     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4951     for (int i = 0; i < N; i += 2) {
 4952       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4953     }
 4954   }
 4955 
 4956   // load N quadword values from memory de-interleaved into N vector
 4957   // registers 3 elements at a time via the address supplied in base.
 4958   template<int N>
 4959   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4960     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4961     for (int i = 0; i < N; i += 3) {
 4962       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4963     }
 4964   }
 4965 
 4966   // load N quadword values from memory de-interleaved into N vector
 4967   // registers 3 elements at a time via the address supplied in base
 4968   // using post-increment addressing.
 4969   template<int N>
 4970   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4971     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4972     for (int i = 0; i < N; i += 3) {
 4973       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4974     }
 4975   }
 4976 
 4977   // load N/2 pairs of quadword values from memory into N vector
 4978   // registers via the address supplied in base with each pair indexed
 4979   // using the start offset plus the corresponding entry in the
 4980   // offsets array
 4981   template<int N>
 4982   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4983     for (int i = 0; i < N/2; i++) {
 4984       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4985     }
 4986   }
 4987 
 4988   // store N vector registers into N/2 pairs of quadword memory
 4989   // locations via the address supplied in base with each pair indexed
 4990   // using the start offset plus the corresponding entry in the
 4991   // offsets array
 4992   template<int N>
 4993   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 4994     for (int i = 0; i < N/2; i++) {
 4995       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4996     }
 4997   }
 4998 
 4999   // load N single quadword values from memory into N vector registers
 5000   // via the address supplied in base with each value indexed using
 5001   // the start offset plus the corresponding entry in the offsets
 5002   // array
 5003   template<int N>
 5004   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5005                       int start, int (&offsets)[N]) {
 5006     for (int i = 0; i < N; i++) {
 5007       __ ldr(v[i], T, Address(base, start + offsets[i]));
 5008     }
 5009   }
 5010 
 5011   // store N vector registers into N single quadword memory locations
 5012   // via the address supplied in base with each value indexed using
 5013   // the start offset plus the corresponding entry in the offsets
 5014   // array
 5015   template<int N>
 5016   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5017                       int start, int (&offsets)[N]) {
 5018     for (int i = 0; i < N; i++) {
 5019       __ str(v[i], T, Address(base, start + offsets[i]));
 5020     }
 5021   }
 5022 
 5023   // load N/2 pairs of quadword values from memory de-interleaved into
 5024   // N vector registers 2 at a time via the address supplied in base
 5025   // with each pair indexed using the start offset plus the
 5026   // corresponding entry in the offsets array
 5027   template<int N>
 5028   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5029                       Register tmp, int start, int (&offsets)[N/2]) {
 5030     for (int i = 0; i < N/2; i++) {
 5031       __ add(tmp, base, start + offsets[i]);
 5032       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5033     }
 5034   }
 5035 
 5036   // store N vector registers 2 at a time interleaved into N/2 pairs
 5037   // of quadword memory locations via the address supplied in base
 5038   // with each pair indexed using the start offset plus the
 5039   // corresponding entry in the offsets array
 5040   template<int N>
 5041   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5042                       Register tmp, int start, int (&offsets)[N/2]) {
 5043     for (int i = 0; i < N/2; i++) {
 5044       __ add(tmp, base, start + offsets[i]);
 5045       __ st2(v[2*i], v[2*i+1], T, tmp);
 5046     }
 5047   }
 5048 
 5049   // Helper routines for various flavours of Montgomery multiply
 5050 
 5051   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 5052   // multiplications in parallel
 5053   //
 5054 
 5055   // See the montMul() method of the sun.security.provider.ML_DSA
 5056   // class.
 5057   //
 5058   // Computes 4x4S results or 4x8H results
 5059   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5060   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5061   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5062   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5063   // Outputs: va - 4x4S or 4x8H vector register sequences
 5064   // vb, vc, vtmp and vq must all be disjoint
 5065   // va must be disjoint from all other inputs/temps or must equal vc
 5066   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5067   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
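        //
        // In outline, each lane computes (with word size w = 16 or 32,
        // R = 2^w and q_inv = q^-1 mod R):
        //   m = lo(b * c) * q_inv  (mod R)
        //   a = hi(b * c) - hi(m * q)  ==  (b * c - m * q) / R
        // The low halves cancel because m * q == b * c (mod R), so the
        // division by R is exact; sqdmulh supplies the doubled signed high
        // halves and shsubv performs the subtraction and the final halving.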
 5068   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5069                    Assembler::SIMD_Arrangement T,
 5070                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5071     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5072     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5073     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5074     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5075 
 5076     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5077     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5078 
 5079     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5080 
 5081     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5082     assert(vs_disjoint(va, vb), "va and vb overlap");
 5083     assert(vs_disjoint(va, vq), "va and vq overlap");
 5084     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5085     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5086 
 5087     // schedule 4 streams of instructions across the vector sequences
 5088     for (int i = 0; i < 4; i++) {
 5089       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5090       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5091     }
 5092 
 5093     for (int i = 0; i < 4; i++) {
 5094       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5095     }
 5096 
 5097     for (int i = 0; i < 4; i++) {
 5098       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5099     }
 5100 
 5101     for (int i = 0; i < 4; i++) {
 5102       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5103     }
 5104   }
 5105 
 5106   // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
 5107   // multiplications in parallel
 5108   //
 5109 
 5110   // See the montMul() method of the sun.security.provider.ML_DSA
 5111   // class.
 5112   //
 5113   // Computes 2x4S results or 2x8H results
 5114   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5115   // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
 5116   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5117   // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
 5118   // Outputs: va - 2x4S or 2x8H vector register sequences
 5119   // vb, vc, vtmp and vq must all be disjoint
 5120   // va must be disjoint from all other inputs/temps or must equal vc
 5121   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5122   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5123   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5124                    Assembler::SIMD_Arrangement T,
 5125                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5126     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5127     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5128     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5129     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5130 
 5131     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5132     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5133 
 5134     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5135 
 5136     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5137     assert(vs_disjoint(va, vb), "va and vb overlap");
 5138     assert(vs_disjoint(va, vq), "va and vq overlap");
 5139     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5140     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5141 
 5142     // schedule 2 streams of instructions across the vector sequences
 5143     for (int i = 0; i < 2; i++) {
 5144       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5145       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5146     }
 5147 
 5148     for (int i = 0; i < 2; i++) {
 5149       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5150     }
 5151 
 5152     for (int i = 0; i < 2; i++) {
 5153       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5154     }
 5155 
 5156     for (int i = 0; i < 2; i++) {
 5157       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5158     }
 5159   }
 5160 
 5161   // Perform 16 16-bit Montgomery multiplications in parallel.
 5162   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5163                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5164     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5165     // It will assert that the register use is valid
 5166     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5167   }
 5168 
 5169   // Perform 32 16-bit Montgomery multiplications in parallel.
 5170   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5171                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5172     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5173     // It will assert that the register use is valid
 5174     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5175   }
 5176 
 5177   // Perform 64 16-bit Montgomery multiplications in parallel.
 5178   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5179                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5180     // Schedule two successive 4x8H multiplies via the montmul helper
 5181     // on the front and back halves of va, vb and vc. The helper will
 5182     // assert that the register use has no overlap conflicts on each
 5183     // individual call but we also need to ensure that the necessary
 5184     // disjoint/equality constraints are met across both calls.
 5185 
 5186     // vb, vc, vtmp and vq must be disjoint. va must either be
 5187     // disjoint from all other registers or equal vc
 5188 
 5189     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5190     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5191     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5192 
 5193     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5194     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5195 
 5196     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5197 
 5198     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5199     assert(vs_disjoint(va, vb), "va and vb overlap");
 5200     assert(vs_disjoint(va, vq), "va and vq overlap");
 5201     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5202 
 5203     // we multiply the front and back halves of each sequence 4 at a
 5204     // time because
 5205     //
 5206     // 1) we are currently only able to get 4-way instruction
 5207     // parallelism at best
 5208     //
 5209     // 2) we need registers for the constants in vq and temporary
 5210     // scratch registers to hold intermediate results so vtmp can only
 5211     // be a VSeq<4> which means we only have 4 scratch slots
 5212 
 5213     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5214     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5215   }
 5216 
 5217   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5218                                const VSeq<4>& vc,
 5219                                const VSeq<4>& vtmp,
 5220                                const VSeq<2>& vq) {
 5221     // compute a = montmul(a1, c)
 5222     kyber_montmul32(vc, va1, vc, vtmp, vq);
 5223     // output a1 = a0 - a
 5224     vs_subv(va1, __ T8H, va0, vc);
 5225     //    and a0 = a0 + a
 5226     vs_addv(va0, __ T8H, va0, vc);
 5227   }
 5228 
 5229   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5230                                const VSeq<4>& vb,
 5231                                const VSeq<4>& vtmp1,
 5232                                const VSeq<4>& vtmp2,
 5233                                const VSeq<2>& vq) {
 5234     // compute c = a0 - a1
 5235     vs_subv(vtmp1, __ T8H, va0, va1);
 5236     // output a0 = a0 + a1
 5237     vs_addv(va0, __ T8H, va0, va1);
 5238     // output a1 = b montmul c
 5239     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5240   }
 5241 
 5242   void load64shorts(const VSeq<8>& v, Register shorts) {
 5243     vs_ldpq_post(v, shorts);
 5244   }
 5245 
 5246   void load32shorts(const VSeq<4>& v, Register shorts) {
 5247     vs_ldpq_post(v, shorts);
 5248   }
 5249 
 5250   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5251     vs_stpq_post(v, tmpAddr);
 5252   }
 5253 
 5254   // Kyber NTT function.
 5255   // Implements
 5256   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5257   //
 5258   // coeffs (short[256]) = c_rarg0
 5259   // ntt_zetas (short[256]) = c_rarg1
 5260   address generate_kyberNtt() {
 5261 
 5262     __ align(CodeEntryAlignment);
 5263     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5264     StubCodeMark mark(this, stub_id);
 5265     address start = __ pc();
 5266     __ enter();
 5267 
 5268     const Register coeffs = c_rarg0;
 5269     const Register zetas = c_rarg1;
 5270 
 5271     const Register kyberConsts = r10;
 5272     const Register tmpAddr = r11;
 5273 
 5274     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5275     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5276     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5277 
 5278     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5279     // load the montmul constants
 5280     vs_ldpq(vq, kyberConsts);
 5281 
 5282     // Each level corresponds to an iteration of the outermost loop of the
 5283     // Java method seilerNTT(int[] coeffs). There are some differences
 5284     // from what is done in the seilerNTT() method, though:
 5285     // 1. The computation uses 16-bit signed values; we do not convert them
 5286     // to ints here.
 5287     // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
 5288     // this array for each level, which makes it easier to fill up the vector
 5289     // registers.
 5290     // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
 5291     // multiplications (this is because that way there should not be any
 5292     // overflow during the inverse NTT computation); here we use R = 2^16 so
 5293     // that we can use the 16-bit arithmetic in the vector unit.
 5294     //
 5295     // On each level, we fill up the vector registers in such a way that the
 5296     // array elements that need to be multiplied by the zetas go into one
 5297     // set of vector registers while the corresponding ones that don't need to
 5298     // be multiplied, go into another set.
 5299     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5300     // registers interleaving the steps of 4 identical computations,
 5301     // each done on 8 16-bit values per register.
 5302 
 5303     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5304     // to the zetas occur in discrete blocks whose size is some multiple
 5305     // of 32.
 5306 
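          // In outline, each level below applies the usual Cooley-Tukey
          // NTT butterfly to 64 coefficient pairs at a time:
          //   t               = montmul(zeta, coeffs[j + len]);
          //   coeffs[j + len] = coeffs[j] - t;
          //   coeffs[j]       = coeffs[j] + t;
          // realized here by the kyber_montmul64 / vs_subv / vs_addv
          // sequences operating on whole blocks of coefficients.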
 5307     // level 0
 5308     __ add(tmpAddr, coeffs, 256);
 5309     load64shorts(vs1, tmpAddr);
 5310     load64shorts(vs2, zetas);
 5311     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5312     __ add(tmpAddr, coeffs, 0);
 5313     load64shorts(vs1, tmpAddr);
 5314     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5315     vs_addv(vs1, __ T8H, vs1, vs2);
 5316     __ add(tmpAddr, coeffs, 0);
 5317     vs_stpq_post(vs1, tmpAddr);
 5318     __ add(tmpAddr, coeffs, 256);
 5319     vs_stpq_post(vs3, tmpAddr);
 5320     // restore montmul constants
 5321     vs_ldpq(vq, kyberConsts);
 5322     load64shorts(vs1, tmpAddr);
 5323     load64shorts(vs2, zetas);
 5324     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5325     __ add(tmpAddr, coeffs, 128);
 5326     load64shorts(vs1, tmpAddr);
 5327     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5328     vs_addv(vs1, __ T8H, vs1, vs2);
 5329     __ add(tmpAddr, coeffs, 128);
 5330     store64shorts(vs1, tmpAddr);
 5331     __ add(tmpAddr, coeffs, 384);
 5332     store64shorts(vs3, tmpAddr);
 5333 
 5334     // level 1
 5335     // restore montmul constants
 5336     vs_ldpq(vq, kyberConsts);
 5337     __ add(tmpAddr, coeffs, 128);
 5338     load64shorts(vs1, tmpAddr);
 5339     load64shorts(vs2, zetas);
 5340     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5341     __ add(tmpAddr, coeffs, 0);
 5342     load64shorts(vs1, tmpAddr);
 5343     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5344     vs_addv(vs1, __ T8H, vs1, vs2);
 5345     __ add(tmpAddr, coeffs, 0);
 5346     store64shorts(vs1, tmpAddr);
 5347     store64shorts(vs3, tmpAddr);
 5348     vs_ldpq(vq, kyberConsts);
 5349     __ add(tmpAddr, coeffs, 384);
 5350     load64shorts(vs1, tmpAddr);
 5351     load64shorts(vs2, zetas);
 5352     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5353     __ add(tmpAddr, coeffs, 256);
 5354     load64shorts(vs1, tmpAddr);
 5355     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5356     vs_addv(vs1, __ T8H, vs1, vs2);
 5357     __ add(tmpAddr, coeffs, 256);
 5358     store64shorts(vs1, tmpAddr);
 5359     store64shorts(vs3, tmpAddr);
 5360 
 5361     // level 2
 5362     vs_ldpq(vq, kyberConsts);
 5363     int offsets1[4] = { 0, 32, 128, 160 };
 5364     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5365     load64shorts(vs2, zetas);
 5366     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5367     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5369     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5370     vs_addv(vs1, __ T8H, vs1, vs2);
 5371     __ add(tmpAddr, coeffs, 0);
 5372     vs_stpq_post(vs_front(vs1), tmpAddr);
 5373     vs_stpq_post(vs_front(vs3), tmpAddr);
 5374     vs_stpq_post(vs_back(vs1), tmpAddr);
 5375     vs_stpq_post(vs_back(vs3), tmpAddr);
 5376     vs_ldpq(vq, kyberConsts);
 5377     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5378     load64shorts(vs2, zetas);
 5379     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5380     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5382     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5383     vs_addv(vs1, __ T8H, vs1, vs2);
 5384     __ add(tmpAddr, coeffs, 256);
 5385     vs_stpq_post(vs_front(vs1), tmpAddr);
 5386     vs_stpq_post(vs_front(vs3), tmpAddr);
 5387     vs_stpq_post(vs_back(vs1), tmpAddr);
 5388     vs_stpq_post(vs_back(vs3), tmpAddr);
 5389 
 5390     // level 3
 5391     vs_ldpq(vq, kyberConsts);
 5392     int offsets2[4] = { 0, 64, 128, 192 };
 5393     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5394     load64shorts(vs2, zetas);
 5395     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5396     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5397     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5398     vs_addv(vs1, __ T8H, vs1, vs2);
 5399     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5400     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5401 
 5402     vs_ldpq(vq, kyberConsts);
 5403     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5404     load64shorts(vs2, zetas);
 5405     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5406     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5407     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5408     vs_addv(vs1, __ T8H, vs1, vs2);
 5409     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5410     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5411 
 5412     // level 4
 5413     // At level 4 coefficients occur in 8 discrete blocks of size 16
  5414     // so they are loaded using an ldr at 8 distinct offsets.
 5415 
 5416     vs_ldpq(vq, kyberConsts);
 5417     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5418     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5419     load64shorts(vs2, zetas);
 5420     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5421     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5422     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5423     vs_addv(vs1, __ T8H, vs1, vs2);
 5424     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5425     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5426 
 5427     vs_ldpq(vq, kyberConsts);
 5428     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5429     load64shorts(vs2, zetas);
 5430     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5431     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5432     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5433     vs_addv(vs1, __ T8H, vs1, vs2);
 5434     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5435     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5436 
 5437     // level 5
  5438     // At level 5 related coefficients occur in discrete blocks of size 8, so
  5439     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
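           // (An ld2 with arrangement 2D de-interleaves alternate 8-byte,
           // i.e. 4-coefficient, blocks across a register pair; this is what
           // separates the coefficients that get montmul'd by a zeta from
           // their add/sub partners, and the matching st2 re-interleaves the
           // results on the way back to memory.)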
 5440 
 5441     vs_ldpq(vq, kyberConsts);
 5442     int offsets4[4] = { 0, 32, 64, 96 };
 5443     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5444     load32shorts(vs_front(vs2), zetas);
 5445     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5446     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5447     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5448     load32shorts(vs_front(vs2), zetas);
 5449     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5450     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5451     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5452     load32shorts(vs_front(vs2), zetas);
 5453     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5454     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5455 
 5456     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5457     load32shorts(vs_front(vs2), zetas);
 5458     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5459     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5460 
 5461     // level 6
  5462     // At level 6 related coefficients occur in discrete blocks of size 4, so
  5463     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5464 
 5465     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5466     load32shorts(vs_front(vs2), zetas);
 5467     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5468     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5469     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5471     load32shorts(vs_front(vs2), zetas);
 5472     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5473     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5474 
 5475     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5476     load32shorts(vs_front(vs2), zetas);
 5477     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5478     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5479 
 5480     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5481     load32shorts(vs_front(vs2), zetas);
 5482     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5483     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5484 
 5485     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5486     __ mov(r0, zr); // return 0
 5487     __ ret(lr);
 5488 
 5489     return start;
 5490   }
 5491 
 5492   // Kyber Inverse NTT function
 5493   // Implements
 5494   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5495   //
 5496   // coeffs (short[256]) = c_rarg0
 5497   // ntt_zetas (short[256]) = c_rarg1
 5498   address generate_kyberInverseNtt() {
 5499 
 5500     __ align(CodeEntryAlignment);
 5501     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 5502     StubCodeMark mark(this, stub_id);
 5503     address start = __ pc();
 5504     __ enter();
 5505 
 5506     const Register coeffs = c_rarg0;
 5507     const Register zetas = c_rarg1;
 5508 
 5509     const Register kyberConsts = r10;
 5510     const Register tmpAddr = r11;
 5511     const Register tmpAddr2 = c_rarg2;
 5512 
 5513     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5514     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5515     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5516 
 5517     __ lea(kyberConsts,
 5518              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5519 
 5520     // level 0
  5521     // At level 0 related coefficients occur in discrete blocks of size 4, so
  5522     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5523 
 5524     vs_ldpq(vq, kyberConsts);
 5525     int offsets4[4] = { 0, 32, 64, 96 };
 5526     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5527     load32shorts(vs_front(vs2), zetas);
 5528     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5529                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5530     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5531     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5532     load32shorts(vs_front(vs2), zetas);
 5533     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5534                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5535     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5536     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5537     load32shorts(vs_front(vs2), zetas);
 5538     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5539                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5540     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5541     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5542     load32shorts(vs_front(vs2), zetas);
 5543     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5544                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5545     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5546 
 5547     // level 1
  5548     // At level 1 related coefficients occur in discrete blocks of size 8, so
  5549     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5550 
 5551     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5552     load32shorts(vs_front(vs2), zetas);
 5553     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5554                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5555     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5556     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5557     load32shorts(vs_front(vs2), zetas);
 5558     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5559                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5560     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5561 
 5562     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5563     load32shorts(vs_front(vs2), zetas);
 5564     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5565                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5566     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5567     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5568     load32shorts(vs_front(vs2), zetas);
 5569     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5570                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5571     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5572 
 5573     // level 2
 5574     // At level 2 coefficients occur in 8 discrete blocks of size 16
  5575     // so they are loaded using an ldr at 8 distinct offsets.
 5576 
 5577     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5578     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5579     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5580     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5581     vs_subv(vs1, __ T8H, vs1, vs2);
 5582     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5583     load64shorts(vs2, zetas);
 5584     vs_ldpq(vq, kyberConsts);
 5585     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5586     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5587 
 5588     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5589     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5590     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5591     vs_subv(vs1, __ T8H, vs1, vs2);
 5592     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5593     load64shorts(vs2, zetas);
 5594     vs_ldpq(vq, kyberConsts);
 5595     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5596     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5597 
 5598     // Barrett reduction at indexes where overflow may happen
 5599 
 5600     // load q and the multiplier for the Barrett reduction
 5601     __ add(tmpAddr, kyberConsts, 16);
 5602     vs_ldpq(vq, tmpAddr);
 5603 
 5604     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5605     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5606     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
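           // The sqdmulh/sshr/mls sequence below implements a Barrett
           // reduction on each 16-bit lane x (same scheme as in
           // generate_kyberBarrettReduce):
           //   t = (x * kyberBarrettMultiplier) >> 26   -- sqdmulh + sshr #11
           //   x = x - t * q                            -- mls
           // leaving x congruent to the input mod q but small in magnitude.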
 5607     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5608     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5609     vs_sshr(vs2, __ T8H, vs2, 11);
 5610     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5611     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5612     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5613     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5614     vs_sshr(vs2, __ T8H, vs2, 11);
 5615     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5616     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5617 
 5618     // level 3
 5619     // From level 3 upwards coefficients occur in discrete blocks whose size is
 5620     // some multiple of 32 so can be loaded using ldpq and suitable indexes.
 5621 
 5622     int offsets2[4] = { 0, 64, 128, 192 };
 5623     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5624     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5625     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5626     vs_subv(vs1, __ T8H, vs1, vs2);
 5627     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5628     load64shorts(vs2, zetas);
 5629     vs_ldpq(vq, kyberConsts);
 5630     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5631     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5632 
 5633     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5634     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5635     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5636     vs_subv(vs1, __ T8H, vs1, vs2);
 5637     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5638     load64shorts(vs2, zetas);
 5639     vs_ldpq(vq, kyberConsts);
 5640     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5641     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5642 
 5643     // level 4
 5644 
 5645     int offsets1[4] = { 0, 32, 128, 160 };
 5646     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5647     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5648     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5649     vs_subv(vs1, __ T8H, vs1, vs2);
 5650     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5651     load64shorts(vs2, zetas);
 5652     vs_ldpq(vq, kyberConsts);
 5653     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5654     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5655 
 5656     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5657     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5658     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5659     vs_subv(vs1, __ T8H, vs1, vs2);
 5660     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5661     load64shorts(vs2, zetas);
 5662     vs_ldpq(vq, kyberConsts);
 5663     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5664     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5665 
 5666     // level 5
 5667 
 5668     __ add(tmpAddr, coeffs, 0);
 5669     load64shorts(vs1, tmpAddr);
 5670     __ add(tmpAddr, coeffs, 128);
 5671     load64shorts(vs2, tmpAddr);
 5672     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5673     vs_subv(vs1, __ T8H, vs1, vs2);
 5674     __ add(tmpAddr, coeffs, 0);
 5675     store64shorts(vs3, tmpAddr);
 5676     load64shorts(vs2, zetas);
 5677     vs_ldpq(vq, kyberConsts);
 5678     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5679     __ add(tmpAddr, coeffs, 128);
 5680     store64shorts(vs2, tmpAddr);
 5681 
 5682     load64shorts(vs1, tmpAddr);
 5683     __ add(tmpAddr, coeffs, 384);
 5684     load64shorts(vs2, tmpAddr);
 5685     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5686     vs_subv(vs1, __ T8H, vs1, vs2);
 5687     __ add(tmpAddr, coeffs, 256);
 5688     store64shorts(vs3, tmpAddr);
 5689     load64shorts(vs2, zetas);
 5690     vs_ldpq(vq, kyberConsts);
 5691     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5692     __ add(tmpAddr, coeffs, 384);
 5693     store64shorts(vs2, tmpAddr);
 5694 
 5695     // Barrett reduction at indexes where overflow may happen
 5696 
 5697     // load q and the multiplier for the Barrett reduction
 5698     __ add(tmpAddr, kyberConsts, 16);
 5699     vs_ldpq(vq, tmpAddr);
 5700 
 5701     int offsets0[2] = { 0, 256 };
 5702     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5703     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5704     vs_sshr(vs2, __ T8H, vs2, 11);
 5705     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5706     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5707 
 5708     // level 6
 5709 
 5710     __ add(tmpAddr, coeffs, 0);
 5711     load64shorts(vs1, tmpAddr);
 5712     __ add(tmpAddr, coeffs, 256);
 5713     load64shorts(vs2, tmpAddr);
 5714     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5715     vs_subv(vs1, __ T8H, vs1, vs2);
 5716     __ add(tmpAddr, coeffs, 0);
 5717     store64shorts(vs3, tmpAddr);
 5718     load64shorts(vs2, zetas);
 5719     vs_ldpq(vq, kyberConsts);
 5720     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5721     __ add(tmpAddr, coeffs, 256);
 5722     store64shorts(vs2, tmpAddr);
 5723 
 5724     __ add(tmpAddr, coeffs, 128);
 5725     load64shorts(vs1, tmpAddr);
 5726     __ add(tmpAddr, coeffs, 384);
 5727     load64shorts(vs2, tmpAddr);
 5728     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5729     vs_subv(vs1, __ T8H, vs1, vs2);
 5730     __ add(tmpAddr, coeffs, 128);
 5731     store64shorts(vs3, tmpAddr);
 5732     load64shorts(vs2, zetas);
 5733     vs_ldpq(vq, kyberConsts);
 5734     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5735     __ add(tmpAddr, coeffs, 384);
 5736     store64shorts(vs2, tmpAddr);
 5737 
 5738     // multiply by 2^-n
 5739 
 5740     // load toMont(2^-n mod q)
 5741     __ add(tmpAddr, kyberConsts, 48);
 5742     __ ldr(v29, __ Q, tmpAddr);
 5743 
 5744     vs_ldpq(vq, kyberConsts);
 5745     __ add(tmpAddr, coeffs, 0);
 5746     load64shorts(vs1, tmpAddr);
 5747     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5748     __ add(tmpAddr, coeffs, 0);
 5749     store64shorts(vs2, tmpAddr);
 5750 
  5751     // now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
 5752     load64shorts(vs1, tmpAddr);
 5753     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5754     __ add(tmpAddr, coeffs, 128);
 5755     store64shorts(vs2, tmpAddr);
 5756 
 5757     // now tmpAddr contains coeffs + 256
 5758     load64shorts(vs1, tmpAddr);
 5759     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5760     __ add(tmpAddr, coeffs, 256);
 5761     store64shorts(vs2, tmpAddr);
 5762 
 5763     // now tmpAddr contains coeffs + 384
 5764     load64shorts(vs1, tmpAddr);
 5765     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5766     __ add(tmpAddr, coeffs, 384);
 5767     store64shorts(vs2, tmpAddr);
 5768 
 5769     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5770     __ mov(r0, zr); // return 0
 5771     __ ret(lr);
 5772 
 5773     return start;
 5774   }
 5775 
 5776   // Kyber multiply polynomials in the NTT domain.
 5777   // Implements
 5778   // static int implKyberNttMult(
 5779   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5780   //
 5781   // result (short[256]) = c_rarg0
 5782   // ntta (short[256]) = c_rarg1
 5783   // nttb (short[256]) = c_rarg2
 5784   // zetas (short[128]) = c_rarg3
 5785   address generate_kyberNttMult() {
 5786 
 5787     __ align(CodeEntryAlignment);
 5788     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 5789     StubCodeMark mark(this, stub_id);
 5790     address start = __ pc();
 5791     __ enter();
 5792 
 5793     const Register result = c_rarg0;
 5794     const Register ntta = c_rarg1;
 5795     const Register nttb = c_rarg2;
 5796     const Register zetas = c_rarg3;
 5797 
 5798     const Register kyberConsts = r10;
 5799     const Register limit = r11;
 5800 
 5801     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 5802     VSeq<4> vs3(16), vs4(20);
 5803     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5804     VSeq<2> vz(28);          // pair of zetas
 5805     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5806 
 5807     __ lea(kyberConsts,
 5808              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5809 
 5810     Label kyberNttMult_loop;
 5811 
 5812     __ add(limit, result, 512);
 5813 
 5814     // load q and qinv
 5815     vs_ldpq(vq, kyberConsts);
 5816 
 5817     // load R^2 mod q (to convert back from Montgomery representation)
 5818     __ add(kyberConsts, kyberConsts, 64);
 5819     __ ldr(v27, __ Q, kyberConsts);
 5820 
 5821     __ BIND(kyberNttMult_loop);
 5822 
 5823     // load 16 zetas
 5824     vs_ldpq_post(vz, zetas);
 5825 
 5826     // load 2 sets of 32 coefficients from the two input arrays
  5827     // interleaved as shorts, i.e. pairs of shorts adjacent in memory
 5828     // are striped across pairs of vector registers
 5829     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5830     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5831     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5832     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
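           // The steps below implement the Kyber base multiplication in
           // Z_q[X]/(X^2 - zeta) on 16 coefficient pairs at a time, i.e.
           //   r0 = a0*b0 + a1*b1*zeta
           //   r1 = a0*b1 + a1*b0
           // with each product computed as a montmul and a final montmul by
           // montRSquareModQ (loaded above) to convert the results back from
           // Montgomery representation.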
 5833 
 5834     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5835     // i.e. montmul the first and second halves of vs1 in order and
 5836     // then with one sequence reversed storing the two results in vs3
 5837     //
 5838     // vs3[0] <- montmul(a0, b0)
 5839     // vs3[1] <- montmul(a1, b1)
 5840     // vs3[2] <- montmul(a0, b1)
 5841     // vs3[3] <- montmul(a1, b0)
 5842     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5843     kyber_montmul16(vs_back(vs3),
 5844                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5845 
 5846     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5847     // i.e. montmul the first and second halves of vs4 in order and
 5848     // then with one sequence reversed storing the two results in vs1
 5849     //
 5850     // vs1[0] <- montmul(a2, b2)
 5851     // vs1[1] <- montmul(a3, b3)
 5852     // vs1[2] <- montmul(a2, b3)
 5853     // vs1[3] <- montmul(a3, b2)
 5854     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5855     kyber_montmul16(vs_back(vs1),
 5856                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5857 
  5858     // montmul the second result of each cross-product, i.e. (a1*b1, a3*b3), by a zeta.
 5859     // We can schedule two montmuls at a time if we use a suitable vector
 5860     // sequence <vs3[1], vs1[1]>.
 5861     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5862     VSeq<2> vs5(vs3[1], delta);
 5863 
 5864     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5865     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5866     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5867 
 5868     // add results in pairs storing in vs3
 5869     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5870     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5871     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5872 
 5873     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5874     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5875     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 5876 
 5877     // vs1 <- montmul(vs3, montRSquareModQ)
 5878     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5879 
 5880     // store back the two pairs of result vectors de-interleaved as 8H elements
  5881     // i.e. each pair of shorts striped across a register pair is stored
  5882     // adjacent in memory
 5883     vs_st2_post(vs1, __ T8H, result);
 5884 
 5885     __ cmp(result, limit);
 5886     __ br(Assembler::NE, kyberNttMult_loop);
 5887 
 5888     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5889     __ mov(r0, zr); // return 0
 5890     __ ret(lr);
 5891 
 5892     return start;
 5893   }
 5894 
 5895   // Kyber add 2 polynomials.
 5896   // Implements
 5897   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5898   //
 5899   // result (short[256]) = c_rarg0
 5900   // a (short[256]) = c_rarg1
 5901   // b (short[256]) = c_rarg2
 5902   address generate_kyberAddPoly_2() {
 5903 
 5904     __ align(CodeEntryAlignment);
 5905     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 5906     StubCodeMark mark(this, stub_id);
 5907     address start = __ pc();
 5908     __ enter();
 5909 
 5910     const Register result = c_rarg0;
 5911     const Register a = c_rarg1;
 5912     const Register b = c_rarg2;
 5913 
 5914     const Register kyberConsts = r11;
 5915 
 5916     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5917     // So, we can load, add and store the data in 3 groups of 11,
 5918     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5919     // registers. A further constraint is that the mapping needs
 5920     // to skip callee saves. So, we allocate the register
 5921     // sequences using two 8 sequences, two 2 sequences and two
 5922     // single registers.
 5923     VSeq<8> vs1_1(0);
 5924     VSeq<2> vs1_2(16);
 5925     FloatRegister vs1_3 = v28;
 5926     VSeq<8> vs2_1(18);
 5927     VSeq<2> vs2_2(26);
 5928     FloatRegister vs2_3 = v29;
 5929 
 5930     // two constant vector sequences
 5931     VSeq<8> vc_1(31, 0);
 5932     VSeq<2> vc_2(31, 0);
 5933 
 5934     FloatRegister vc_3 = v31;
 5935     __ lea(kyberConsts,
 5936              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5937 
 5938     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
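           // Each iteration of the loop below computes, lane by lane,
           //   result[i] = a[i] + b[i] + q
           // (q is broadcast via vc_1/vc_2/vc_3) for the next 80 or 88 shorts.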
 5939     for (int i = 0; i < 3; i++) {
 5940       // load 80 or 88 values from a into vs1_1/2/3
 5941       vs_ldpq_post(vs1_1, a);
 5942       vs_ldpq_post(vs1_2, a);
 5943       if (i < 2) {
 5944         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5945       }
 5946       // load 80 or 88 values from b into vs2_1/2/3
 5947       vs_ldpq_post(vs2_1, b);
 5948       vs_ldpq_post(vs2_2, b);
 5949       if (i < 2) {
 5950         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5951       }
 5952       // sum 80 or 88 values across vs1 and vs2 into vs1
 5953       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5954       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5955       if (i < 2) {
 5956         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5957       }
 5958       // add constant to all 80 or 88 results
 5959       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5960       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5961       if (i < 2) {
 5962         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5963       }
 5964       // store 80 or 88 values
 5965       vs_stpq_post(vs1_1, result);
 5966       vs_stpq_post(vs1_2, result);
 5967       if (i < 2) {
 5968         __ str(vs1_3, __ Q, __ post(result, 16));
 5969       }
 5970     }
 5971 
 5972     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5973     __ mov(r0, zr); // return 0
 5974     __ ret(lr);
 5975 
 5976     return start;
 5977   }
 5978 
 5979   // Kyber add 3 polynomials.
 5980   // Implements
 5981   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 5982   //
 5983   // result (short[256]) = c_rarg0
 5984   // a (short[256]) = c_rarg1
 5985   // b (short[256]) = c_rarg2
 5986   // c (short[256]) = c_rarg3
 5987   address generate_kyberAddPoly_3() {
 5988 
 5989     __ align(CodeEntryAlignment);
 5990     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 5991     StubCodeMark mark(this, stub_id);
 5992     address start = __ pc();
 5993     __ enter();
 5994 
 5995     const Register result = c_rarg0;
 5996     const Register a = c_rarg1;
 5997     const Register b = c_rarg2;
 5998     const Register c = c_rarg3;
 5999 
 6000     const Register kyberConsts = r11;
 6001 
 6002     // As above we sum 256 sets of values in total i.e. 32 x 8H
 6003     // quadwords.  So, we can load, add and store the data in 3
 6004     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6005     // of 10 or 11 registers. A further constraint is that the
 6006     // mapping needs to skip callee saves. So, we allocate the
 6007     // register sequences using two 8 sequences, two 2 sequences
 6008     // and two single registers.
 6009     VSeq<8> vs1_1(0);
 6010     VSeq<2> vs1_2(16);
 6011     FloatRegister vs1_3 = v28;
 6012     VSeq<8> vs2_1(18);
 6013     VSeq<2> vs2_2(26);
 6014     FloatRegister vs2_3 = v29;
 6015 
 6016     // two constant vector sequences
 6017     VSeq<8> vc_1(31, 0);
 6018     VSeq<2> vc_2(31, 0);
 6019 
 6020     FloatRegister vc_3 = v31;
 6021 
 6022     __ lea(kyberConsts,
 6023              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6024 
 6025     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
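           // As above, each iteration of the loop below computes
           //   result[i] = a[i] + b[i] + c[i] + q
           // for the next 80 or 88 shorts.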
 6026     for (int i = 0; i < 3; i++) {
 6027       // load 80 or 88 values from a into vs1_1/2/3
 6028       vs_ldpq_post(vs1_1, a);
 6029       vs_ldpq_post(vs1_2, a);
 6030       if (i < 2) {
 6031         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6032       }
 6033       // load 80 or 88 values from b into vs2_1/2/3
 6034       vs_ldpq_post(vs2_1, b);
 6035       vs_ldpq_post(vs2_2, b);
 6036       if (i < 2) {
 6037         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6038       }
 6039       // sum 80 or 88 values across vs1 and vs2 into vs1
 6040       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6041       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6042       if (i < 2) {
 6043         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6044       }
 6045       // load 80 or 88 values from c into vs2_1/2/3
 6046       vs_ldpq_post(vs2_1, c);
 6047       vs_ldpq_post(vs2_2, c);
 6048       if (i < 2) {
 6049         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6050       }
 6051       // sum 80 or 88 values across vs1 and vs2 into vs1
 6052       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6053       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6054       if (i < 2) {
 6055         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6056       }
 6057       // add constant to all 80 or 88 results
 6058       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6059       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6060       if (i < 2) {
 6061         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6062       }
 6063       // store 80 or 88 values
 6064       vs_stpq_post(vs1_1, result);
 6065       vs_stpq_post(vs1_2, result);
 6066       if (i < 2) {
 6067         __ str(vs1_3, __ Q, __ post(result, 16));
 6068       }
 6069     }
 6070 
 6071     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6072     __ mov(r0, zr); // return 0
 6073     __ ret(lr);
 6074 
 6075     return start;
 6076   }
 6077 
 6078   // Kyber parse XOF output to polynomial coefficient candidates
 6079   // or decodePoly(12, ...).
 6080   // Implements
 6081   // static int implKyber12To16(
 6082   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6083   //
 6084   // (parsedLength or (parsedLength - 48) must be divisible by 64.)
 6085   //
 6086   // condensed (byte[]) = c_rarg0
 6087   // condensedIndex = c_rarg1
 6088   // parsed (short[112 or 256]) = c_rarg2
 6089   // parsedLength (112 or 256) = c_rarg3
 6090   address generate_kyber12To16() {
 6091     Label L_F00, L_loop, L_end;
 6092 
 6093     __ align(CodeEntryAlignment);
 6094     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6095     StubCodeMark mark(this, stub_id);
 6096     address start = __ pc();
 6097     __ enter();
 6098 
 6099     const Register condensed = c_rarg0;
 6100     const Register condensedOffs = c_rarg1;
 6101     const Register parsed = c_rarg2;
 6102     const Register parsedLength = c_rarg3;
 6103 
 6104     const Register tmpAddr = r11;
 6105 
  6106     // Data is input 96 bytes at a time, i.e. in groups of 6 x 16B
  6107     // quadwords, so we need a 6 vector sequence for the inputs.
 6108     // Parsing produces 64 shorts, employing two 8 vector
 6109     // sequences to store and combine the intermediate data.
 6110     VSeq<6> vin(24);
 6111     VSeq<8> va(0), vb(16);
 6112 
 6113     __ adr(tmpAddr, L_F00);
 6114     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6115     __ add(condensed, condensed, condensedOffs);
 6116 
 6117     __ BIND(L_loop);
 6118     // load 96 (6 x 16B) byte values
 6119     vs_ld3_post(vin, __ T16B, condensed);
 6120 
 6121     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6122     // holds 48 (16x3) contiguous bytes from memory striped
 6123     // horizontally across each of the 16 byte lanes. Equivalently,
 6124     // that is 16 pairs of 12-bit integers. Likewise the back half
 6125     // holds the next 48 bytes in the same arrangement.
 6126 
 6127     // Each vector in the front half can also be viewed as a vertical
 6128     // strip across the 16 pairs of 12 bit integers. Each byte in
 6129     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6130     // byte in vin[1] stores the high 4 bits of the first int and the
 6131     // low 4 bits of the second int. Each byte in vin[2] stores the
 6132     // high 8 bits of the second int. Likewise the vectors in second
 6133     // half.
 6134 
 6135     // Converting the data to 16-bit shorts requires first of all
 6136     // expanding each of the 6 x 16B vectors into 6 corresponding
 6137     // pairs of 8H vectors. Mask, shift and add operations on the
 6138     // resulting vector pairs can be used to combine 4 and 8 bit
 6139     // parts of related 8H vector elements.
 6140     //
 6141     // The middle vectors (vin[2] and vin[5]) are actually expanded
 6142     // twice, one copy manipulated to provide the lower 4 bits
 6143     // belonging to the first short in a pair and another copy
 6144     // manipulated to provide the higher 4 bits belonging to the
  6145     // second short in a pair. This is why the vector sequences va
 6146     // and vb used to hold the expanded 8H elements are of length 8.
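           // In scalar terms, for each input byte triple (b0, b1, b2) the two
           // output shorts are
           //   s0 = b0 | ((b1 & 0xf) << 8)
           //   s1 = (b1 >> 4) | (b2 << 4)
           // and the vector code below produces 64 such pairs per iteration.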
 6147 
 6148     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6149     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6150     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6151     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6152     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6153     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6154     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6155     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6156 
 6157     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6158     // and vb[4:5]
 6159     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6160     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6161     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6162     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6163     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6164     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6165 
 6166     // shift lo byte of copy 1 of the middle stripe into the high byte
 6167     __ shl(va[2], __ T8H, va[2], 8);
 6168     __ shl(va[3], __ T8H, va[3], 8);
 6169     __ shl(vb[2], __ T8H, vb[2], 8);
 6170     __ shl(vb[3], __ T8H, vb[3], 8);
 6171 
 6172     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6173     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6174     // are in bit positions [4..11].
 6175     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6176     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6177     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6178     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6179 
 6180     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6181     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6182     // copy2
 6183     __ andr(va[2], __ T16B, va[2], v31);
 6184     __ andr(va[3], __ T16B, va[3], v31);
 6185     __ ushr(va[4], __ T8H, va[4], 4);
 6186     __ ushr(va[5], __ T8H, va[5], 4);
 6187     __ andr(vb[2], __ T16B, vb[2], v31);
 6188     __ andr(vb[3], __ T16B, vb[3], v31);
 6189     __ ushr(vb[4], __ T8H, vb[4], 4);
 6190     __ ushr(vb[5], __ T8H, vb[5], 4);
 6191 
 6192     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6193     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6194     // n.b. the ordering ensures: i) inputs are consumed before they
 6195     // are overwritten ii) the order of 16-bit results across successive
 6196     // pairs of vectors in va and then vb reflects the order of the
 6197     // corresponding 12-bit inputs
 6198     __ addv(va[0], __ T8H, va[0], va[2]);
 6199     __ addv(va[2], __ T8H, va[1], va[3]);
 6200     __ addv(va[1], __ T8H, va[4], va[6]);
 6201     __ addv(va[3], __ T8H, va[5], va[7]);
 6202     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6203     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6204     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6205     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6206 
 6207     // store 64 results interleaved as shorts
 6208     vs_st2_post(vs_front(va), __ T8H, parsed);
 6209     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6210 
 6211     __ sub(parsedLength, parsedLength, 64);
 6212     __ cmp(parsedLength, (u1)64);
 6213     __ br(Assembler::GE, L_loop);
 6214     __ cbz(parsedLength, L_end);
 6215 
  6216     // If anything is left it should be a final 72 bytes of input,
  6217     // i.e. a final 48 12-bit values. So we handle this by loading
  6218     // 48 bytes into all 16B lanes of front(vin) and only 24
  6219     // bytes into the lower 8B lanes of back(vin).
 6220     vs_ld3_post(vs_front(vin), __ T16B, condensed);
 6221     vs_ld3(vs_back(vin), __ T8B, condensed);
 6222 
 6223     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6224     // n.b. target elements 2 and 3 of va duplicate elements 4 and
 6225     // 5 and target element 2 of vb duplicates element 4.
 6226     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6227     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6228     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6229     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6230     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6231     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6232 
 6233     // This time expand just the lower 8 lanes
 6234     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6235     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6236     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6237 
 6238     // shift lo byte of copy 1 of the middle stripe into the high byte
 6239     __ shl(va[2], __ T8H, va[2], 8);
 6240     __ shl(va[3], __ T8H, va[3], 8);
 6241     __ shl(vb[2], __ T8H, vb[2], 8);
 6242 
 6243     // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
 6244     // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
 6245     // int are in bit positions [4..11].
 6246     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6247     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6248     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6249 
 6250     // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
 6251     // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
 6252     // copy2
 6253     __ andr(va[2], __ T16B, va[2], v31);
 6254     __ andr(va[3], __ T16B, va[3], v31);
 6255     __ ushr(va[4], __ T8H, va[4], 4);
 6256     __ ushr(va[5], __ T8H, va[5], 4);
 6257     __ andr(vb[2], __ T16B, vb[2], v31);
 6258     __ ushr(vb[4], __ T8H, vb[4], 4);
  6259 
  6262     // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
  6263     // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
 6265     // n.b. ordering ensures: i) inputs are consumed before they are
  6266     // overwritten ii) order of 16-bit results across successive
 6267     // pairs of vectors in va and then lower half of vb reflects order
 6268     // of corresponding 12-bit inputs
 6269     __ addv(va[0], __ T8H, va[0], va[2]);
 6270     __ addv(va[2], __ T8H, va[1], va[3]);
 6271     __ addv(va[1], __ T8H, va[4], va[6]);
 6272     __ addv(va[3], __ T8H, va[5], va[7]);
 6273     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6274     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6275 
 6276     // store 48 results interleaved as shorts
 6277     vs_st2_post(vs_front(va), __ T8H, parsed);
 6278     vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
 6279 
 6280     __ BIND(L_end);
 6281 
 6282     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6283     __ mov(r0, zr); // return 0
 6284     __ ret(lr);
 6285 
 6286     // bind label and generate constant data used by this stub
 6287     __ BIND(L_F00);
 6288     __ emit_int64(0x0f000f000f000f00);
 6289     __ emit_int64(0x0f000f000f000f00);
 6290 
 6291     return start;
 6292   }
 6293 
 6294   // Kyber Barrett reduce function.
 6295   // Implements
 6296   // static int implKyberBarrettReduce(short[] coeffs) {}
 6297   //
 6298   // coeffs (short[256]) = c_rarg0
 6299   address generate_kyberBarrettReduce() {
 6300 
 6301     __ align(CodeEntryAlignment);
 6302     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6303     StubCodeMark mark(this, stub_id);
 6304     address start = __ pc();
 6305     __ enter();
 6306 
 6307     const Register coeffs = c_rarg0;
 6308 
 6309     const Register kyberConsts = r10;
 6310     const Register result = r11;
 6311 
 6312     // As above we process 256 sets of values in total i.e. 32 x
  6313     // 8H quadwords. So, we can load, process and store the data in 3
 6314     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6315     // of 10 or 11 registers. A further constraint is that the
 6316     // mapping needs to skip callee saves. So, we allocate the
 6317     // register sequences using two 8 sequences, two 2 sequences
 6318     // and two single registers.
 6319     VSeq<8> vs1_1(0);
 6320     VSeq<2> vs1_2(16);
 6321     FloatRegister vs1_3 = v28;
 6322     VSeq<8> vs2_1(18);
 6323     VSeq<2> vs2_2(26);
 6324     FloatRegister vs2_3 = v29;
 6325 
 6326     // we also need a pair of corresponding constant sequences
 6327 
 6328     VSeq<8> vc1_1(30, 0);
 6329     VSeq<2> vc1_2(30, 0);
 6330     FloatRegister vc1_3 = v30; // for kyber_q
 6331 
 6332     VSeq<8> vc2_1(31, 0);
 6333     VSeq<2> vc2_2(31, 0);
 6334     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6335 
 6336     __ add(result, coeffs, 0);
 6337     __ lea(kyberConsts,
 6338              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6339 
 6340     // load q and the multiplier for the Barrett reduction
 6341     __ add(kyberConsts, kyberConsts, 16);
 6342     __ ldpq(vc1_3, vc2_3, kyberConsts);
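           // vc1_* now broadcast kyber_q and vc2_* the Barrett multiplier,
           // which we take to approximate 2^26 / q so that
           // (x * multiplier) >> 26 is close to x / q (see the step comments
           // in the loop below).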
 6343 
 6344     for (int i = 0; i < 3; i++) {
 6345       // load 80 or 88 coefficients
 6346       vs_ldpq_post(vs1_1, coeffs);
 6347       vs_ldpq_post(vs1_2, coeffs);
 6348       if (i < 2) {
 6349         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6350       }
 6351 
 6352       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6353       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6354       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6355       if (i < 2) {
 6356         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6357       }
 6358 
 6359       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6360       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6361       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6362       if (i < 2) {
 6363         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6364       }
 6365 
 6366       // vs1 <- vs1 - vs2 * kyber_q
 6367       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6368       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6369       if (i < 2) {
 6370         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6371       }
 6372 
 6373       vs_stpq_post(vs1_1, result);
 6374       vs_stpq_post(vs1_2, result);
 6375       if (i < 2) {
 6376         __ str(vs1_3, __ Q, __ post(result, 16));
 6377       }
 6378     }
 6379 
 6380     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6381     __ mov(r0, zr); // return 0
 6382     __ ret(lr);
 6383 
 6384     return start;
 6385   }
 6386 
 6387 
 6388   // Dilithium-specific montmul helper routines that generate parallel
 6389   // code for, respectively, a single 4x4s vector sequence montmul or
 6390   // two such multiplies in a row.
 6391 
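         //
         // The lane arrangement used by these helpers is 4S, i.e. each
         // montmul operates on 32-bit values; presumably R = 2^32 with the
         // Dilithium modulus q = 8380417. The actual constants (qInv, q) are
         // loaded by the callers from the _dilithiumConsts table.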
 6392   // Perform 16 32-bit Montgomery multiplications in parallel
 6393   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6394                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6395     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6396     // It will assert that the register use is valid
 6397     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6398   }
 6399 
 6400   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6401   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6402                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6403     // Schedule two successive 4x4S multiplies via the montmul helper
 6404     // on the front and back halves of va, vb and vc. The helper will
 6405     // assert that the register use has no overlap conflicts on each
 6406     // individual call but we also need to ensure that the necessary
 6407     // disjoint/equality constraints are met across both calls.
 6408 
 6409     // vb, vc, vtmp and vq must be disjoint. va must either be
 6410     // disjoint from all other registers or equal vc
 6411 
 6412     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6413     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6414     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6415 
 6416     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6417     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6418 
 6419     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6420 
 6421     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6422     assert(vs_disjoint(va, vb), "va and vb overlap");
 6423     assert(vs_disjoint(va, vq), "va and vq overlap");
 6424     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6425 
 6426     // We multiply the front and back halves of each sequence 4 at a
 6427     // time because
 6428     //
 6429     // 1) we are currently only able to get 4-way instruction
 6430     // parallelism at best
 6431     //
 6432     // 2) we need registers for the constants in vq and temporary
 6433     // scratch registers to hold intermediate results so vtmp can only
 6434     // be a VSeq<4> which means we only have 4 scratch slots.
 6435 
 6436     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6437     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6438   }
 6439 
 6440   // Perform combined montmul then add/sub on 4x4S vectors.
 6441   void dilithium_montmul16_sub_add(
 6442           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6443           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6444     // compute a = montmul(a1, c)
 6445     dilithium_montmul16(vc, va1, vc, vtmp, vq);
  6446     // output a1 = a0 - a
 6447     vs_subv(va1, __ T4S, va0, vc);
 6448     //    and a0 = a0 + a
 6449     vs_addv(va0, __ T4S, va0, vc);
 6450   }
 6451 
  6452   // Perform combined add/sub then montmul on 4x4S vectors.
 6453   void dilithium_sub_add_montmul16(
 6454           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6455           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6456     // compute c = a0 - a1
 6457     vs_subv(vtmp1, __ T4S, va0, va1);
 6458     // output a0 = a0 + a1
 6459     vs_addv(va0, __ T4S, va0, va1);
 6460     // output a1 = b montmul c
 6461     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6462   }
 6463 
 6464   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6465   // in the Java implementation come in sequences of at least 8, so we
 6466   // can use ldpq to collect the corresponding data into pairs of vector
 6467   // registers.
 6468   // We collect the coefficients corresponding to the 'j+l' indexes into
 6469   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6470   // then we do the (Montgomery) multiplications by the zetas in parallel
 6471   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6472   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6473   // v0-v7 and finally save the results back to the coeffs array.
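         // In Java-like terms (a sketch) each step performs the Cooley-Tukey
         // butterfly
         //   a             = montMul(zeta, coeffs[j + l]);
         //   coeffs[j + l] = coeffs[j] - a;
         //   coeffs[j]     = coeffs[j] + a;
         // on 32 lanes at a time.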
 6474   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6475     const Register coeffs, const Register zetas) {
 6476     int c1 = 0;
 6477     int c2 = 512;
 6478     int startIncr;
 6479     // don't use callee save registers v8 - v15
 6480     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6481     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6482     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6483     int offsets[4] = { 0, 32, 64, 96 };
 6484 
 6485     for (int level = 0; level < 5; level++) {
 6486       int c1Start = c1;
 6487       int c2Start = c2;
 6488       if (level == 3) {
 6489         offsets[1] = 32;
 6490         offsets[2] = 128;
 6491         offsets[3] = 160;
 6492       } else if (level == 4) {
 6493         offsets[1] = 64;
 6494         offsets[2] = 128;
 6495         offsets[3] = 192;
 6496       }
 6497 
 6498       // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
 6499       // time at 4 different offsets and multiply them in order by the
 6500       // next set of input values. So we employ indexed load and store
 6501       // pair instructions with arrangement 4S.
 6502       for (int i = 0; i < 4; i++) {
 6503         // reload q and qinv
 6504         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6505         // load 8x4S coefficients via second start pos == c2
 6506         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6507         // load next 8x4S inputs == b
 6508         vs_ldpq_post(vs2, zetas);
 6509         // compute a == c2 * b mod MONT_Q
 6510         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6511         // load 8x4s coefficients via first start pos == c1
 6512         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6513         // compute a1 =  c1 + a
 6514         vs_addv(vs3, __ T4S, vs1, vs2);
 6515         // compute a2 =  c1 - a
 6516         vs_subv(vs1, __ T4S, vs1, vs2);
 6517         // output a1 and a2
 6518         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6519         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6520 
 6521         int k = 4 * level + i;
 6522 
 6523         if (k > 7) {
 6524           startIncr = 256;
 6525         } else if (k == 5) {
 6526           startIncr = 384;
 6527         } else {
 6528           startIncr = 128;
 6529         }
 6530 
 6531         c1Start += startIncr;
 6532         c2Start += startIncr;
 6533       }
 6534 
 6535       c2 /= 2;
 6536     }
 6537   }
 6538 
 6539   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6540   // Implements the method
 6541   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
 6542   // of the Java class sun.security.provider
 6543   //
 6544   // coeffs (int[256]) = c_rarg0
 6545   // zetas (int[256]) = c_rarg1
 6546   address generate_dilithiumAlmostNtt() {
 6547 
 6548     __ align(CodeEntryAlignment);
 6549     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 6550     StubCodeMark mark(this, stub_id);
 6551     address start = __ pc();
 6552     __ enter();
 6553 
 6554     const Register coeffs = c_rarg0;
 6555     const Register zetas = c_rarg1;
 6556 
 6557     const Register tmpAddr = r9;
 6558     const Register dilithiumConsts = r10;
 6559     const Register result = r11;
 6560     // don't use callee save registers v8 - v15
 6561     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6562     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6563     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6564     int offsets[4] = { 0, 32, 64, 96};
 6565     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6566     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6567     __ add(result, coeffs, 0);
 6568     __ lea(dilithiumConsts,
 6569              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6570 
 6571     // Each level represents one iteration of the outer for loop of the Java version.
 6572 
 6573     // level 0-4
 6574     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6575 
 6576     // level 5
 6577 
 6578     // At level 5 the coefficients we need to combine with the zetas
 6579     // are grouped in memory in blocks of size 4. So, for both sets of
 6580     // coefficients we load 4 adjacent values at 8 different offsets
 6581     // using an indexed ldr with register variant Q and multiply them
 6582     // in sequence order by the next set of inputs. Likewise we store
  6583     // the results using an indexed str with register variant Q.
 6584     for (int i = 0; i < 1024; i += 256) {
 6585       // reload constants q, qinv each iteration as they get clobbered later
 6586       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6587       // load 32 (8x4S) coefficients via first offsets = c1
 6588       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6589       // load next 32 (8x4S) inputs = b
 6590       vs_ldpq_post(vs2, zetas);
  6591       // a = b montmul c1
 6592       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6593       // load 32 (8x4S) coefficients via second offsets = c2
 6594       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6595       // add/sub with result of multiply
  6596       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
  6597       vs_subv(vs1, __ T4S, vs1, vs2);     // a0 = c2 - a
 6598       // write back new coefficients using same offsets
 6599       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6600       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6601     }
 6602 
 6603     // level 6
 6604     // At level 6 the coefficients we need to combine with the zetas
 6605     // are grouped in memory in pairs, the first two being montmul
 6606     // inputs and the second add/sub inputs. We can still implement
 6607     // the montmul+sub+add using 4-way parallelism but only if we
 6608     // combine the coefficients with the zetas 16 at a time. We load 8
 6609     // adjacent values at 4 different offsets using an ld2 load with
 6610     // arrangement 2D. That interleaves the lower and upper halves of
 6611     // each pair of quadwords into successive vector registers. We
 6612     // then need to montmul the 4 even elements of the coefficients
 6613     // register sequence by the zetas in order and then add/sub the 4
 6614     // odd elements of the coefficients register sequence. We use an
 6615     // equivalent st2 operation to store the results back into memory
 6616     // de-interleaved.
 6617     for (int i = 0; i < 1024; i += 128) {
 6618       // reload constants q, qinv each iteration as they get clobbered later
 6619       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6620       // load interleaved 16 (4x2D) coefficients via offsets
 6621       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6622       // load next 16 (4x4S) inputs
 6623       vs_ldpq_post(vs_front(vs2), zetas);
 6624       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6625       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6626                                   vs_front(vs2), vtmp, vq);
 6627       // store interleaved 16 (4x2D) coefficients via offsets
 6628       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6629     }
 6630 
 6631     // level 7
 6632     // At level 7 the coefficients we need to combine with the zetas
 6633     // occur singly, with montmul inputs alternating with add/sub
 6634     // inputs. Once again we can use 4-way parallelism to combine 16
 6635     // zetas at a time. However, we have to load 8 adjacent values at
 6636     // 4 different offsets using an ld2 load with arrangement 4S. That
 6637     // interleaves the odd words of each pair into one
 6638     // coefficients vector register and the even words of the pair
 6639     // into the next register. We then need to montmul the 4 even
 6640     // elements of the coefficients register sequence by the zetas in
 6641     // order and then add/sub the 4 odd elements of the coefficients
 6642     // register sequence. We use an equivalent st2 operation to store
 6643     // the results back into memory de-interleaved.
 6644 
 6645     for (int i = 0; i < 1024; i += 128) {
 6646       // reload constants q, qinv each iteration as they get clobbered later
 6647       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6648       // load interleaved 16 (4x4S) coefficients via offsets
 6649       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6650       // load next 16 (4x4S) inputs
 6651       vs_ldpq_post(vs_front(vs2), zetas);
 6652       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6653       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6654                                   vs_front(vs2), vtmp, vq);
 6655       // store interleaved 16 (4x4S) coefficients via offsets
 6656       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6657     }
 6658     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6659     __ mov(r0, zr); // return 0
 6660     __ ret(lr);
 6661 
 6662     return start;
 6663   }
 6664 
 6665   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6666   // in the Java implementation come in sequences of at least 8, so we
 6667   // can use ldpq to collect the corresponding data into pairs of vector
 6668   // registers
 6669   // We collect the coefficients that correspond to the 'j's into vs1
 6670   // and the coefficients that correspond to the 'j+l's into vs2, then
 6671   // do the additions into vs3 and the subtractions into vs1, then
 6672   // save the result of the additions, load the zetas into vs2,
 6673   // do the (Montgomery) multiplications by zeta in parallel into vs2,
 6674   // and finally save the results back to the coeffs array.
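        //
        // As a rough scalar sketch (zeta, j and l are illustrative names,
        // not the exact Java loop variables), each iteration performs a
        // Gentleman-Sande butterfly:
        //
        //   int a = coeffs[j];                     // loaded via c1Start
        //   int b = coeffs[j + l];                 // loaded via c2Start
        //   coeffs[j]     = a + b;                 // stored via c1Start
        //   coeffs[j + l] = montMul(zeta, a - b);  // stored via c2Start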
 6675   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6676     const Register coeffs, const Register zetas) {
 6677     int c1 = 0;
 6678     int c2 = 32;
 6679     int startIncr;
 6680     int offsets[4];
 6681     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6682     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6683     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6684 
 6685     offsets[0] = 0;
 6686 
 6687     for (int level = 3; level < 8; level++) {
 6688       int c1Start = c1;
 6689       int c2Start = c2;
 6690       if (level == 3) {
 6691         offsets[1] = 64;
 6692         offsets[2] = 128;
 6693         offsets[3] = 192;
 6694       } else if (level == 4) {
 6695         offsets[1] = 32;
 6696         offsets[2] = 128;
 6697         offsets[3] = 160;
 6698       } else {
 6699         offsets[1] = 32;
 6700         offsets[2] = 64;
 6701         offsets[3] = 96;
 6702       }
 6703 
 6704       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6705       // time at 4 different offsets and multiply them in order by the
 6706       // next set of input values. So we employ indexed load and store
 6707       // pair instructions with arrangement 4S.
 6708       for (int i = 0; i < 4; i++) {
 6709         // load v1 32 (8x4S) coefficients relative to first start index
 6710         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6711         // load v2 32 (8x4S) coefficients relative to second start index
 6712         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 6713         // a0 = v1 + v2 -- n.b. clobbers vq
 6714         vs_addv(vs3, __ T4S, vs1, vs2);
 6715         // a1 = v1 - v2
 6716         vs_subv(vs1, __ T4S, vs1, vs2);
 6717         // save a0 relative to first start index
 6718         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6719         // load constants q, qinv each iteration as they get clobbered above
 6720         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6721         // load b next 32 (8x4S) inputs
 6722         vs_ldpq_post(vs2, zetas);
 6723         // a = a1 montmul b
 6724         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6725         // save a relative to second start index
 6726         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6727 
 6728         int k = 4 * level + i;
 6729 
 6730         if (k < 24) {
 6731           startIncr = 256;
 6732         } else if (k == 25) {
 6733           startIncr = 384;
 6734         } else {
 6735           startIncr = 128;
 6736         }
 6737 
 6738         c1Start += startIncr;
 6739         c2Start += startIncr;
 6740       }
 6741 
 6742       c2 *= 2;
 6743     }
 6744   }
 6745 
 6746   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6747   // Implements the method
 6748   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6749   // the sun.security.provider.ML_DSA class.
 6750   //
 6751   // coeffs (int[256]) = c_rarg0
 6752   // zetas (int[256]) = c_rarg1
 6753   address generate_dilithiumAlmostInverseNtt() {
 6754 
 6755     __ align(CodeEntryAlignment);
 6756     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 6757     StubCodeMark mark(this, stub_id);
 6758     address start = __ pc();
 6759     __ enter();
 6760 
 6761     const Register coeffs = c_rarg0;
 6762     const Register zetas = c_rarg1;
 6763 
 6764     const Register tmpAddr = r9;
 6765     const Register dilithiumConsts = r10;
 6766     const Register result = r11;
 6767     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6768     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 6769     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6770     int offsets[4] = { 0, 32, 64, 96 };
 6771     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6772     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6773 
 6774     __ add(result, coeffs, 0);
 6775     __ lea(dilithiumConsts,
 6776              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6777 
 6778     // Each level represents one iteration of the outer for loop of the Java version
 6779 
 6780     // level 0
 6781     // At level 0 we need to interleave adjacent quartets of
 6782     // coefficients before we multiply and add/sub by the next 16
 6783     // zetas just as we did for level 7 in the multiply code. So we
 6784     // load and store the values using an ld2/st2 with arrangement 4S.
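          //
          // A rough scalar model of the combined sub/add/montmul step
          // below (illustrative names only):
          //
          //   int a0 = coeffs[j];
          //   int a1 = coeffs[j + 1];
          //   coeffs[j]     = a0 + a1;
          //   coeffs[j + 1] = montMul(a0 - a1, zeta);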
 6785     for (int i = 0; i < 1024; i += 128) {
 6786       // load constants q, qinv
 6787       // n.b. this can be moved out of the loop as they do not get
 6788       // clobbered by first two loops
 6789       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6790       // a0/a1 load interleaved 32 (8x4S) coefficients
 6791       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6792       // b load next 32 (8x4S) inputs
 6793       vs_ldpq_post(vs_front(vs2), zetas);
 6794       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6795       // n.b. second half of vs2 provides temporary register storage
 6796       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6797                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6798       // a0/a1 store interleaved 32 (8x4S) coefficients
 6799       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6800     }
 6801 
 6802     // level 1
 6803     // At level 1 we need to interleave pairs of adjacent pairs of
 6804     // coefficients before we multiply by the next 16 zetas just as we
 6805     // did for level 6 in the multiply code. So we load and store the
 6806     // values using an ld2/st2 with arrangement 2D.
 6807     for (int i = 0; i < 1024; i += 128) {
 6808       // a0/a1 load interleaved 32 (8x2D) coefficients
 6809       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6810       // b load next 16 (4x4S) inputs
 6811       vs_ldpq_post(vs_front(vs2), zetas);
 6812       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6813       // n.b. second half of vs2 provides temporary register storage
 6814       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6815                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6816       // a0/a1 store interleaved 32 (8x2D) coefficients
 6817       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6818     }
 6819 
 6820     // level 2
 6821     // At level 2 coefficients come in blocks of 4. So, we load 4
 6822     // adjacent coefficients at 8 distinct offsets for both the first
 6823     // and second coefficient sequences, using an ldr with register
 6824     // variant Q then combine them with next set of 32 zetas. Likewise
 6825     // we store the results using an str with register variant Q.
 6826     for (int i = 0; i < 1024; i += 256) {
 6827       // c0 load 32 (8x4S) coefficients via first offsets
 6828       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6829       // c1 load 32 (8x4S) coefficients via second offsets
 6830       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6831       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6832       vs_addv(vs3, __ T4S, vs1, vs2);
 6833       // c = c0 - c1
 6834       vs_subv(vs1, __ T4S, vs1, vs2);
 6835       // store a0 32 (8x4S) coefficients via first offsets
 6836       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6837       // b load 32 (8x4S) next inputs
 6838       vs_ldpq_post(vs2, zetas);
 6839       // reload constants q, qinv -- they were clobbered earlier
 6840       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6841       // compute a1 = b montmul c
 6842       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6843       // store a1 32 (8x4S) coefficients via second offsets
 6844       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6845     }
 6846 
 6847     // level 3-7
 6848     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6849 
 6850     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6851     __ mov(r0, zr); // return 0
 6852     __ ret(lr);
 6853 
 6854     return start;
 6855   }
 6856 
 6857   // Dilithium multiply polynomials in the NTT domain.
 6858   // Straightforward implementation of the method
 6859   // static int implDilithiumNttMult(
 6860   //              int[] result, int[] ntta, int[] nttb) {} of
 6861   // the sun.security.provider.ML_DSA class.
 6862   //
 6863   // result (int[256]) = c_rarg0
 6864   // poly1 (int[256]) = c_rarg1
 6865   // poly2 (int[256]) = c_rarg2
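        //
        // For reference (a hedged sketch that ignores Montgomery range and
        // sign details): with montMul(a, b) == a * b * R^-1 mod q and
        // rSquare == R^2 mod q, the two chained multiplications below give
        //
        //   result[i] = montMul(rSquare, montMul(poly1[i], poly2[i]))
        //             = poly1[i] * poly2[i] mod q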
 6866   address generate_dilithiumNttMult() {
 6867 
 6868     __ align(CodeEntryAlignment);
 6869     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 6870     StubCodeMark mark(this, stub_id);
 6871     address start = __ pc();
 6872     __ enter();
 6873 
 6874     Label L_loop;
 6875 
 6876     const Register result = c_rarg0;
 6877     const Register poly1 = c_rarg1;
 6878     const Register poly2 = c_rarg2;
 6879 
 6880     const Register dilithiumConsts = r10;
 6881     const Register len = r11;
 6882 
 6883     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6884     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6885     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6886     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6887 
 6888     __ lea(dilithiumConsts,
 6889              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6890 
 6891     // load constants q, qinv
 6892     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6893     // load constant rSquare into v29
 6894     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 6895 
 6896     __ mov(len, zr);
 6897     __ add(len, len, 1024);
 6898 
 6899     __ BIND(L_loop);
 6900 
 6901     // b load 32 (8x4S) next inputs from poly1
 6902     vs_ldpq_post(vs1, poly1);
 6903     // c load 32 (8x4S) next inputs from poly2
 6904     vs_ldpq_post(vs2, poly2);
 6905     // compute a = b montmul c
 6906     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6907     // compute a = rsquare montmul a
 6908     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6909     // save a 32 (8x4S) results
 6910     vs_stpq_post(vs2, result);
 6911 
 6912     __ sub(len, len, 128);
 6913     __ cmp(len, (u1)128);
 6914     __ br(Assembler::GE, L_loop);
 6915 
 6916     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6917     __ mov(r0, zr); // return 0
 6918     __ ret(lr);
 6919 
 6920     return start;
 6921   }
 6922 
 6923   // Dilithium Montgomery multiply an array by a constant.
 6924   // A straightforward implementation of the method
 6925   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
 6926   // of the sun.security.provider.ML_DSA class
 6927   //
 6928   // coeffs (int[256]) = c_rarg0
 6929   // constant (int) = c_rarg1
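        //
        // A rough scalar model of the loop below (illustrative only):
        //
        //   for (int i = 0; i < 256; i++) {
        //     coeffs[i] = montMul(constant, coeffs[i]);
        //   }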
 6930   address generate_dilithiumMontMulByConstant() {
 6931 
 6932     __ align(CodeEntryAlignment);
 6933     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 6934     StubCodeMark mark(this, stub_id);
 6935     address start = __ pc();
 6936     __ enter();
 6937 
 6938     Label L_loop;
 6939 
 6940     const Register coeffs = c_rarg0;
 6941     const Register constant = c_rarg1;
 6942 
 6943     const Register dilithiumConsts = r10;
 6944     const Register result = r11;
 6945     const Register len = r12;
 6946 
 6947     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6948     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6949     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6950     VSeq<8> vconst(29, 0);             // for montmul by constant
 6951 
 6952     // results track inputs
 6953     __ add(result, coeffs, 0);
 6954     __ lea(dilithiumConsts,
 6955              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6956 
 6957     // load constants q, qinv -- they do not get clobbered by first two loops
 6958     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6959     // copy caller supplied constant across vconst
 6960     __ dup(vconst[0], __ T4S, constant);
 6961     __ mov(len, zr);
 6962     __ add(len, len, 1024);
 6963 
 6964     __ BIND(L_loop);
 6965 
 6966     // load next 32 inputs
 6967     vs_ldpq_post(vs2, coeffs);
 6968     // mont mul by constant
 6969     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6970     // write next 32 results
 6971     vs_stpq_post(vs2, result);
 6972 
 6973     __ sub(len, len, 128);
 6974     __ cmp(len, (u1)128);
 6975     __ br(Assembler::GE, L_loop);
 6976 
 6977     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6978     __ mov(r0, zr); // return 0
 6979     __ ret(lr);
 6980 
 6981     return start;
 6982   }
 6983 
 6984   // Dilithium decompose poly.
 6985   // Implements the method
 6986   // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {}
 6987   // of the sun.security.provider.ML_DSA class
 6988   //
 6989   // input (int[256]) = c_rarg0
 6990   // lowPart (int[256]) = c_rarg1
 6991   // highPart (int[256]) = c_rarg2
 6992   // twoGamma2  (int) = c_rarg3
 6993   // multiplier (int) = c_rarg4
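        //
        // The per-element computation follows the scalar sequence below,
        // gathered here from the comments in the loop body as a reading
        // aid (q denotes the ML-DSA modulus, qadd the rounding addend):
        //
        //   int rplus = input[m];
        //   rplus = rplus - ((rplus + qadd) >> 23) * q;
        //   rplus = rplus + ((rplus >> 31) & q);
        //   int quotient = (rplus * multiplier) >> 22;
        //   int r0 = rplus - quotient * twoGamma2;
        //   int mask = (twoGamma2 - r0) >> 22;
        //   r0 -= (mask & twoGamma2);
        //   quotient += (mask & 1);
        //   mask = (twoGamma2 / 2 - r0) >> 31;
        //   r0 -= (mask & twoGamma2);
        //   quotient += (mask & 1);
        //   int r1 = rplus - r0 - (q - 1);
        //   r1 = (r1 | (-r1)) >> 31;
        //   r0 += ~r1;
        //   r1 = r1 & quotient;
        //   lowPart[m] = r0;
        //   highPart[m] = r1;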
 6994   address generate_dilithiumDecomposePoly() {
 6995 
 6996     __ align(CodeEntryAlignment);
 6997     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 6998     StubCodeMark mark(this, stub_id);
 6999     address start = __ pc();
 7000     Label L_loop;
 7001 
 7002     const Register input = c_rarg0;
 7003     const Register lowPart = c_rarg1;
 7004     const Register highPart = c_rarg2;
 7005     const Register twoGamma2 = c_rarg3;
 7006     const Register multiplier = c_rarg4;
 7007 
 7008     const Register len = r9;
 7009     const Register dilithiumConsts = r10;
 7010     const Register tmp = r11;
 7011 
 7012     // 6 independent sets of 4x4s values
 7013     VSeq<4> vs1(0), vs2(4), vs3(8);
 7014     VSeq<4> vs4(12), vs5(16), vtmp(20);
 7015 
 7016     // 7 constants for cross-multiplying
 7017     VSeq<4> one(25, 0);
 7018     VSeq<4> qminus1(26, 0);
 7019     VSeq<4> g2(27, 0);
 7020     VSeq<4> twog2(28, 0);
 7021     VSeq<4> mult(29, 0);
 7022     VSeq<4> q(30, 0);
 7023     VSeq<4> qadd(31, 0);
 7024 
 7025     __ enter();
 7026 
 7027     __ lea(dilithiumConsts,
 7028              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7029 
 7030     // save callee-saved registers
 7031     __ stpd(v8, v9, __ pre(sp, -64));
 7032     __ stpd(v10, v11, Address(sp, 16));
 7033     __ stpd(v12, v13, Address(sp, 32));
 7034     __ stpd(v14, v15, Address(sp, 48));
 7035 
 7036     // populate constant registers
 7037     __ mov(tmp, zr);
 7038     __ add(tmp, tmp, 1);
 7039     __ dup(one[0], __ T4S, tmp); // 1
 7040     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 7041     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 7042     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 7043     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 7044     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 7045     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 7046 
 7047     __ mov(len, zr);
 7048     __ add(len, len, 1024);
 7049 
 7050     __ BIND(L_loop);
 7051 
 7052     // load next 4x4S inputs interleaved: rplus --> vs1
 7053     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7054 
 7055     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7056     vs_addv(vtmp, __ T4S, vs1, qadd);
 7057     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7058     vs_mulv(vtmp, __ T4S, vtmp, q);
 7059     vs_subv(vs1, __ T4S, vs1, vtmp);
 7060 
 7061     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7062     vs_sshr(vtmp, __ T4S, vs1, 31);
 7063     vs_andr(vtmp, vtmp, q);
 7064     vs_addv(vs1, __ T4S, vs1, vtmp);
 7065 
 7066     // quotient --> vs2
 7067     // int quotient = (rplus * multiplier) >> 22;
 7068     vs_mulv(vtmp, __ T4S, vs1, mult);
 7069     vs_sshr(vs2, __ T4S, vtmp, 22);
 7070 
 7071     // r0 --> vs3
 7072     // int r0 = rplus - quotient * twoGamma2;
 7073     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7074     vs_subv(vs3, __ T4S, vs1, vtmp);
 7075 
 7076     // mask --> vs4
 7077     // int mask = (twoGamma2 - r0) >> 22;
 7078     vs_subv(vtmp, __ T4S, twog2, vs3);
 7079     vs_sshr(vs4, __ T4S, vtmp, 22);
 7080 
 7081     // r0 -= (mask & twoGamma2);
 7082     vs_andr(vtmp, vs4, twog2);
 7083     vs_subv(vs3, __ T4S, vs3, vtmp);
 7084 
 7085     //  quotient += (mask & 1);
 7086     vs_andr(vtmp, vs4, one);
 7087     vs_addv(vs2, __ T4S, vs2, vtmp);
 7088 
 7089     // mask = (twoGamma2 / 2 - r0) >> 31;
 7090     vs_subv(vtmp, __ T4S, g2, vs3);
 7091     vs_sshr(vs4, __ T4S, vtmp, 31);
 7092 
 7093     // r0 -= (mask & twoGamma2);
 7094     vs_andr(vtmp, vs4, twog2);
 7095     vs_subv(vs3, __ T4S, vs3, vtmp);
 7096 
 7097     // quotient += (mask & 1);
 7098     vs_andr(vtmp, vs4, one);
 7099     vs_addv(vs2, __ T4S, vs2, vtmp);
 7100 
 7101     // r1 --> vs5
 7102     // int r1 = rplus - r0 - (dilithium_q - 1);
 7103     vs_subv(vtmp, __ T4S, vs1, vs3);
 7104     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7105 
 7106     // r1 --> vs1 (overwriting rplus)
 7107     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7108     vs_negr(vtmp, __ T4S, vs5);
 7109     vs_orr(vtmp, vs5, vtmp);
 7110     vs_sshr(vs1, __ T4S, vtmp, 31);
 7111 
 7112     // r0 += ~r1;
 7113     vs_notr(vtmp, vs1);
 7114     vs_addv(vs3, __ T4S, vs3, vtmp);
 7115 
 7116     // r1 = r1 & quotient;
 7117     vs_andr(vs1, vs2, vs1);
 7118 
 7119     // store results interleaved
 7120     // lowPart[m] = r0;
 7121     // highPart[m] = r1;
 7122     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7123     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7124 
 7125     __ sub(len, len, 64);
 7126     __ cmp(len, (u1)64);
 7127     __ br(Assembler::GE, L_loop);
 7128 
 7129     // restore callee-saved vector registers
 7130     __ ldpd(v14, v15, Address(sp, 48));
 7131     __ ldpd(v12, v13, Address(sp, 32));
 7132     __ ldpd(v10, v11, Address(sp, 16));
 7133     __ ldpd(v8, v9, __ post(sp, 64));
 7134 
 7135     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7136     __ mov(r0, zr); // return 0
 7137     __ ret(lr);
 7138 
 7139     return start;
 7140   }
 7141 
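        // Keccak chi step on one 5-lane row held in general purpose
        // registers. A rough scalar model, where every right-hand side
        // uses the original (pre-update) lane values -- which is why the
        // updates of a0, a1 and a4 are deferred until last:
        //
        //   a0 ^= ~a1 & a2;   a1 ^= ~a2 & a3;   a2 ^= ~a3 & a4;
        //   a3 ^= ~a4 & a0;   a4 ^= ~a0 & a1;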
 7142   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7143              Register tmp0, Register tmp1, Register tmp2) {
 7144     __ bic(tmp0, a2, a1); // for a0
 7145     __ bic(tmp1, a3, a2); // for a1
 7146     __ bic(tmp2, a4, a3); // for a2
 7147     __ eor(a2, a2, tmp2);
 7148     __ bic(tmp2, a0, a4); // for a3
 7149     __ eor(a3, a3, tmp2);
 7150     __ bic(tmp2, a1, a0); // for a4
 7151     __ eor(a0, a0, tmp0);
 7152     __ eor(a1, a1, tmp1);
 7153     __ eor(a4, a4, tmp2);
 7154   }
 7155 
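        // One round of Keccak-f[1600] on a state kept entirely in general
        // purpose registers a0..a24: theta (the eor3/rax1 column mixing),
        // rho and pi (the rol sequence), chi (the bcax5 calls) and iota
        // (xor of the next round constant, loaded post-indexed from rc).
        // As a rough sketch of theta with a[] indexed as a[x + 5*y]:
        //
        //   c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
        //   d[x] = c[(x + 4) % 5] ^ rotl64(c[(x + 1) % 5], 1);
        //   a[x + 5 * y] ^= d[x];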
 7156   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7157                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7158                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7159                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7160                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7161                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7162                         Register tmp0, Register tmp1, Register tmp2) {
 7163     __ eor3(tmp1, a4, a9, a14);
 7164     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7165     __ eor3(tmp2, a1, a6, a11);
 7166     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7167     __ rax1(tmp2, tmp0, tmp1); // d0
 7168     {
 7169 
 7170       Register tmp3, tmp4;
 7171       if (can_use_fp && can_use_r18) {
 7172         tmp3 = rfp;
 7173         tmp4 = r18_tls;
 7174       } else {
 7175         tmp3 = a4;
 7176         tmp4 = a9;
 7177         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7178       }
 7179 
 7180       __ eor3(tmp3, a0, a5, a10);
 7181       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7182       __ eor(a0, a0, tmp2);
 7183       __ eor(a5, a5, tmp2);
 7184       __ eor(a10, a10, tmp2);
 7185       __ eor(a15, a15, tmp2);
 7186       __ eor(a20, a20, tmp2); // d0(tmp2)
 7187       __ eor3(tmp3, a2, a7, a12);
 7188       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7189       __ rax1(tmp3, tmp4, tmp2); // d1
 7190       __ eor(a1, a1, tmp3);
 7191       __ eor(a6, a6, tmp3);
 7192       __ eor(a11, a11, tmp3);
 7193       __ eor(a16, a16, tmp3);
 7194       __ eor(a21, a21, tmp3); // d1(tmp3)
 7195       __ rax1(tmp3, tmp2, tmp0); // d3
 7196       __ eor3(tmp2, a3, a8, a13);
 7197       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7198       __ eor(a3, a3, tmp3);
 7199       __ eor(a8, a8, tmp3);
 7200       __ eor(a13, a13, tmp3);
 7201       __ eor(a18, a18, tmp3);
 7202       __ eor(a23, a23, tmp3);
 7203       __ rax1(tmp2, tmp1, tmp0); // d2
 7204       __ eor(a2, a2, tmp2);
 7205       __ eor(a7, a7, tmp2);
 7206       __ eor(a12, a12, tmp2);
 7207       __ rax1(tmp0, tmp0, tmp4); // d4
 7208       if (!can_use_fp || !can_use_r18) {
 7209         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7210       }
 7211       __ eor(a17, a17, tmp2);
 7212       __ eor(a22, a22, tmp2);
 7213       __ eor(a4, a4, tmp0);
 7214       __ eor(a9, a9, tmp0);
 7215       __ eor(a14, a14, tmp0);
 7216       __ eor(a19, a19, tmp0);
 7217       __ eor(a24, a24, tmp0);
 7218     }
 7219 
 7220     __ rol(tmp0, a10, 3);
 7221     __ rol(a10, a1, 1);
 7222     __ rol(a1, a6, 44);
 7223     __ rol(a6, a9, 20);
 7224     __ rol(a9, a22, 61);
 7225     __ rol(a22, a14, 39);
 7226     __ rol(a14, a20, 18);
 7227     __ rol(a20, a2, 62);
 7228     __ rol(a2, a12, 43);
 7229     __ rol(a12, a13, 25);
 7230     __ rol(a13, a19, 8);
 7231     __ rol(a19, a23, 56);
 7232     __ rol(a23, a15, 41);
 7233     __ rol(a15, a4, 27);
 7234     __ rol(a4, a24, 14);
 7235     __ rol(a24, a21, 2);
 7236     __ rol(a21, a8, 55);
 7237     __ rol(a8, a16, 45);
 7238     __ rol(a16, a5, 36);
 7239     __ rol(a5, a3, 28);
 7240     __ rol(a3, a18, 21);
 7241     __ rol(a18, a17, 15);
 7242     __ rol(a17, a11, 10);
 7243     __ rol(a11, a7, 6);
 7244     __ mov(a7, tmp0);
 7245 
 7246     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7247     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7248     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7249     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7250     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7251 
 7252     __ ldr(tmp1, __ post(rc, 8));
 7253     __ eor(a0, a0, tmp1);
 7254 
 7255   }
 7256 
 7257   // Arguments:
 7258   //
 7259   // Inputs:
 7260   //   c_rarg0   - byte[]  source+offset
 7261   //   c_rarg1   - byte[]  SHA.state
 7262   //   c_rarg2   - int     block_size
 7263   //   c_rarg3   - int     offset
 7264   //   c_rarg4   - int     limit
 7265   //
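        // block_size is the sponge rate in bytes and selects the variant
        // absorbed below: 72 (SHA3-512), 104 (SHA3-384), 136 (SHA3-256 or
        // SHAKE256), 144 (SHA3-224) or 168 (SHAKE128).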
 7266   address generate_sha3_implCompress_gpr(StubId stub_id) {
 7267     bool multi_block;
 7268     switch (stub_id) {
 7269     case StubId::stubgen_sha3_implCompress_id:
 7270       multi_block = false;
 7271       break;
 7272     case StubId::stubgen_sha3_implCompressMB_id:
 7273       multi_block = true;
 7274       break;
 7275     default:
 7276       ShouldNotReachHere();
 7277     }
 7278 
 7279     static const uint64_t round_consts[24] = {
 7280       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 7281       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 7282       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 7283       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 7284       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 7285       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 7286       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 7287       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 7288     };
 7289 
 7290     __ align(CodeEntryAlignment);
 7291     StubCodeMark mark(this, stub_id);
 7292     address start = __ pc();
 7293 
 7294     Register buf           = c_rarg0;
 7295     Register state         = c_rarg1;
 7296     Register block_size    = c_rarg2;
 7297     Register ofs           = c_rarg3;
 7298     Register limit         = c_rarg4;
 7299 
 7300     // use r3..r17, r19..r28 to keep a0..a24.
 7301     // a0..a24 are respective locals from SHA3.java
 7302     Register a0 = r25,
 7303              a1 = r26,
 7304              a2 = r27,
 7305              a3 = r3,
 7306              a4 = r4,
 7307              a5 = r5,
 7308              a6 = r6,
 7309              a7 = r7,
 7310              a8 = rscratch1, // r8
 7311              a9 = rscratch2, // r9
 7312              a10 = r10,
 7313              a11 = r11,
 7314              a12 = r12,
 7315              a13 = r13,
 7316              a14 = r14,
 7317              a15 = r15,
 7318              a16 = r16,
 7319              a17 = r17,
 7320              a18 = r28,
 7321              a19 = r19,
 7322              a20 = r20,
 7323              a21 = r21,
 7324              a22 = r22,
 7325              a23 = r23,
 7326              a24 = r24;
 7327 
 7328     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7329 
 7330     Label sha3_loop, rounds24_preloop, loop_body;
 7331     Label sha3_512_or_sha3_384, shake128;
 7332 
 7333     bool can_use_r18 = false;
 7334 #ifndef R18_RESERVED
 7335     can_use_r18 = true;
 7336 #endif
 7337     bool can_use_fp = !PreserveFramePointer;
 7338 
 7339     __ enter();
 7340 
 7341     // save almost all of the as-yet unsaved gpr registers on the stack
 7342     __ str(block_size, __ pre(sp, -128));
 7343     if (multi_block) {
 7344       __ stpw(ofs, limit, Address(sp, 8));
 7345     }
 7346     // 8 bytes at sp+16 will be used to keep buf
 7347     __ stp(r19, r20, Address(sp, 32));
 7348     __ stp(r21, r22, Address(sp, 48));
 7349     __ stp(r23, r24, Address(sp, 64));
 7350     __ stp(r25, r26, Address(sp, 80));
 7351     __ stp(r27, r28, Address(sp, 96));
 7352     if (can_use_r18 && can_use_fp) {
 7353       __ stp(r18_tls, state, Address(sp, 112));
 7354     } else {
 7355       __ str(state, Address(sp, 112));
 7356     }
 7357 
 7358     // begin sha3 calculations: loading a0..a24 from the state array
 7359     __ ldp(a0, a1, state);
 7360     __ ldp(a2, a3, Address(state, 16));
 7361     __ ldp(a4, a5, Address(state, 32));
 7362     __ ldp(a6, a7, Address(state, 48));
 7363     __ ldp(a8, a9, Address(state, 64));
 7364     __ ldp(a10, a11, Address(state, 80));
 7365     __ ldp(a12, a13, Address(state, 96));
 7366     __ ldp(a14, a15, Address(state, 112));
 7367     __ ldp(a16, a17, Address(state, 128));
 7368     __ ldp(a18, a19, Address(state, 144));
 7369     __ ldp(a20, a21, Address(state, 160));
 7370     __ ldp(a22, a23, Address(state, 176));
 7371     __ ldr(a24, Address(state, 192));
 7372 
 7373     __ BIND(sha3_loop);
 7374 
 7375     // load input
 7376     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7377     __ eor(a0, a0, tmp3);
 7378     __ eor(a1, a1, tmp2);
 7379     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7380     __ eor(a2, a2, tmp3);
 7381     __ eor(a3, a3, tmp2);
 7382     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7383     __ eor(a4, a4, tmp3);
 7384     __ eor(a5, a5, tmp2);
 7385     __ ldr(tmp3, __ post(buf, 8));
 7386     __ eor(a6, a6, tmp3);
 7387 
 7388     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7389     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7390 
 7391     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7392     __ eor(a7, a7, tmp3);
 7393     __ eor(a8, a8, tmp2);
 7394     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7395     __ eor(a9, a9, tmp3);
 7396     __ eor(a10, a10, tmp2);
 7397     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7398     __ eor(a11, a11, tmp3);
 7399     __ eor(a12, a12, tmp2);
 7400     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7401     __ eor(a13, a13, tmp3);
 7402     __ eor(a14, a14, tmp2);
 7403     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7404     __ eor(a15, a15, tmp3);
 7405     __ eor(a16, a16, tmp2);
 7406 
 7407     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7408     __ andw(tmp2, block_size, 48);
 7409     __ cbzw(tmp2, rounds24_preloop);
 7410     __ tbnz(block_size, 5, shake128);
 7411     // block_size == 144, bit5 == 0, SHA3-224
 7412     __ ldr(tmp3, __ post(buf, 8));
 7413     __ eor(a17, a17, tmp3);
 7414     __ b(rounds24_preloop);
 7415 
 7416     __ BIND(shake128);
 7417     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7418     __ eor(a17, a17, tmp3);
 7419     __ eor(a18, a18, tmp2);
 7420     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7421     __ eor(a19, a19, tmp3);
 7422     __ eor(a20, a20, tmp2);
 7423     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7424 
 7425     __ BIND(sha3_512_or_sha3_384);
 7426     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7427     __ eor(a7, a7, tmp3);
 7428     __ eor(a8, a8, tmp2);
 7429     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7430 
 7431     // SHA3-384
 7432     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7433     __ eor(a9, a9, tmp3);
 7434     __ eor(a10, a10, tmp2);
 7435     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7436     __ eor(a11, a11, tmp3);
 7437     __ eor(a12, a12, tmp2);
 7438 
 7439     __ BIND(rounds24_preloop);
 7440     __ fmovs(v0, 24.0); // float loop counter,
 7441     __ fmovs(v1, 1.0);  // exact representation
 7442 
 7443     __ str(buf, Address(sp, 16));
 7444     __ lea(tmp3, ExternalAddress((address) round_consts));
 7445 
 7446     __ BIND(loop_body);
 7447     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7448                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7449                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7450                      tmp0, tmp1, tmp2);
 7451     __ fsubs(v0, v0, v1);
 7452     __ fcmps(v0, 0.0);
 7453     __ br(__ NE, loop_body);
 7454 
 7455     if (multi_block) {
 7456       __ ldrw(block_size, sp); // block_size
 7457       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7458       __ addw(tmp2, tmp2, block_size);
 7459       __ cmpw(tmp2, tmp1);
 7460       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7461       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7462       __ br(Assembler::LE, sha3_loop);
 7463       __ movw(c_rarg0, tmp2); // return offset
 7464     }
 7465     if (can_use_fp && can_use_r18) {
 7466       __ ldp(r18_tls, state, Address(sp, 112));
 7467     } else {
 7468       __ ldr(state, Address(sp, 112));
 7469     }
 7470     // save calculated sha3 state
 7471     __ stp(a0, a1, Address(state));
 7472     __ stp(a2, a3, Address(state, 16));
 7473     __ stp(a4, a5, Address(state, 32));
 7474     __ stp(a6, a7, Address(state, 48));
 7475     __ stp(a8, a9, Address(state, 64));
 7476     __ stp(a10, a11, Address(state, 80));
 7477     __ stp(a12, a13, Address(state, 96));
 7478     __ stp(a14, a15, Address(state, 112));
 7479     __ stp(a16, a17, Address(state, 128));
 7480     __ stp(a18, a19, Address(state, 144));
 7481     __ stp(a20, a21, Address(state, 160));
 7482     __ stp(a22, a23, Address(state, 176));
 7483     __ str(a24, Address(state, 192));
 7484 
 7485     // restore required registers from stack
 7486     __ ldp(r19, r20, Address(sp, 32));
 7487     __ ldp(r21, r22, Address(sp, 48));
 7488     __ ldp(r23, r24, Address(sp, 64));
 7489     __ ldp(r25, r26, Address(sp, 80));
 7490     __ ldp(r27, r28, Address(sp, 96));
 7491     if (can_use_fp && can_use_r18) {
 7492       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 7493     } // else no need to recalculate rfp, since it wasn't changed
 7494 
 7495     __ leave();
 7496 
 7497     __ ret(lr);
 7498 
 7499     return start;
 7500   }
 7501 
 7502   /**
 7503    *  Arguments:
 7504    *
 7505    * Inputs:
 7506    *   c_rarg0   - int crc
 7507    *   c_rarg1   - byte* buf
 7508    *   c_rarg2   - int length
 7509    *
 7510    * Output:
 7511    *       r0   - int crc result
 7512    */
 7513   address generate_updateBytesCRC32() {
 7514     assert(UseCRC32Intrinsics, "what are we doing here?");
 7515 
 7516     __ align(CodeEntryAlignment);
 7517     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 7518     StubCodeMark mark(this, stub_id);
 7519 
 7520     address start = __ pc();
 7521 
 7522     const Register crc   = c_rarg0;  // crc
 7523     const Register buf   = c_rarg1;  // source java byte array address
 7524     const Register len   = c_rarg2;  // length
 7525     const Register table0 = c_rarg3; // crc_table address
 7526     const Register table1 = c_rarg4;
 7527     const Register table2 = c_rarg5;
 7528     const Register table3 = c_rarg6;
 7529     const Register tmp3 = c_rarg7;
 7530 
 7531     BLOCK_COMMENT("Entry:");
 7532     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7533 
 7534     __ kernel_crc32(crc, buf, len,
 7535               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7536 
 7537     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7538     __ ret(lr);
 7539 
 7540     return start;
 7541   }
 7542 
 7543   /**
 7544    *  Arguments:
 7545    *
 7546    * Inputs:
 7547    *   c_rarg0   - int crc
 7548    *   c_rarg1   - byte* buf
 7549    *   c_rarg2   - int length
 7550    *   c_rarg3   - int* table
 7551    *
 7552    * Output:
 7553    *       r0   - int crc result
 7554    */
 7555   address generate_updateBytesCRC32C() {
 7556     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7557 
 7558     __ align(CodeEntryAlignment);
 7559     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 7560     StubCodeMark mark(this, stub_id);
 7561 
 7562     address start = __ pc();
 7563 
 7564     const Register crc   = c_rarg0;  // crc
 7565     const Register buf   = c_rarg1;  // source java byte array address
 7566     const Register len   = c_rarg2;  // length
 7567     const Register table0 = c_rarg3; // crc_table address
 7568     const Register table1 = c_rarg4;
 7569     const Register table2 = c_rarg5;
 7570     const Register table3 = c_rarg6;
 7571     const Register tmp3 = c_rarg7;
 7572 
 7573     BLOCK_COMMENT("Entry:");
 7574     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7575 
 7576     __ kernel_crc32c(crc, buf, len,
 7577               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7578 
 7579     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7580     __ ret(lr);
 7581 
 7582     return start;
 7583   }
 7584 
 7585   /***
 7586    *  Arguments:
 7587    *
 7588    *  Inputs:
 7589    *   c_rarg0   - int   adler
 7590    *   c_rarg1   - byte* buff
 7591    *   c_rarg2   - int   len
 7592    *
 7593    * Output:
 7594    *   c_rarg0   - int adler result
 7595    */
 7596   address generate_updateBytesAdler32() {
 7597     __ align(CodeEntryAlignment);
 7598     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 7599     StubCodeMark mark(this, stub_id);
 7600     address start = __ pc();
 7601 
 7602     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7603 
 7604     // Aliases
 7605     Register adler  = c_rarg0;
 7606     Register s1     = c_rarg0;
 7607     Register s2     = c_rarg3;
 7608     Register buff   = c_rarg1;
 7609     Register len    = c_rarg2;
 7610     Register nmax  = r4;
 7611     Register base  = r5;
 7612     Register count = r6;
 7613     Register temp0 = rscratch1;
 7614     Register temp1 = rscratch2;
 7615     FloatRegister vbytes = v0;
 7616     FloatRegister vs1acc = v1;
 7617     FloatRegister vs2acc = v2;
 7618     FloatRegister vtable = v3;
 7619 
 7620     // Max number of bytes we can process before having to take the mod
 7621     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7622     uint64_t BASE = 0xfff1;
 7623     uint64_t NMAX = 0x15B0;
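
          // The reductions below avoid a division: since 2^16 mod BASE == 15,
          // a sum s can be folded as
          //
          //   s = (s >> 16) * 15 + (s & 0xffff);   // the lsr/lsl/sub/add runs
          //
          // once or twice depending on its magnitude, leaving a value small
          // enough that a single conditional subtract of BASE (subs/csel)
          // completes the modular reduction.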
 7624 
 7625     __ mov(base, BASE);
 7626     __ mov(nmax, NMAX);
 7627 
 7628     // Load accumulation coefficients for the upper 16 bits
 7629     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7630     __ ld1(vtable, __ T16B, Address(temp0));
 7631 
 7632     // s1 is initialized to the lower 16 bits of adler
 7633     // s2 is initialized to the upper 16 bits of adler
 7634     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7635     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7636 
 7637     // The pipelined loop needs at least 16 elements for 1 iteration
 7638     // It does check this, but it is more effective to skip to the cleanup loop
 7639     __ cmp(len, (u1)16);
 7640     __ br(Assembler::HS, L_nmax);
 7641     __ cbz(len, L_combine);
 7642 
 7643     __ bind(L_simple_by1_loop);
 7644     __ ldrb(temp0, Address(__ post(buff, 1)));
 7645     __ add(s1, s1, temp0);
 7646     __ add(s2, s2, s1);
 7647     __ subs(len, len, 1);
 7648     __ br(Assembler::HI, L_simple_by1_loop);
 7649 
 7650     // s1 = s1 % BASE
 7651     __ subs(temp0, s1, base);
 7652     __ csel(s1, temp0, s1, Assembler::HS);
 7653 
 7654     // s2 = s2 % BASE
 7655     __ lsr(temp0, s2, 16);
 7656     __ lsl(temp1, temp0, 4);
 7657     __ sub(temp1, temp1, temp0);
 7658     __ add(s2, temp1, s2, ext::uxth);
 7659 
 7660     __ subs(temp0, s2, base);
 7661     __ csel(s2, temp0, s2, Assembler::HS);
 7662 
 7663     __ b(L_combine);
 7664 
 7665     __ bind(L_nmax);
 7666     __ subs(len, len, nmax);
 7667     __ sub(count, nmax, 16);
 7668     __ br(Assembler::LO, L_by16);
 7669 
 7670     __ bind(L_nmax_loop);
 7671 
 7672     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7673                                       vbytes, vs1acc, vs2acc, vtable);
 7674 
 7675     __ subs(count, count, 16);
 7676     __ br(Assembler::HS, L_nmax_loop);
 7677 
 7678     // s1 = s1 % BASE
 7679     __ lsr(temp0, s1, 16);
 7680     __ lsl(temp1, temp0, 4);
 7681     __ sub(temp1, temp1, temp0);
 7682     __ add(temp1, temp1, s1, ext::uxth);
 7683 
 7684     __ lsr(temp0, temp1, 16);
 7685     __ lsl(s1, temp0, 4);
 7686     __ sub(s1, s1, temp0);
 7687     __ add(s1, s1, temp1, ext::uxth);
 7688 
 7689     __ subs(temp0, s1, base);
 7690     __ csel(s1, temp0, s1, Assembler::HS);
 7691 
 7692     // s2 = s2 % BASE
 7693     __ lsr(temp0, s2, 16);
 7694     __ lsl(temp1, temp0, 4);
 7695     __ sub(temp1, temp1, temp0);
 7696     __ add(temp1, temp1, s2, ext::uxth);
 7697 
 7698     __ lsr(temp0, temp1, 16);
 7699     __ lsl(s2, temp0, 4);
 7700     __ sub(s2, s2, temp0);
 7701     __ add(s2, s2, temp1, ext::uxth);
 7702 
 7703     __ subs(temp0, s2, base);
 7704     __ csel(s2, temp0, s2, Assembler::HS);
 7705 
 7706     __ subs(len, len, nmax);
 7707     __ sub(count, nmax, 16);
 7708     __ br(Assembler::HS, L_nmax_loop);
 7709 
 7710     __ bind(L_by16);
 7711     __ adds(len, len, count);
 7712     __ br(Assembler::LO, L_by1);
 7713 
 7714     __ bind(L_by16_loop);
 7715 
 7716     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7717                                       vbytes, vs1acc, vs2acc, vtable);
 7718 
 7719     __ subs(len, len, 16);
 7720     __ br(Assembler::HS, L_by16_loop);
 7721 
 7722     __ bind(L_by1);
 7723     __ adds(len, len, 15);
 7724     __ br(Assembler::LO, L_do_mod);
 7725 
 7726     __ bind(L_by1_loop);
 7727     __ ldrb(temp0, Address(__ post(buff, 1)));
 7728     __ add(s1, temp0, s1);
 7729     __ add(s2, s2, s1);
 7730     __ subs(len, len, 1);
 7731     __ br(Assembler::HS, L_by1_loop);
 7732 
 7733     __ bind(L_do_mod);
 7734     // s1 = s1 % BASE
 7735     __ lsr(temp0, s1, 16);
 7736     __ lsl(temp1, temp0, 4);
 7737     __ sub(temp1, temp1, temp0);
 7738     __ add(temp1, temp1, s1, ext::uxth);
 7739 
 7740     __ lsr(temp0, temp1, 16);
 7741     __ lsl(s1, temp0, 4);
 7742     __ sub(s1, s1, temp0);
 7743     __ add(s1, s1, temp1, ext::uxth);
 7744 
 7745     __ subs(temp0, s1, base);
 7746     __ csel(s1, temp0, s1, Assembler::HS);
 7747 
 7748     // s2 = s2 % BASE
 7749     __ lsr(temp0, s2, 16);
 7750     __ lsl(temp1, temp0, 4);
 7751     __ sub(temp1, temp1, temp0);
 7752     __ add(temp1, temp1, s2, ext::uxth);
 7753 
 7754     __ lsr(temp0, temp1, 16);
 7755     __ lsl(s2, temp0, 4);
 7756     __ sub(s2, s2, temp0);
 7757     __ add(s2, s2, temp1, ext::uxth);
 7758 
 7759     __ subs(temp0, s2, base);
 7760     __ csel(s2, temp0, s2, Assembler::HS);
 7761 
 7762     // Combine lower bits and higher bits
 7763     __ bind(L_combine);
 7764     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7765 
 7766     __ ret(lr);
 7767 
 7768     return start;
 7769   }
 7770 
 7771   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7772           Register temp0, Register temp1, FloatRegister vbytes,
 7773           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7774     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7775     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7776     // In non-vectorized code, we update s1 and s2 as:
 7777     //   s1 <- s1 + b1
 7778     //   s2 <- s2 + s1
 7779     //   s1 <- s1 + b2
 7780     //   s2 <- s2 + b1
 7781     //   ...
 7782     //   s1 <- s1 + b16
 7783     //   s2 <- s2 + s1
 7784     // Putting above assignments together, we have:
 7785     //   s1_new = s1 + b1 + b2 + ... + b16
 7786     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7787     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7788     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 7789     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7790 
 7791     // s2 = s2 + s1 * 16
 7792     __ add(s2, s2, s1, Assembler::LSL, 4);
 7793 
 7794     // vs1acc = b1 + b2 + b3 + ... + b16
 7795     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7796     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7797     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7798     __ uaddlv(vs1acc, __ T16B, vbytes);
 7799     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7800 
 7801     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7802     __ fmovd(temp0, vs1acc);
 7803     __ fmovd(temp1, vs2acc);
 7804     __ add(s1, s1, temp0);
 7805     __ add(s2, s2, temp1);
 7806   }
 7807 
 7808   /**
 7809    *  Arguments:
 7810    *
 7811    *  Input:
 7812    *    c_rarg0   - x address
 7813    *    c_rarg1   - x length
 7814    *    c_rarg2   - y address
 7815    *    c_rarg3   - y length
 7816    *    c_rarg4   - z address
 7817    */
 7818   address generate_multiplyToLen() {
 7819     __ align(CodeEntryAlignment);
 7820     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 7821     StubCodeMark mark(this, stub_id);
 7822 
 7823     address start = __ pc();
 7824     const Register x     = r0;
 7825     const Register xlen  = r1;
 7826     const Register y     = r2;
 7827     const Register ylen  = r3;
 7828     const Register z     = r4;
 7829 
 7830     const Register tmp0  = r5;
 7831     const Register tmp1  = r10;
 7832     const Register tmp2  = r11;
 7833     const Register tmp3  = r12;
 7834     const Register tmp4  = r13;
 7835     const Register tmp5  = r14;
 7836     const Register tmp6  = r15;
 7837     const Register tmp7  = r16;
 7838 
 7839     BLOCK_COMMENT("Entry:");
 7840     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7841     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7842     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7843     __ ret(lr);
 7844 
 7845     return start;
 7846   }
 7847 
 7848   address generate_squareToLen() {
 7849     // The squareToLen algorithm for sizes 1..127 described in the Java code
 7850     // works faster than multiply_to_len on some CPUs and slower on others,
 7851     // but multiply_to_len shows slightly better overall results.
 7852     __ align(CodeEntryAlignment);
 7853     StubId stub_id = StubId::stubgen_squareToLen_id;
 7854     StubCodeMark mark(this, stub_id);
 7855     address start = __ pc();
 7856 
 7857     const Register x     = r0;
 7858     const Register xlen  = r1;
 7859     const Register z     = r2;
 7860     const Register y     = r4; // == x
 7861     const Register ylen  = r5; // == xlen
 7862 
 7863     const Register tmp0  = r3;
 7864     const Register tmp1  = r10;
 7865     const Register tmp2  = r11;
 7866     const Register tmp3  = r12;
 7867     const Register tmp4  = r13;
 7868     const Register tmp5  = r14;
 7869     const Register tmp6  = r15;
 7870     const Register tmp7  = r16;
 7871 
 7872     RegSet spilled_regs = RegSet::of(y, ylen);
 7873     BLOCK_COMMENT("Entry:");
 7874     __ enter();
 7875     __ push(spilled_regs, sp);
 7876     __ mov(y, x);
 7877     __ mov(ylen, xlen);
 7878     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7879     __ pop(spilled_regs, sp);
 7880     __ leave();
 7881     __ ret(lr);
 7882     return start;
 7883   }
 7884 
 7885   address generate_mulAdd() {
 7886     __ align(CodeEntryAlignment);
 7887     StubId stub_id = StubId::stubgen_mulAdd_id;
 7888     StubCodeMark mark(this, stub_id);
 7889 
 7890     address start = __ pc();
 7891 
 7892     const Register out     = r0;
 7893     const Register in      = r1;
 7894     const Register offset  = r2;
 7895     const Register len     = r3;
 7896     const Register k       = r4;
 7897 
 7898     BLOCK_COMMENT("Entry:");
 7899     __ enter();
 7900     __ mul_add(out, in, offset, len, k);
 7901     __ leave();
 7902     __ ret(lr);
 7903 
 7904     return start;
 7905   }
 7906 
 7907   // Arguments:
 7908   //
 7909   // Input:
 7910   //   c_rarg0   - newArr address
 7911   //   c_rarg1   - oldArr address
 7912   //   c_rarg2   - newIdx
 7913   //   c_rarg3   - shiftCount
 7914   //   c_rarg4   - numIter
 7915   //
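        // A rough scalar model of the per-element computation below
        // (illustrative names; '>>>' denotes an unsigned shift):
        //
        //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
        //                      | (oldArr[i] << (32 - shiftCount));
        //
        // for i in [0, numIter), processed from the highest index down.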
 7916   address generate_bigIntegerRightShift() {
 7917     __ align(CodeEntryAlignment);
 7918     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 7919     StubCodeMark mark(this, stub_id);
 7920     address start = __ pc();
 7921 
 7922     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7923 
 7924     Register newArr        = c_rarg0;
 7925     Register oldArr        = c_rarg1;
 7926     Register newIdx        = c_rarg2;
 7927     Register shiftCount    = c_rarg3;
 7928     Register numIter       = c_rarg4;
 7929     Register idx           = numIter;
 7930 
 7931     Register newArrCur     = rscratch1;
 7932     Register shiftRevCount = rscratch2;
 7933     Register oldArrCur     = r13;
 7934     Register oldArrNext    = r14;
 7935 
 7936     FloatRegister oldElem0        = v0;
 7937     FloatRegister oldElem1        = v1;
 7938     FloatRegister newElem         = v2;
 7939     FloatRegister shiftVCount     = v3;
 7940     FloatRegister shiftVRevCount  = v4;
 7941 
 7942     __ cbz(idx, Exit);
 7943 
 7944     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7945 
 7946     // left shift count
 7947     __ movw(shiftRevCount, 32);
 7948     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7949 
 7950     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar paths
 7951     __ cmp(numIter, (u1)4);
 7952     __ br(Assembler::LT, ShiftThree);
 7953 
 7954     __ dup(shiftVCount,    __ T4S, shiftCount);
 7955     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7956     __ negr(shiftVCount,   __ T4S, shiftVCount);
 7957 
 7958     __ BIND(ShiftSIMDLoop);
 7959 
 7960     // Calculate the load addresses
 7961     __ sub(idx, idx, 4);
 7962     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7963     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7964     __ add(oldArrCur,  oldArrNext, 4);
 7965 
 7966     // Load 4 words and process
 7967     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7968     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7969     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7970     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7971     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7972     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7973 
 7974     __ cmp(idx, (u1)4);
 7975     __ br(Assembler::LT, ShiftTwoLoop);
 7976     __ b(ShiftSIMDLoop);
 7977 
 7978     __ BIND(ShiftTwoLoop);
 7979     __ cbz(idx, Exit);
 7980     __ cmp(idx, (u1)1);
 7981     __ br(Assembler::EQ, ShiftOne);
 7982 
 7983     // Calculate the load addresses
 7984     __ sub(idx, idx, 2);
 7985     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7986     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7987     __ add(oldArrCur,  oldArrNext, 4);
 7988 
 7989     // Load 2 words and process
 7990     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 7991     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 7992     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 7993     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 7994     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 7995     __ st1(newElem,   __ T2S, Address(newArrCur));
 7996     __ b(ShiftTwoLoop);
 7997 
 7998     __ BIND(ShiftThree);
 7999     __ tbz(idx, 1, ShiftOne);
 8000     __ tbz(idx, 0, ShiftTwo);
 8001     __ ldrw(r10,  Address(oldArr, 12));
 8002     __ ldrw(r11,  Address(oldArr, 8));
 8003     __ lsrvw(r10, r10, shiftCount);
 8004     __ lslvw(r11, r11, shiftRevCount);
 8005     __ orrw(r12,  r10, r11);
 8006     __ strw(r12,  Address(newArr, 8));
 8007 
 8008     __ BIND(ShiftTwo);
 8009     __ ldrw(r10,  Address(oldArr, 8));
 8010     __ ldrw(r11,  Address(oldArr, 4));
 8011     __ lsrvw(r10, r10, shiftCount);
 8012     __ lslvw(r11, r11, shiftRevCount);
 8013     __ orrw(r12,  r10, r11);
 8014     __ strw(r12,  Address(newArr, 4));
 8015 
 8016     __ BIND(ShiftOne);
 8017     __ ldrw(r10,  Address(oldArr, 4));
 8018     __ ldrw(r11,  Address(oldArr));
 8019     __ lsrvw(r10, r10, shiftCount);
 8020     __ lslvw(r11, r11, shiftRevCount);
 8021     __ orrw(r12,  r10, r11);
 8022     __ strw(r12,  Address(newArr));
 8023 
 8024     __ BIND(Exit);
 8025     __ ret(lr);
 8026 
 8027     return start;
 8028   }
 8029 
 8030   // Arguments:
 8031   //
 8032   // Input:
 8033   //   c_rarg0   - newArr address
 8034   //   c_rarg1   - oldArr address
 8035   //   c_rarg2   - newIdx
 8036   //   c_rarg3   - shiftCount
 8037   //   c_rarg4   - numIter
 8038   //
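        // A rough scalar model of the per-element computation below
        // (illustrative names; '>>>' denotes an unsigned shift):
        //
        //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
        //                      | (oldArr[i + 1] >>> (32 - shiftCount));
        //
        // for i in [0, numIter), processed from the lowest index up.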
 8039   address generate_bigIntegerLeftShift() {
 8040     __ align(CodeEntryAlignment);
 8041     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 8042     StubCodeMark mark(this, stub_id);
 8043     address start = __ pc();
 8044 
 8045     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8046 
 8047     Register newArr        = c_rarg0;
 8048     Register oldArr        = c_rarg1;
 8049     Register newIdx        = c_rarg2;
 8050     Register shiftCount    = c_rarg3;
 8051     Register numIter       = c_rarg4;
 8052 
 8053     Register shiftRevCount = rscratch1;
 8054     Register oldArrNext    = rscratch2;
 8055 
 8056     FloatRegister oldElem0        = v0;
 8057     FloatRegister oldElem1        = v1;
 8058     FloatRegister newElem         = v2;
 8059     FloatRegister shiftVCount     = v3;
 8060     FloatRegister shiftVRevCount  = v4;
 8061 
 8062     __ cbz(numIter, Exit);
 8063 
 8064     __ add(oldArrNext, oldArr, 4);
 8065     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8066 
 8067     // right shift count
 8068     __ movw(shiftRevCount, 32);
 8069     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8070 
 8071     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar paths
 8072     __ cmp(numIter, (u1)4);
 8073     __ br(Assembler::LT, ShiftThree);
 8074 
 8075     __ dup(shiftVCount,     __ T4S, shiftCount);
 8076     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8077     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8078 
 8079     __ BIND(ShiftSIMDLoop);
 8080 
 8081     // load 4 words and process
 8082     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8083     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8084     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8085     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8086     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8087     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8088     __ sub(numIter,   numIter, 4);
 8089 
 8090     __ cmp(numIter, (u1)4);
 8091     __ br(Assembler::LT, ShiftTwoLoop);
 8092     __ b(ShiftSIMDLoop);
 8093 
 8094     __ BIND(ShiftTwoLoop);
 8095     __ cbz(numIter, Exit);
 8096     __ cmp(numIter, (u1)1);
 8097     __ br(Assembler::EQ, ShiftOne);
 8098 
 8099     // load 2 words and process
 8100     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8101     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8102     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8103     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8104     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8105     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8106     __ sub(numIter,   numIter, 2);
 8107     __ b(ShiftTwoLoop);
 8108 
 8109     __ BIND(ShiftThree);
 8110     __ ldrw(r10,  __ post(oldArr, 4));
 8111     __ ldrw(r11,  __ post(oldArrNext, 4));
 8112     __ lslvw(r10, r10, shiftCount);
 8113     __ lsrvw(r11, r11, shiftRevCount);
 8114     __ orrw(r12,  r10, r11);
 8115     __ strw(r12,  __ post(newArr, 4));
 8116     __ tbz(numIter, 1, Exit);
 8117     __ tbz(numIter, 0, ShiftOne);
 8118 
 8119     __ BIND(ShiftTwo);
 8120     __ ldrw(r10,  __ post(oldArr, 4));
 8121     __ ldrw(r11,  __ post(oldArrNext, 4));
 8122     __ lslvw(r10, r10, shiftCount);
 8123     __ lsrvw(r11, r11, shiftRevCount);
 8124     __ orrw(r12,  r10, r11);
 8125     __ strw(r12,  __ post(newArr, 4));
 8126 
 8127     __ BIND(ShiftOne);
 8128     __ ldrw(r10,  Address(oldArr));
 8129     __ ldrw(r11,  Address(oldArrNext));
 8130     __ lslvw(r10, r10, shiftCount);
 8131     __ lsrvw(r11, r11, shiftRevCount);
 8132     __ orrw(r12,  r10, r11);
 8133     __ strw(r12,  Address(newArr));
 8134 
 8135     __ BIND(Exit);
 8136     __ ret(lr);
 8137 
 8138     return start;
 8139   }
 8140 
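        // Count the number of leading non-negative ("positive") bytes in a byte array.
        //   ary1   = r1 - array address
        //   len    = r2 - length in bytes
        //   result = r0 - a copy of len on entry; the count on exit
        // When a negative byte is found, the stub may under-count down to the start of
        // the chunk in which it was found (see the RET_ADJUST paths below). A scalar
        // sketch of the behaviour it approximates (illustrative only):
        //
        //   int count_positives(const int8_t* ary, int len) {
        //     for (int i = 0; i < len; i++) {
        //       if (ary[i] < 0) return i;   // the stub may return a smaller value here
        //     }
        //     return len;
        //   }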
 8141   address generate_count_positives(address &count_positives_long) {
 8142     const u1 large_loop_size = 64;
 8143     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 8144     int dcache_line = VM_Version::dcache_line_size();
 8145 
 8146     Register ary1 = r1, len = r2, result = r0;
 8147 
 8148     __ align(CodeEntryAlignment);
 8149 
 8150     StubId stub_id = StubId::stubgen_count_positives_id;
 8151     StubCodeMark mark(this, stub_id);
 8152 
 8153     address entry = __ pc();
 8154 
 8155     __ enter();
 8156     // precondition: a copy of len is already in result
 8157     // __ mov(result, len);
 8158 
 8159   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8160         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8161 
 8162   __ cmp(len, (u1)15);
 8163   __ br(Assembler::GT, LEN_OVER_15);
 8164   // The only case in which execution falls into this code is when the pointer is
 8165   // near the end of a memory page and we have to avoid reading the next page.
 8166   __ add(ary1, ary1, len);
 8167   __ subs(len, len, 8);
 8168   __ br(Assembler::GT, LEN_OVER_8);
 8169   __ ldr(rscratch2, Address(ary1, -8));
 8170   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8171   __ lsrv(rscratch2, rscratch2, rscratch1);
 8172   __ tst(rscratch2, UPPER_BIT_MASK);
 8173   __ csel(result, zr, result, Assembler::NE);
 8174   __ leave();
 8175   __ ret(lr);
 8176   __ bind(LEN_OVER_8);
 8177   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 8178   __ sub(len, len, 8); // no data dependency, so the sub can execute while loading
 8179   __ tst(rscratch2, UPPER_BIT_MASK);
 8180   __ br(Assembler::NE, RET_NO_POP);
 8181   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8182   __ lsrv(rscratch1, rscratch1, rscratch2);
 8183   __ tst(rscratch1, UPPER_BIT_MASK);
 8184   __ bind(RET_NO_POP);
 8185   __ csel(result, zr, result, Assembler::NE);
 8186   __ leave();
 8187   __ ret(lr);
 8188 
 8189   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8190   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8191 
 8192   count_positives_long = __ pc(); // 2nd entry point
 8193 
 8194   __ enter();
 8195 
 8196   __ bind(LEN_OVER_15);
 8197     __ push(spilled_regs, sp);
 8198     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8199     __ cbz(rscratch2, ALIGNED);
 8200     __ ldp(tmp6, tmp1, Address(ary1));
 8201     __ mov(tmp5, 16);
 8202     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 8203     __ add(ary1, ary1, rscratch1);
 8204     __ orr(tmp6, tmp6, tmp1);
 8205     __ tst(tmp6, UPPER_BIT_MASK);
 8206     __ br(Assembler::NE, RET_ADJUST);
 8207     __ sub(len, len, rscratch1);
 8208 
 8209   __ bind(ALIGNED);
 8210     __ cmp(len, large_loop_size);
 8211     __ br(Assembler::LT, CHECK_16);
 8212     // Perform a 16-byte load as an early return in the pre-loop to handle the
 8213     // situation when an initially aligned large array has negative values at its
 8214     // starting bytes, in which case LARGE_LOOP would do 4 reads instead of 1 (in
 8215     // the worst case), which is slower. Cases with negative bytes further ahead
 8216     // won't be affected much. In fact, it'll be faster due to early loads, fewer
 8217     // instructions and fewer branches in LARGE_LOOP.
 8218     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8219     __ sub(len, len, 16);
 8220     __ orr(tmp6, tmp6, tmp1);
 8221     __ tst(tmp6, UPPER_BIT_MASK);
 8222     __ br(Assembler::NE, RET_ADJUST_16);
 8223     __ cmp(len, large_loop_size);
 8224     __ br(Assembler::LT, CHECK_16);
 8225 
 8226     if (SoftwarePrefetchHintDistance >= 0
 8227         && SoftwarePrefetchHintDistance >= dcache_line) {
 8228       // initial prefetch
 8229       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8230     }
 8231   __ bind(LARGE_LOOP);
 8232     if (SoftwarePrefetchHintDistance >= 0) {
 8233       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8234     }
 8235     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
 8236     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
 8237     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
 8238     // 3 instructions per iteration and has fewer branches, but this approach disables
 8239     // the early return, so all 64 bytes are loaded and checked every time.
 8240     __ ldp(tmp2, tmp3, Address(ary1));
 8241     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8242     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8243     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8244     __ add(ary1, ary1, large_loop_size);
 8245     __ sub(len, len, large_loop_size);
 8246     __ orr(tmp2, tmp2, tmp3);
 8247     __ orr(tmp4, tmp4, tmp5);
 8248     __ orr(rscratch1, rscratch1, rscratch2);
 8249     __ orr(tmp6, tmp6, tmp1);
 8250     __ orr(tmp2, tmp2, tmp4);
 8251     __ orr(rscratch1, rscratch1, tmp6);
 8252     __ orr(tmp2, tmp2, rscratch1);
 8253     __ tst(tmp2, UPPER_BIT_MASK);
 8254     __ br(Assembler::NE, RET_ADJUST_LONG);
 8255     __ cmp(len, large_loop_size);
 8256     __ br(Assembler::GE, LARGE_LOOP);
 8257 
 8258   __ bind(CHECK_16); // small 16-byte load pre-loop
 8259     __ cmp(len, (u1)16);
 8260     __ br(Assembler::LT, POST_LOOP16);
 8261 
 8262   __ bind(LOOP16); // small 16-byte load loop
 8263     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8264     __ sub(len, len, 16);
 8265     __ orr(tmp2, tmp2, tmp3);
 8266     __ tst(tmp2, UPPER_BIT_MASK);
 8267     __ br(Assembler::NE, RET_ADJUST_16);
 8268     __ cmp(len, (u1)16);
 8269     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8270 
 8271   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8272     __ cmp(len, (u1)8);
 8273     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8274     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8275     __ tst(tmp3, UPPER_BIT_MASK);
 8276     __ br(Assembler::NE, RET_ADJUST);
 8277     __ sub(len, len, 8);
 8278 
 8279   __ bind(POST_LOOP16_LOAD_TAIL);
 8280     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8281     __ ldr(tmp1, Address(ary1));
 8282     __ mov(tmp2, 64);
 8283     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8284     __ lslv(tmp1, tmp1, tmp4);
 8285     __ tst(tmp1, UPPER_BIT_MASK);
 8286     __ br(Assembler::NE, RET_ADJUST);
 8287     // Fallthrough
 8288 
 8289   __ bind(RET_LEN);
 8290     __ pop(spilled_regs, sp);
 8291     __ leave();
 8292     __ ret(lr);
 8293 
 8294     // The difference result - len is the count of bytes that are guaranteed to
 8295     // be positive.
 8296 
 8297   __ bind(RET_ADJUST_LONG);
 8298     __ add(len, len, (u1)(large_loop_size - 16));
 8299   __ bind(RET_ADJUST_16);
 8300     __ add(len, len, 16);
 8301   __ bind(RET_ADJUST);
 8302     __ pop(spilled_regs, sp);
 8303     __ leave();
 8304     __ sub(result, result, len);
 8305     __ ret(lr);
 8306 
 8307     return entry;
 8308   }
 8309 
 8310   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8311         bool usePrefetch, Label &NOT_EQUAL) {
 8312     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8313         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8314         tmp7 = r12, tmp8 = r13;
 8315     Label LOOP;
 8316 
 8317     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8318     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8319     __ bind(LOOP);
 8320     if (usePrefetch) {
 8321       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8322       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8323     }
 8324     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8325     __ eor(tmp1, tmp1, tmp2);
 8326     __ eor(tmp3, tmp3, tmp4);
 8327     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8328     __ orr(tmp1, tmp1, tmp3);
 8329     __ cbnz(tmp1, NOT_EQUAL);
 8330     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8331     __ eor(tmp5, tmp5, tmp6);
 8332     __ eor(tmp7, tmp7, tmp8);
 8333     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8334     __ orr(tmp5, tmp5, tmp7);
 8335     __ cbnz(tmp5, NOT_EQUAL);
 8336     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8337     __ eor(tmp1, tmp1, tmp2);
 8338     __ eor(tmp3, tmp3, tmp4);
 8339     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8340     __ orr(tmp1, tmp1, tmp3);
 8341     __ cbnz(tmp1, NOT_EQUAL);
 8342     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8343     __ eor(tmp5, tmp5, tmp6);
 8344     __ sub(cnt1, cnt1, 8 * wordSize);
 8345     __ eor(tmp7, tmp7, tmp8);
 8346     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8347     // tmp6 is not used. MacroAssembler::subs is used here (rather than
 8348     // cmp) because subs allows an unlimited range of immediate operands.
 8349     __ subs(tmp6, cnt1, loopThreshold);
 8350     __ orr(tmp5, tmp5, tmp7);
 8351     __ cbnz(tmp5, NOT_EQUAL);
 8352     __ br(__ GE, LOOP);
 8353     // post-loop
 8354     __ eor(tmp1, tmp1, tmp2);
 8355     __ eor(tmp3, tmp3, tmp4);
 8356     __ orr(tmp1, tmp1, tmp3);
 8357     __ sub(cnt1, cnt1, 2 * wordSize);
 8358     __ cbnz(tmp1, NOT_EQUAL);
 8359   }
 8360 
 8361   void generate_large_array_equals_loop_simd(int loopThreshold,
 8362         bool usePrefetch, Label &NOT_EQUAL) {
 8363     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8364         tmp2 = rscratch2;
 8365     Label LOOP;
 8366 
 8367     __ bind(LOOP);
 8368     if (usePrefetch) {
 8369       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8370       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8371     }
 8372     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8373     __ sub(cnt1, cnt1, 8 * wordSize);
 8374     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 8375     __ subs(tmp1, cnt1, loopThreshold);
 8376     __ eor(v0, __ T16B, v0, v4);
 8377     __ eor(v1, __ T16B, v1, v5);
 8378     __ eor(v2, __ T16B, v2, v6);
 8379     __ eor(v3, __ T16B, v3, v7);
 8380     __ orr(v0, __ T16B, v0, v1);
 8381     __ orr(v1, __ T16B, v2, v3);
 8382     __ orr(v0, __ T16B, v0, v1);
 8383     __ umov(tmp1, v0, __ D, 0);
 8384     __ umov(tmp2, v0, __ D, 1);
 8385     __ orr(tmp1, tmp1, tmp2);
 8386     __ cbnz(tmp1, NOT_EQUAL);
 8387     __ br(__ GE, LOOP);
 8388   }
 8389 
 8390   // a1 = r1 - array1 address
 8391   // a2 = r2 - array2 address
 8392   // result = r0 - return value. Already contains "false"
 8393   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 8394   // r3-r5 are reserved temporary registers
 8395   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
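        // Semantically the stub finishes comparing two arrays whose first wordSize
        // bytes have already been handled by the caller; a scalar sketch of the whole
        // operation (illustrative only, not the generated code):
        //
        //   bool arrays_equal(const uint8_t* a1, const uint8_t* a2, size_t len) {
        //     return memcmp(a1, a2, len) == 0;
        //   }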
 8396   address generate_large_array_equals() {
 8397     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8398         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8399         tmp7 = r12, tmp8 = r13;
 8400     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 8401         SMALL_LOOP, POST_LOOP;
 8402     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 8403     // threshold ensuring that at least 32 of the prefetched bytes are actually used
 8404     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 8405     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 8406     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 8407     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 8408         tmp5, tmp6, tmp7, tmp8);
 8409 
 8410     __ align(CodeEntryAlignment);
 8411 
 8412     StubId stub_id = StubId::stubgen_large_array_equals_id;
 8413     StubCodeMark mark(this, stub_id);
 8414 
 8415     address entry = __ pc();
 8416     __ enter();
 8417     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 8418     // also advance pointers to use post-increment instead of pre-increment
 8419     __ add(a1, a1, wordSize);
 8420     __ add(a2, a2, wordSize);
 8421     if (AvoidUnalignedAccesses) {
 8422       // Both implementations (SIMD/non-SIMD) use relatively large load
 8423       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
 8424       // time) on some CPUs when the address is not at least 16-byte aligned.
 8425       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
 8426       // load if needed for the 1st address to make it 16-byte aligned.
 8427       Label ALIGNED16;
 8428       __ tbz(a1, 3, ALIGNED16);
 8429       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8430       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8431       __ sub(cnt1, cnt1, wordSize);
 8432       __ eor(tmp1, tmp1, tmp2);
 8433       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 8434       __ bind(ALIGNED16);
 8435     }
 8436     if (UseSIMDForArrayEquals) {
 8437       if (SoftwarePrefetchHintDistance >= 0) {
 8438         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8439         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8440         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 8441             /* prfm = */ true, NOT_EQUAL);
 8442         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8443         __ br(__ LT, TAIL);
 8444       }
 8445       __ bind(NO_PREFETCH_LARGE_LOOP);
 8446       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 8447           /* prfm = */ false, NOT_EQUAL);
 8448     } else {
 8449       __ push(spilled_regs, sp);
 8450       if (SoftwarePrefetchHintDistance >= 0) {
 8451         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8452         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8453         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 8454             /* prfm = */ true, NOT_EQUAL);
 8455         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8456         __ br(__ LT, TAIL);
 8457       }
 8458       __ bind(NO_PREFETCH_LARGE_LOOP);
 8459       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 8460           /* prfm = */ false, NOT_EQUAL);
 8461     }
 8462     __ bind(TAIL);
 8463       __ cbz(cnt1, EQUAL);
 8464       __ subs(cnt1, cnt1, wordSize);
 8465       __ br(__ LE, POST_LOOP);
 8466     __ bind(SMALL_LOOP);
 8467       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8468       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8469       __ subs(cnt1, cnt1, wordSize);
 8470       __ eor(tmp1, tmp1, tmp2);
 8471       __ cbnz(tmp1, NOT_EQUAL);
 8472       __ br(__ GT, SMALL_LOOP);
 8473     __ bind(POST_LOOP);
 8474       __ ldr(tmp1, Address(a1, cnt1));
 8475       __ ldr(tmp2, Address(a2, cnt1));
 8476       __ eor(tmp1, tmp1, tmp2);
 8477       __ cbnz(tmp1, NOT_EQUAL);
 8478     __ bind(EQUAL);
 8479       __ mov(result, true);
 8480     __ bind(NOT_EQUAL);
 8481       if (!UseSIMDForArrayEquals) {
 8482         __ pop(spilled_regs, sp);
 8483       }
 8484     __ bind(NOT_EQUAL_NO_POP);
 8485     __ leave();
 8486     __ ret(lr);
 8487     return entry;
 8488   }
 8489 
 8490   // result = r0 - return value. Contains initial hashcode value on entry.
 8491   // ary = r1 - array address
 8492   // cnt = r2 - elements count
 8493   // Clobbers: v0-v13, rscratch1, rscratch2
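        // The stub folds the array into the standard Java polynomial hash, starting
        // from the incoming hash value; a scalar sketch (illustrative only):
        //
        //   int h = result;                  // initial hash passed in r0
        //   for (int i = 0; i < cnt; i++) {
        //     h = 31 * h + ary[i];           // element widened according to eltype
        //   }
        //   return h;                        // returned in r0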
 8494   address generate_large_arrays_hashcode(BasicType eltype) {
 8495     const Register result = r0, ary = r1, cnt = r2;
 8496     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 8497     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 8498     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 8499     const FloatRegister vpowm = v13;
 8500 
 8501     ARRAYS_HASHCODE_REGISTERS;
 8502 
 8503     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 8504 
 8505     unsigned int vf; // vectorization factor
 8506     bool multiply_by_halves;
 8507     Assembler::SIMD_Arrangement load_arrangement;
 8508     switch (eltype) {
 8509     case T_BOOLEAN:
 8510     case T_BYTE:
 8511       load_arrangement = Assembler::T8B;
 8512       multiply_by_halves = true;
 8513       vf = 8;
 8514       break;
 8515     case T_CHAR:
 8516     case T_SHORT:
 8517       load_arrangement = Assembler::T8H;
 8518       multiply_by_halves = true;
 8519       vf = 8;
 8520       break;
 8521     case T_INT:
 8522       load_arrangement = Assembler::T4S;
 8523       multiply_by_halves = false;
 8524       vf = 4;
 8525       break;
 8526     default:
 8527       ShouldNotReachHere();
 8528     }
 8529 
 8530     // Unroll factor
 8531     const unsigned uf = 4;
 8532 
 8533     // Effective vectorization factor
 8534     const unsigned evf = vf * uf;
 8535 
 8536     __ align(CodeEntryAlignment);
 8537 
 8538     StubId stub_id;
 8539     switch (eltype) {
 8540     case T_BOOLEAN:
 8541       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 8542       break;
 8543     case T_BYTE:
 8544       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 8545       break;
 8546     case T_CHAR:
 8547       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 8548       break;
 8549     case T_SHORT:
 8550       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 8551       break;
 8552     case T_INT:
 8553       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 8554       break;
 8555     default:
 8556       stub_id = StubId::NO_STUBID;
 8557       ShouldNotReachHere();
 8558     };
 8559 
 8560     StubCodeMark mark(this, stub_id);
 8561 
 8562     address entry = __ pc();
 8563     __ enter();
 8564 
 8565     // Put the 0th to 3rd powers of 31 together into a single SIMD register. The register will be used in
 8566     // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
 8567     // value shouldn't change throughout both loops.
 8568     __ movw(rscratch1, intpow(31U, 3));
 8569     __ mov(vpow, Assembler::S, 0, rscratch1);
 8570     __ movw(rscratch1, intpow(31U, 2));
 8571     __ mov(vpow, Assembler::S, 1, rscratch1);
 8572     __ movw(rscratch1, intpow(31U, 1));
 8573     __ mov(vpow, Assembler::S, 2, rscratch1);
 8574     __ movw(rscratch1, intpow(31U, 0));
 8575     __ mov(vpow, Assembler::S, 3, rscratch1);
 8576 
 8577     __ mov(vmul0, Assembler::T16B, 0);
 8578     __ mov(vmul0, Assembler::S, 3, result);
 8579 
 8580     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8581     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8582 
 8583     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8584     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8585 
 8586     // SMALL LOOP
 8587     __ bind(SMALL_LOOP);
 8588 
 8589     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8590     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8591     __ subsw(rscratch2, rscratch2, vf);
 8592 
 8593     if (load_arrangement == Assembler::T8B) {
 8594       // Extend 8B to 8H to be able to use vector multiply
 8595       // instructions
 8596       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8597       if (is_signed_subword_type(eltype)) {
 8598         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8599       } else {
 8600         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8601       }
 8602     }
 8603 
 8604     switch (load_arrangement) {
 8605     case Assembler::T4S:
 8606       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8607       break;
 8608     case Assembler::T8B:
 8609     case Assembler::T8H:
 8610       assert(is_subword_type(eltype), "subword type expected");
 8611       if (is_signed_subword_type(eltype)) {
 8612         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8613       } else {
 8614         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8615       }
 8616       break;
 8617     default:
 8618       __ should_not_reach_here();
 8619     }
 8620 
 8621     // Process the upper half of a vector
 8622     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8623       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8624       if (is_signed_subword_type(eltype)) {
 8625         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8626       } else {
 8627         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8628       }
 8629     }
 8630 
 8631     __ br(Assembler::HI, SMALL_LOOP);
 8632 
 8633     // SMALL LOOP'S EPILOGUE
 8634     __ lsr(rscratch2, cnt, exact_log2(evf));
 8635     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8636 
 8637     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8638     __ addv(vmul0, Assembler::T4S, vmul0);
 8639     __ umov(result, vmul0, Assembler::S, 0);
 8640 
 8641     // TAIL
 8642     __ bind(TAIL);
 8643 
 8644     // The andr computes cnt % vf. The shifted subtract places the branch target cnt % vf
 8645     // pairs of load + madd insns before BR_BASE, so only cnt % vf pairs are executed.
 8646     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 8647     __ andr(rscratch2, cnt, vf - 1);
 8648     __ bind(TAIL_SHORTCUT);
 8649     __ adr(rscratch1, BR_BASE);
 8650     // For Cortex-A53 the shift is 4 because 2 nops are generated, giving 4 instructions per pair.
 8651     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 8652     __ movw(rscratch2, 0x1f);
 8653     __ br(rscratch1);
 8654 
 8655     for (size_t i = 0; i < vf - 1; ++i) {
 8656       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8657                                    eltype);
 8658       __ maddw(result, result, rscratch2, rscratch1);
 8659       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 8660       // Generate 2nd nop to have 4 instructions per iteration.
 8661       if (VM_Version::supports_a53mac()) {
 8662         __ nop();
 8663       }
 8664     }
 8665     __ bind(BR_BASE);
 8666 
 8667     __ leave();
 8668     __ ret(lr);
 8669 
 8670     // LARGE LOOP
 8671     __ bind(LARGE_LOOP_PREHEADER);
 8672 
 8673     __ lsr(rscratch2, cnt, exact_log2(evf));
 8674 
 8675     if (multiply_by_halves) {
 8676       // 31^4 - multiplier between lower and upper parts of a register
 8677       __ movw(rscratch1, intpow(31U, vf / 2));
 8678       __ mov(vpowm, Assembler::S, 1, rscratch1);
 8679       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 8680       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8681       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8682     } else {
 8683       // 31^16
 8684       __ movw(rscratch1, intpow(31U, evf));
 8685       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8686     }
 8687 
 8688     __ mov(vmul3, Assembler::T16B, 0);
 8689     __ mov(vmul2, Assembler::T16B, 0);
 8690     __ mov(vmul1, Assembler::T16B, 0);
 8691 
 8692     __ bind(LARGE_LOOP);
 8693 
 8694     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8695     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8696     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8697     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8698 
 8699     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8700            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8701 
 8702     if (load_arrangement == Assembler::T8B) {
 8703       // Extend 8B to 8H to be able to use vector multiply
 8704       // instructions
 8705       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8706       if (is_signed_subword_type(eltype)) {
 8707         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8708         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8709         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8710         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8711       } else {
 8712         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8713         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8714         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8715         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8716       }
 8717     }
 8718 
 8719     switch (load_arrangement) {
 8720     case Assembler::T4S:
 8721       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8722       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8723       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8724       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8725       break;
 8726     case Assembler::T8B:
 8727     case Assembler::T8H:
 8728       assert(is_subword_type(eltype), "subword type expected");
 8729       if (is_signed_subword_type(eltype)) {
 8730         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8731         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8732         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8733         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8734       } else {
 8735         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8736         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8737         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8738         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8739       }
 8740       break;
 8741     default:
 8742       __ should_not_reach_here();
 8743     }
 8744 
 8745     // Process the upper half of a vector
 8746     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8747       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8748       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8749       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8750       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8751       if (is_signed_subword_type(eltype)) {
 8752         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8753         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8754         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8755         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8756       } else {
 8757         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8758         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8759         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8760         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8761       }
 8762     }
 8763 
 8764     __ subsw(rscratch2, rscratch2, 1);
 8765     __ br(Assembler::HI, LARGE_LOOP);
 8766 
 8767     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8768     __ addv(vmul3, Assembler::T4S, vmul3);
 8769     __ umov(result, vmul3, Assembler::S, 0);
 8770 
 8771     __ mov(rscratch2, intpow(31U, vf));
 8772 
 8773     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8774     __ addv(vmul2, Assembler::T4S, vmul2);
 8775     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8776     __ maddw(result, result, rscratch2, rscratch1);
 8777 
 8778     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8779     __ addv(vmul1, Assembler::T4S, vmul1);
 8780     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8781     __ maddw(result, result, rscratch2, rscratch1);
 8782 
 8783     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8784     __ addv(vmul0, Assembler::T4S, vmul0);
 8785     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8786     __ maddw(result, result, rscratch2, rscratch1);
 8787 
 8788     __ andr(rscratch2, cnt, vf - 1);
 8789     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8790 
 8791     __ leave();
 8792     __ ret(lr);
 8793 
 8794     return entry;
 8795   }
 8796 
 8797   address generate_dsin_dcos(bool isCos) {
 8798     __ align(CodeEntryAlignment);
 8799     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 8800     StubCodeMark mark(this, stub_id);
 8801     address start = __ pc();
 8802     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8803         (address)StubRoutines::aarch64::_two_over_pi,
 8804         (address)StubRoutines::aarch64::_pio2,
 8805         (address)StubRoutines::aarch64::_dsin_coef,
 8806         (address)StubRoutines::aarch64::_dcos_coef);
 8807     return start;
 8808   }
 8809 
 8810   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings.
 8811   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8812       Label &DIFF2) {
 8813     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8814     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8815 
 8816     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8817     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8818     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8819     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 8820 
 8821     __ fmovd(tmpL, vtmp3);
 8822     __ eor(rscratch2, tmp3, tmpL);
 8823     __ cbnz(rscratch2, DIFF2);
 8824 
 8825     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8826     __ umov(tmpL, vtmp3, __ D, 1);
 8827     __ eor(rscratch2, tmpU, tmpL);
 8828     __ cbnz(rscratch2, DIFF1);
 8829 
 8830     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8831     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8832     __ fmovd(tmpL, vtmp);
 8833     __ eor(rscratch2, tmp3, tmpL);
 8834     __ cbnz(rscratch2, DIFF2);
 8835 
 8836     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8837     __ umov(tmpL, vtmp, __ D, 1);
 8838     __ eor(rscratch2, tmpU, tmpL);
 8839     __ cbnz(rscratch2, DIFF1);
 8840   }
 8841 
 8842   // r0  = result
 8843   // r1  = str1
 8844   // r2  = cnt1
 8845   // r3  = str2
 8846   // r4  = cnt2
 8847   // r10 = tmp1
 8848   // r11 = tmp2
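        // One operand is a Latin1 string and the other is UTF-16 (isLU selects which);
        // at the first differing character pair the stub returns their difference,
        // otherwise the incoming value of result is returned unchanged. A rough scalar
        // sketch of the mismatch case (illustrative only):
        //
        //   for (int i = 0; i < charsToCompare; i++) {
        //     int d = (int)str1[i] - (int)str2[i];   // characters zero-extended to int
        //     if (d != 0) return d;                  // first difference wins
        //   }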
 8849   address generate_compare_long_string_different_encoding(bool isLU) {
 8850     __ align(CodeEntryAlignment);
 8851     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 8852     StubCodeMark mark(this, stub_id);
 8853     address entry = __ pc();
 8854     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8855         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8856         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8857     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8858         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8859     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8860     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8861 
 8862     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8863 
 8864     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
 8865     // cnt2 == amount of characters left to compare
 8866     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
 8867     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8868     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8869     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8870     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 8871     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
 8872     __ eor(rscratch2, tmp1, tmp2);
 8873     __ mov(rscratch1, tmp2);
 8874     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8875     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8876              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8877     __ push(spilled_regs, sp);
 8878     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8879     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8880 
 8881     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8882 
 8883     if (SoftwarePrefetchHintDistance >= 0) {
 8884       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8885       __ br(__ LT, NO_PREFETCH);
 8886       __ bind(LARGE_LOOP_PREFETCH);
 8887         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8888         __ mov(tmp4, 2);
 8889         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8890         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8891           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8892           __ subs(tmp4, tmp4, 1);
 8893           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8894           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8895           __ mov(tmp4, 2);
 8896         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8897           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8898           __ subs(tmp4, tmp4, 1);
 8899           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8900           __ sub(cnt2, cnt2, 64);
 8901           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8902           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8903     }
 8904     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8905     __ bind(NO_PREFETCH);
 8906     __ subs(cnt2, cnt2, 16);
 8907     __ br(__ LT, TAIL);
 8908     __ align(OptoLoopAlignment);
 8909     __ bind(SMALL_LOOP); // smaller loop
 8910       __ subs(cnt2, cnt2, 16);
 8911       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8912       __ br(__ GE, SMALL_LOOP);
 8913       __ cmn(cnt2, (u1)16);
 8914       __ br(__ EQ, LOAD_LAST);
 8915     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8916       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8917       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8918       __ ldr(tmp3, Address(cnt1, -8));
 8919       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8920       __ b(LOAD_LAST);
 8921     __ bind(DIFF2);
 8922       __ mov(tmpU, tmp3);
 8923     __ bind(DIFF1);
 8924       __ pop(spilled_regs, sp);
 8925       __ b(CALCULATE_DIFFERENCE);
 8926     __ bind(LOAD_LAST);
 8927       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
 8928       // No need to load them again
 8929       __ mov(tmpU, tmp3);
 8930       __ pop(spilled_regs, sp);
 8931 
 8932       // tmp2 points to the address of the last 4 Latin1 characters right now
 8933       __ ldrs(vtmp, Address(tmp2));
 8934       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8935       __ fmovd(tmpL, vtmp);
 8936 
 8937       __ eor(rscratch2, tmpU, tmpL);
 8938       __ cbz(rscratch2, DONE);
 8939 
 8940     // Find the first different characters in the longwords and
 8941     // compute their difference.
 8942     __ bind(CALCULATE_DIFFERENCE);
 8943       __ rev(rscratch2, rscratch2);
 8944       __ clz(rscratch2, rscratch2);
 8945       __ andr(rscratch2, rscratch2, -16);
 8946       __ lsrv(tmp1, tmp1, rscratch2);
 8947       __ uxthw(tmp1, tmp1);
 8948       __ lsrv(rscratch1, rscratch1, rscratch2);
 8949       __ uxthw(rscratch1, rscratch1);
 8950       __ subw(result, tmp1, rscratch1);
 8951     __ bind(DONE);
 8952       __ ret(lr);
 8953     return entry;
 8954   }
 8955 
 8956   // r0 = input (float16)
 8957   // v0 = result (float)
 8958   // v1 = temporary float register
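        // For example, the binary16 bit pattern 0x3C00 widens to 1.0f and 0xC000
        // widens to -2.0f (IEEE 754 half precision to single precision).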
 8959   address generate_float16ToFloat() {
 8960     __ align(CodeEntryAlignment);
 8961     StubId stub_id = StubId::stubgen_hf2f_id;
 8962     StubCodeMark mark(this, stub_id);
 8963     address entry = __ pc();
 8964     BLOCK_COMMENT("Entry:");
 8965     __ flt16_to_flt(v0, r0, v1);
 8966     __ ret(lr);
 8967     return entry;
 8968   }
 8969 
 8970   // v0 = input (float)
 8971   // r0 = result (float16)
 8972   // v1 = temporary float register
 8973   address generate_floatToFloat16() {
 8974     __ align(CodeEntryAlignment);
 8975     StubId stub_id = StubId::stubgen_f2hf_id;
 8976     StubCodeMark mark(this, stub_id);
 8977     address entry = __ pc();
 8978     BLOCK_COMMENT("Entry:");
 8979     __ flt_to_flt16(r0, v0, v1);
 8980     __ ret(lr);
 8981     return entry;
 8982   }
 8983 
 8984   address generate_method_entry_barrier() {
 8985     __ align(CodeEntryAlignment);
 8986     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 8987     StubCodeMark mark(this, stub_id);
 8988 
 8989     Label deoptimize_label;
 8990 
 8991     address start = __ pc();
 8992 
 8993     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 8994 
 8995     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 8996       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 8997       // We can get here despite the nmethod being good, if we have not
 8998       // yet applied our cross modification fence (or data fence).
 8999       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 9000       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 9001       __ ldrw(rscratch2, rscratch2);
 9002       __ strw(rscratch2, thread_epoch_addr);
 9003       __ isb();
 9004       __ membar(__ LoadLoad);
 9005     }
 9006 
 9007     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 9008 
 9009     __ enter();
 9010     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 9011 
 9012     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 9013 
 9014     __ push_call_clobbered_registers();
 9015 
 9016     __ mov(c_rarg0, rscratch2);
 9017     __ call_VM_leaf
 9018          (CAST_FROM_FN_PTR
 9019           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 9020 
 9021     __ reset_last_Java_frame(true);
 9022 
 9023     __ mov(rscratch1, r0);
 9024 
 9025     __ pop_call_clobbered_registers();
 9026 
 9027     __ cbnz(rscratch1, deoptimize_label);
 9028 
 9029     __ leave();
 9030     __ ret(lr);
 9031 
 9032     __ BIND(deoptimize_label);
 9033 
 9034     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 9035     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 9036 
 9037     __ mov(sp, rscratch1);
 9038     __ br(rscratch2);
 9039 
 9040     return start;
 9041   }
 9042 
 9043   // r0  = result
 9044   // r1  = str1
 9045   // r2  = cnt1
 9046   // r3  = str2
 9047   // r4  = cnt2
 9048   // r10 = tmp1
 9049   // r11 = tmp2
 9050   address generate_compare_long_string_same_encoding(bool isLL) {
 9051     __ align(CodeEntryAlignment);
 9052     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 9053     StubCodeMark mark(this, stub_id);
 9054     address entry = __ pc();
 9055     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9056         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9057 
 9058     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9059 
 9060     // Exit the large loop when fewer than 64 bytes are left to read or we're about
 9061     // to prefetch memory beyond the array border.
 9062     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 9063 
 9064     // The caller has already pre-loaded 8 bytes before jumping to the stub, so compare them directly.
 9065     __ eor(rscratch2, tmp1, tmp2);
 9066     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9067 
 9068     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9069     // update pointers, because of previous read
 9070     __ add(str1, str1, wordSize);
 9071     __ add(str2, str2, wordSize);
 9072     if (SoftwarePrefetchHintDistance >= 0) {
 9073       __ align(OptoLoopAlignment);
 9074       __ bind(LARGE_LOOP_PREFETCH);
 9075         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9076         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9077 
 9078         for (int i = 0; i < 4; i++) {
 9079           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9080           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9081           __ cmp(tmp1, tmp2);
 9082           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9083           __ br(Assembler::NE, DIFF);
 9084         }
 9085         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9086         __ add(str1, str1, 64);
 9087         __ add(str2, str2, 64);
 9088         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9089         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9090         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9091     }
 9092 
 9093     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9094     __ br(Assembler::LE, LESS16);
 9095     __ align(OptoLoopAlignment);
 9096     __ bind(LOOP_COMPARE16);
 9097       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9098       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9099       __ cmp(tmp1, tmp2);
 9100       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9101       __ br(Assembler::NE, DIFF);
 9102       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9103       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9104       __ br(Assembler::LT, LESS16);
 9105 
 9106       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9107       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9108       __ cmp(tmp1, tmp2);
 9109       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9110       __ br(Assembler::NE, DIFF);
 9111       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9112       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9113       __ br(Assembler::GE, LOOP_COMPARE16);
 9114       __ cbz(cnt2, LENGTH_DIFF);
 9115 
 9116     __ bind(LESS16);
 9117       // compare in 8-byte chunks
 9118       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9119       __ br(Assembler::LE, LESS8);
 9120       __ ldr(tmp1, Address(__ post(str1, 8)));
 9121       __ ldr(tmp2, Address(__ post(str2, 8)));
 9122       __ eor(rscratch2, tmp1, tmp2);
 9123       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9124       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9125 
 9126     __ bind(LESS8); // directly load last 8 bytes
 9127       if (!isLL) {
 9128         __ add(cnt2, cnt2, cnt2);
 9129       }
 9130       __ ldr(tmp1, Address(str1, cnt2));
 9131       __ ldr(tmp2, Address(str2, cnt2));
 9132       __ eor(rscratch2, tmp1, tmp2);
 9133       __ cbz(rscratch2, LENGTH_DIFF);
 9134       __ b(CAL_DIFFERENCE);
 9135 
 9136     __ bind(DIFF);
 9137       __ cmp(tmp1, tmp2);
 9138       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9139       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9140       // reuse rscratch2 register for the result of eor instruction
 9141       __ eor(rscratch2, tmp1, tmp2);
 9142 
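          // Locate the first differing element inside the mismatching 64-bit words with
          // the rev + clz trick below; a scalar sketch for the LL case (illustrative,
          // with x = tmp1 ^ tmp2 known to be non-zero at this point):
          //
          //   int bit = __builtin_clzll(__builtin_bswap64(x)) & ~7; // offset of first differing byte
          //   return (int)((tmp1 >> bit) & 0xff) - (int)((tmp2 >> bit) & 0xff);
          //
          // For the UU case the mask is ~15 and the extracted elements are 16 bits wide.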
 9143     __ bind(CAL_DIFFERENCE);
 9144       __ rev(rscratch2, rscratch2);
 9145       __ clz(rscratch2, rscratch2);
 9146       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9147       __ lsrv(tmp1, tmp1, rscratch2);
 9148       __ lsrv(tmp2, tmp2, rscratch2);
 9149       if (isLL) {
 9150         __ uxtbw(tmp1, tmp1);
 9151         __ uxtbw(tmp2, tmp2);
 9152       } else {
 9153         __ uxthw(tmp1, tmp1);
 9154         __ uxthw(tmp2, tmp2);
 9155       }
 9156       __ subw(result, tmp1, tmp2);
 9157 
 9158     __ bind(LENGTH_DIFF);
 9159       __ ret(lr);
 9160     return entry;
 9161   }
 9162 
 9163   enum string_compare_mode {
 9164     LL,
 9165     LU,
 9166     UL,
 9167     UU,
 9168   };
 9169 
 9170   // The following registers are declared in aarch64.ad
 9171   // r0  = result
 9172   // r1  = str1
 9173   // r2  = cnt1
 9174   // r3  = str2
 9175   // r4  = cnt2
 9176   // r10 = tmp1
 9177   // r11 = tmp2
 9178   // z0  = ztmp1
 9179   // z1  = ztmp2
 9180   // p0  = pgtmp1
 9181   // p1  = pgtmp2
 9182   address generate_compare_long_string_sve(string_compare_mode mode) {
 9183     StubId stub_id;
 9184     switch (mode) {
 9185       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9186       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9187       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9188       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9189       default: ShouldNotReachHere();
 9190     }
 9191 
 9192     __ align(CodeEntryAlignment);
 9193     address entry = __ pc();
 9194     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9195              tmp1 = r10, tmp2 = r11;
 9196 
 9197     Label LOOP, DONE, MISMATCH;
 9198     Register vec_len = tmp1;
 9199     Register idx = tmp2;
 9200     // The minimum of the string lengths has been stored in cnt2.
 9201     Register cnt = cnt2;
 9202     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9203     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9204 
 9205 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9206     switch (mode) {                                                            \
 9207       case LL:                                                                 \
 9208         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9209         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9210         break;                                                                 \
 9211       case LU:                                                                 \
 9212         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9213         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9214         break;                                                                 \
 9215       case UL:                                                                 \
 9216         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9217         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9218         break;                                                                 \
 9219       case UU:                                                                 \
 9220         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9221         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9222         break;                                                                 \
 9223       default:                                                                 \
 9224         ShouldNotReachHere();                                                  \
 9225     }
 9226 
 9227     StubCodeMark mark(this, stub_id);
 9228 
 9229     __ mov(idx, 0);
 9230     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9231 
 9232     if (mode == LL) {
 9233       __ sve_cntb(vec_len);
 9234     } else {
 9235       __ sve_cnth(vec_len);
 9236     }
 9237 
 9238     __ sub(rscratch1, cnt, vec_len);
 9239 
 9240     __ bind(LOOP);
 9241 
 9242       // main loop
 9243       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9244       __ add(idx, idx, vec_len);
 9245       // Compare strings.
 9246       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9247       __ br(__ NE, MISMATCH);
 9248       __ cmp(idx, rscratch1);
 9249       __ br(__ LT, LOOP);
 9250 
 9251     // post loop, last iteration
 9252     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9253 
 9254     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9255     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9256     __ br(__ EQ, DONE);
 9257 
 9258     __ bind(MISMATCH);
 9259 
 9260     // Build a predicate covering the lanes before the first mismatch.
 9261     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9262     // Extract the first different characters of each string.
 9263     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9264     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9265 
 9266     // Compute the difference of the first different characters.
 9267     __ sub(result, rscratch1, rscratch2);
 9268 
 9269     __ bind(DONE);
 9270     __ ret(lr);
 9271 #undef LOAD_PAIR
 9272     return entry;
 9273   }
 9274 
 9275   void generate_compare_long_strings() {
 9276     if (UseSVE == 0) {
 9277       StubRoutines::aarch64::_compare_long_string_LL
 9278           = generate_compare_long_string_same_encoding(true);
 9279       StubRoutines::aarch64::_compare_long_string_UU
 9280           = generate_compare_long_string_same_encoding(false);
 9281       StubRoutines::aarch64::_compare_long_string_LU
 9282           = generate_compare_long_string_different_encoding(true);
 9283       StubRoutines::aarch64::_compare_long_string_UL
 9284           = generate_compare_long_string_different_encoding(false);
 9285     } else {
 9286       StubRoutines::aarch64::_compare_long_string_LL
 9287           = generate_compare_long_string_sve(LL);
 9288       StubRoutines::aarch64::_compare_long_string_UU
 9289           = generate_compare_long_string_sve(UU);
 9290       StubRoutines::aarch64::_compare_long_string_LU
 9291           = generate_compare_long_string_sve(LU);
 9292       StubRoutines::aarch64::_compare_long_string_UL
 9293           = generate_compare_long_string_sve(UL);
 9294     }
 9295   }
 9296 
 9297   // R0 = result
 9298   // R1 = str2
 9299   // R2 = cnt1
 9300   // R3 = str1
 9301   // R4 = cnt2
 9302   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 9303   //
 9304   // This generic linear code uses a few additional ideas which make it faster:
 9305   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
 9306   // in order to skip the initial load (helps on systems with 1 load pipeline)
 9307   // 2) we can use a "fast" algorithm for finding a single character to search for the
 9308   // first symbol, with fewer branches (1 branch per loaded register instead of a
 9309   // branch per symbol); this is where constants like 0x0101...01, 0x00010001...0001,
 9310   // 0x7f7f...7f, 0x7fff7fff...7fff come from (see the SWAR sketch below)
 9311   // 3) after loading and analyzing the 1st register of the source string, it can be
 9312   // reused to search for every occurrence of the 1st character, saving a few loads
 9313   // compared with a "simpler-but-slower" implementation
 9314   // 4) in order to avoid lots of push/pop operations, the code below heavily
 9315   // reuses/reinitializes/compresses register values, which makes the code larger
 9316   // and a bit less readable; however, most of the extra operations are issued
 9317   // during loads or branches, so the penalty is minimal
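        // The "fast" first-character search in (2) is the classic SWAR zero-detection
        // test applied to (chunk ^ splat(first)); a sketch for the byte case
        // (illustrative only; the code below uses the equivalent orr/bics form):
        //
        //   uint64_t x    = chunk ^ (first * 0x0101010101010101ull);
        //   uint64_t hits = (x - 0x0101010101010101ull) & ~x & 0x8080808080808080ull;
        //   // the lowest 0x80 bit set in 'hits' marks the first byte equal to 'first';
        //   // higher bits may be false positives, filtered out by the compare loops
        //
        // The 16-bit variant uses 0x0001000100010001 and 0x7fff7fff7fff7fff instead.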
 9318   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 9319     StubId stub_id;
 9320     if (str1_isL) {
 9321       if (str2_isL) {
 9322         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
 9323       } else {
 9324         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
 9325       }
 9326     } else {
 9327       if (str2_isL) {
 9328         ShouldNotReachHere();
 9329       } else {
 9330         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
 9331       }
 9332     }
 9333     __ align(CodeEntryAlignment);
 9334     StubCodeMark mark(this, stub_id);
 9335     address entry = __ pc();
 9336 
 9337     int str1_chr_size = str1_isL ? 1 : 2;
 9338     int str2_chr_size = str2_isL ? 1 : 2;
 9339     int str1_chr_shift = str1_isL ? 0 : 1;
 9340     int str2_chr_shift = str2_isL ? 0 : 1;
 9341     bool isL = str1_isL && str2_isL;
 9342    // parameters
 9343     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 9344     // temporary registers
 9345     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 9346     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 9347     // redefinitions
 9348     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 9349 
 9350     __ push(spilled_regs, sp);
 9351     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 9352         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 9353         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 9354         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 9355         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 9356         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 9357     // Read whole register from str1. It is safe, because length >=8 here
 9358     __ ldr(ch1, Address(str1));
 9359     // Read whole register from str2. It is safe, because length >=8 here
 9360     __ ldr(ch2, Address(str2));
 9361     __ sub(cnt2, cnt2, cnt1);
 9362     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 9363     if (str1_isL != str2_isL) {
 9364       __ eor(v0, __ T16B, v0, v0);
 9365     }
 9366     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 9367     __ mul(first, first, tmp1);
 9368     // check if we have less than 1 register to check
 9369     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 9370     if (str1_isL != str2_isL) {
 9371       __ fmovd(v1, ch1);
 9372     }
 9373     __ br(__ LE, L_SMALL);
 9374     __ eor(ch2, first, ch2);
 9375     if (str1_isL != str2_isL) {
 9376       __ zip1(v1, __ T16B, v1, v0);
 9377     }
 9378     __ sub(tmp2, ch2, tmp1);
 9379     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9380     __ bics(tmp2, tmp2, ch2);
 9381     if (str1_isL != str2_isL) {
 9382       __ fmovd(ch1, v1);
 9383     }
 9384     __ br(__ NE, L_HAS_ZERO);
 9385     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9386     __ add(result, result, wordSize/str2_chr_size);
 9387     __ add(str2, str2, wordSize);
 9388     __ br(__ LT, L_POST_LOOP);
 9389     __ BIND(L_LOOP);
 9390       __ ldr(ch2, Address(str2));
 9391       __ eor(ch2, first, ch2);
 9392       __ sub(tmp2, ch2, tmp1);
 9393       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9394       __ bics(tmp2, tmp2, ch2);
 9395       __ br(__ NE, L_HAS_ZERO);
 9396     __ BIND(L_LOOP_PROCEED);
 9397       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9398       __ add(str2, str2, wordSize);
 9399       __ add(result, result, wordSize/str2_chr_size);
 9400       __ br(__ GE, L_LOOP);
 9401     __ BIND(L_POST_LOOP);
 9402       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 9403       __ br(__ LE, NOMATCH);
 9404       __ ldr(ch2, Address(str2));
 9405       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9406       __ eor(ch2, first, ch2);
 9407       __ sub(tmp2, ch2, tmp1);
 9408       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9409       __ mov(tmp4, -1); // all bits set
 9410       __ b(L_SMALL_PROCEED);
 9411     __ align(OptoLoopAlignment);
 9412     __ BIND(L_SMALL);
 9413       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9414       __ eor(ch2, first, ch2);
 9415       if (str1_isL != str2_isL) {
 9416         __ zip1(v1, __ T16B, v1, v0);
 9417       }
 9418       __ sub(tmp2, ch2, tmp1);
 9419       __ mov(tmp4, -1); // all bits set
 9420       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9421       if (str1_isL != str2_isL) {
 9422         __ fmovd(ch1, v1); // move converted 4 symbols
 9423       }
 9424     __ BIND(L_SMALL_PROCEED);
 9425       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 9426       __ bic(tmp2, tmp2, ch2);
 9427       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9428       __ rbit(tmp2, tmp2);
 9429       __ br(__ EQ, NOMATCH);
 9430     __ BIND(L_SMALL_HAS_ZERO_LOOP);
 9431       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 9432       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9433       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9434       if (str2_isL) { // LL
 9435         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9436         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9437         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9438         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9439         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9440       } else {
 9441         __ mov(ch2, 0xE); // all bits in byte set except last one
 9442         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9443         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9444         __ lslv(tmp2, tmp2, tmp4);
 9445         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9446         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9447         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9448         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9449       }
 9450       __ cmp(ch1, ch2);
 9451       __ mov(tmp4, wordSize/str2_chr_size);
 9452       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9453     __ BIND(L_SMALL_CMP_LOOP);
 9454       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9455                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9456       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9457                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9458       __ add(tmp4, tmp4, 1);
 9459       __ cmp(tmp4, cnt1);
 9460       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9461       __ cmp(first, ch2);
 9462       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9463     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9464       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9465       __ clz(tmp4, tmp2);
 9466       __ add(result, result, 1); // advance index
 9467       __ add(str2, str2, str2_chr_size); // advance pointer
 9468       __ b(L_SMALL_HAS_ZERO_LOOP);
 9469     __ align(OptoLoopAlignment);
 9470     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9471       __ cmp(first, ch2);
 9472       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9473       __ b(DONE);
 9474     __ align(OptoLoopAlignment);
 9475     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9476       if (str2_isL) { // LL
 9477         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9478         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9479         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9480         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9481         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9482       } else {
 9483         __ mov(ch2, 0xE); // all bits in byte set except last one
 9484         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9485         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9486         __ lslv(tmp2, tmp2, tmp4);
 9487         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9488         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9489         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9490         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9491       }
 9492       __ cmp(ch1, ch2);
 9493       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9494       __ b(DONE);
 9495     __ align(OptoLoopAlignment);
 9496     __ BIND(L_HAS_ZERO);
 9497       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long; up to 4 cycles on some CPUs
      // Now compress the two counters (cnt1 and cnt2) into one register. This
      // is fine because both counters are 32-bit values and neither is changed
      // in this loop; they are restored on exit. cnt1 can therefore be re-used
      // inside the loop.
 9502       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 9503       __ sub(result, result, 1);
 9504     __ BIND(L_HAS_ZERO_LOOP);
 9505       __ mov(cnt1, wordSize/str2_chr_size);
 9506       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9507       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9508       if (str2_isL) {
 9509         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9510         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9511         __ lslv(tmp2, tmp2, tmp4);
 9512         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9513         __ add(tmp4, tmp4, 1);
 9514         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9515         __ lsl(tmp2, tmp2, 1);
 9516         __ mov(tmp4, wordSize/str2_chr_size);
 9517       } else {
 9518         __ mov(ch2, 0xE);
 9519         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9520         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9521         __ lslv(tmp2, tmp2, tmp4);
 9522         __ add(tmp4, tmp4, 1);
 9523         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9524         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9525         __ lsl(tmp2, tmp2, 1);
 9526         __ mov(tmp4, wordSize/str2_chr_size);
 9527         __ sub(str2, str2, str2_chr_size);
 9528       }
 9529       __ cmp(ch1, ch2);
 9530       __ mov(tmp4, wordSize/str2_chr_size);
 9531       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9532     __ BIND(L_CMP_LOOP);
 9533       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9534                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9535       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9536                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9537       __ add(tmp4, tmp4, 1);
 9538       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9539       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9540       __ cmp(cnt1, ch2);
 9541       __ br(__ EQ, L_CMP_LOOP);
 9542     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at the current position
 9544       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9545       __ clz(tmp4, tmp2);
 9546       __ add(str2, str2, str2_chr_size); // advance pointer
 9547       __ b(L_HAS_ZERO_LOOP);
 9548     __ align(OptoLoopAlignment);
 9549     __ BIND(L_CMP_LOOP_LAST_CMP);
 9550       __ cmp(cnt1, ch2);
 9551       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9552       __ b(DONE);
 9553     __ align(OptoLoopAlignment);
 9554     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9555       if (str2_isL) {
 9556         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9557         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9558         __ lslv(tmp2, tmp2, tmp4);
 9559         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9560         __ add(tmp4, tmp4, 1);
 9561         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9562         __ lsl(tmp2, tmp2, 1);
 9563       } else {
 9564         __ mov(ch2, 0xE);
 9565         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9566         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9567         __ lslv(tmp2, tmp2, tmp4);
 9568         __ add(tmp4, tmp4, 1);
 9569         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9570         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9571         __ lsl(tmp2, tmp2, 1);
 9572         __ sub(str2, str2, str2_chr_size);
 9573       }
 9574       __ cmp(ch1, ch2);
 9575       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9576       __ b(DONE);
 9577     __ align(OptoLoopAlignment);
 9578     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. Before the L_HAS_ZERO block the index
      //    was a multiple of wordSize/str2_chr_size. One 8-byte chunk (octet)
      //    of str2 was analyzed in L_HAS_ZERO_LOOP, so result was increased by
      //    at most wordSize/str2_chr_size - 1 and the higher bits are
      //    unchanged. L_LOOP_PROCEED will advance result by the number of
      //    characters analyzed, so it is enough to clear the lower bits of
      //    result here: 2 bits for UU/UL and 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Rewind str2 to the start of the current octet: result & 7 (LL) or
      //    result & 3 (UU/UL) is the index of the last analyzed position
      //    inside the current octet, and L_LOOP_PROCEED then advances str2 to
      //    the next octet.
 9589       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9590       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9591       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9592       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9593       __ movw(cnt2, cnt2);
 9594       __ b(L_LOOP_PROCEED);
 9595     __ align(OptoLoopAlignment);
 9596     __ BIND(NOMATCH);
 9597       __ mov(result, -1);
 9598     __ BIND(DONE);
 9599       __ pop(spilled_regs, sp);
 9600       __ ret(lr);
 9601     return entry;
 9602   }
 9603 
 9604   void generate_string_indexof_stubs() {
 9605     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9606     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9607     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9608   }
 9609 
 9610   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9611       FloatRegister src1, FloatRegister src2) {
 9612     Register dst = r1;
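    // zip1/zip2 against v0 (which the caller keeps zeroed) interleave each
    // data byte with a zero byte, which is exactly the little-endian widening
    // of a Latin-1 byte to a UTF-16 code unit. The four results cover the 32
    // input bytes in src1/src2 and are stored as 64 bytes at dst.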
 9613     __ zip1(v1, __ T16B, src1, v0);
 9614     __ zip2(v2, __ T16B, src1, v0);
 9615     if (generatePrfm) {
 9616       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9617     }
 9618     __ zip1(v3, __ T16B, src2, v0);
 9619     __ zip2(v4, __ T16B, src2, v0);
 9620     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9621   }
 9622 
 9623   // R0 = src
 9624   // R1 = dst
 9625   // R2 = len
 9626   // R3 = len >> 3
 9627   // V0 = 0
 9628   // v1 = loaded 8 bytes
 9629   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 9630   address generate_large_byte_array_inflate() {
 9631     __ align(CodeEntryAlignment);
 9632     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
 9633     StubCodeMark mark(this, stub_id);
 9634     address entry = __ pc();
 9635     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9636     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9637     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9638 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
 9641     __ ldrd(v2, __ post(src, 8));
 9642     __ sub(octetCounter, octetCounter, 2);
 9643     __ zip1(v1, __ T16B, v1, v0);
 9644     __ zip1(v2, __ T16B, v2, v0);
 9645     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9646     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9647     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9648     __ br(__ LE, LOOP_START);
 9649     __ b(LOOP_PRFM_START);
 9650     __ bind(LOOP_PRFM);
 9651       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9652     __ bind(LOOP_PRFM_START);
 9653       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9654       __ sub(octetCounter, octetCounter, 8);
 9655       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9656       inflate_and_store_2_fp_registers(true, v3, v4);
 9657       inflate_and_store_2_fp_registers(true, v5, v6);
 9658       __ br(__ GT, LOOP_PRFM);
 9659       __ cmp(octetCounter, (u1)8);
 9660       __ br(__ LT, DONE);
 9661     __ bind(LOOP);
 9662       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9663       __ bind(LOOP_START);
 9664       __ sub(octetCounter, octetCounter, 8);
 9665       __ cmp(octetCounter, (u1)8);
 9666       inflate_and_store_2_fp_registers(false, v3, v4);
 9667       inflate_and_store_2_fp_registers(false, v5, v6);
 9668       __ br(__ GE, LOOP);
 9669     __ bind(DONE);
 9670       __ ret(lr);
 9671     return entry;
 9672   }
 9673 
 9674   /**
 9675    *  Arguments:
 9676    *
 9677    *  Input:
 9678    *  c_rarg0   - current state address
 9679    *  c_rarg1   - H key address
 9680    *  c_rarg2   - data address
 9681    *  c_rarg3   - number of blocks
 9682    *
 9683    *  Output:
 9684    *  Updated state at c_rarg0
 9685    */
 9686   address generate_ghash_processBlocks() {
 9687     // Bafflingly, GCM uses little-endian for the byte order, but
 9688     // big-endian for the bit order.  For example, the polynomial 1 is
 9689     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9690     //
 9691     // So, we must either reverse the bytes in each word and do
 9692     // everything big-endian or reverse the bits in each byte and do
 9693     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9694     // the bits in each byte (we have an instruction, RBIT, to do
 9695     // that) and keep the data in little-endian bit order through the
 9696     // calculation, bit-reversing the inputs and outputs.
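    //
    // In other words, for each 16-byte block the stub computes the standard
    // GHASH recurrence  state <- (state ^ block) * H  in GF(2^128), keeping
    // all values in the bit-reversed representation described above.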
 9697 
 9698     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
 9699     StubCodeMark mark(this, stub_id);
 9700     Label polynomial; // local data generated at end of stub
 9701     __ align(CodeEntryAlignment);
 9702     address start = __ pc();
 9703 
 9704     Register state   = c_rarg0;
 9705     Register subkeyH = c_rarg1;
 9706     Register data    = c_rarg2;
 9707     Register blocks  = c_rarg3;
 9708 
 9709     FloatRegister vzr = v30;
 9710     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9711 
 9712     __ adr(rscratch1, polynomial);
 9713     __ ldrq(v24, rscratch1);    // The field polynomial
 9714 
 9715     __ ldrq(v0, Address(state));
 9716     __ ldrq(v1, Address(subkeyH));
 9717 
 9718     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9719     __ rbit(v0, __ T16B, v0);
 9720     __ rev64(v1, __ T16B, v1);
 9721     __ rbit(v1, __ T16B, v1);
 9722 
 9723     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 9724     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 9725 
 9726     {
 9727       Label L_ghash_loop;
 9728       __ bind(L_ghash_loop);
 9729 
 9730       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9731                                                  // reversing each byte
 9732       __ rbit(v2, __ T16B, v2);
 9733       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9734 
 9735       // Multiply state in v2 by subkey in v1
 9736       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9737                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9738                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9739       // Reduce v7:v5 by the field polynomial
 9740       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9741 
 9742       __ sub(blocks, blocks, 1);
 9743       __ cbnz(blocks, L_ghash_loop);
 9744     }
 9745 
 9746     // The bit-reversed result is at this point in v0
 9747     __ rev64(v0, __ T16B, v0);
 9748     __ rbit(v0, __ T16B, v0);
 9749 
 9750     __ st1(v0, __ T16B, state);
 9751     __ ret(lr);
 9752 
 9753     // bind label and generate local polynomial data
 9754     __ align(wordSize * 2);
 9755     __ bind(polynomial);
 9756     __ emit_int64(0x87);  // The low-order bits of the field
 9757                           // polynomial (i.e. p = z^7+z^2+z+1)
 9758                           // repeated in the low and high parts of a
 9759                           // 128-bit vector
 9760     __ emit_int64(0x87);
 9761 
 9762     return start;
 9763   }
 9764 
 9765   address generate_ghash_processBlocks_wide() {
 9766     address small = generate_ghash_processBlocks();
 9767 
 9768     StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
 9769     StubCodeMark mark(this, stub_id);
 9770     Label polynomial;           // local data generated after stub
 9771     __ align(CodeEntryAlignment);
 9772     address start = __ pc();
 9773 
 9774     Register state   = c_rarg0;
 9775     Register subkeyH = c_rarg1;
 9776     Register data    = c_rarg2;
 9777     Register blocks  = c_rarg3;
 9778 
 9779     const int unroll = 4;
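    // Block counts below unroll * 2 are handed off to the non-wide stub
    // generated above.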
 9780 
 9781     __ cmp(blocks, (unsigned char)(unroll * 2));
 9782     __ br(__ LT, small);
 9783 
 9784     if (unroll > 1) {
      // Save the SIMD registers (v8..v15) that the ABI requires us to preserve
 9786       __ sub(sp, sp, 4 * 16);
 9787       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9788       __ sub(sp, sp, 4 * 16);
 9789       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9790     }
 9791 
 9792     __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
 9793 
 9794     if (unroll > 1) {
 9795       // And restore state
 9796       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9797       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9798     }
 9799 
 9800     __ cmp(blocks, (unsigned char)0);
 9801     __ br(__ GT, small);
 9802 
 9803     __ ret(lr);
 9804 
 9805     // bind label and generate polynomial data
 9806     __ align(wordSize * 2);
 9807     __ bind(polynomial);
 9808     __ emit_int64(0x87);  // The low-order bits of the field
 9809                           // polynomial (i.e. p = z^7+z^2+z+1)
 9810                           // repeated in the low and high parts of a
 9811                           // 128-bit vector
 9812     __ emit_int64(0x87);
 9813 
 9814     return start;
 9815 
 9816   }
 9817 
 9818   void generate_base64_encode_simdround(Register src, Register dst,
 9819         FloatRegister codec, u8 size) {
 9820 
 9821     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9822     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9823     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
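    // One SIMD round of the encoder: ld3 de-interleaves 3 * size input bytes
    // into in0/in1/in2, the shift/or sequence below regroups every 3 bytes
    // (24 bits) into four 6-bit indices, tbl maps each index through the
    // 64-byte codec table held in the 4 vector registers starting at 'codec',
    // and st4 interleaves the 4 * size output bytes back into memory.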
 9824 
 9825     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9826 
 9827     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 9828 
 9829     __ ushr(ind0, arrangement, in0,  2);
 9830 
 9831     __ ushr(ind1, arrangement, in1,  2);
 9832     __ shl(in0,   arrangement, in0,  6);
 9833     __ orr(ind1,  arrangement, ind1, in0);
 9834     __ ushr(ind1, arrangement, ind1, 2);
 9835 
 9836     __ ushr(ind2, arrangement, in2,  4);
 9837     __ shl(in1,   arrangement, in1,  4);
 9838     __ orr(ind2,  arrangement, in1,  ind2);
 9839     __ ushr(ind2, arrangement, ind2, 2);
 9840 
 9841     __ shl(ind3,  arrangement, in2,  2);
 9842     __ ushr(ind3, arrangement, ind3, 2);
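    // For example, for the input bytes 'M','a','n' (0x4D, 0x61, 0x6E) the four
    // 6-bit indices are 19, 22, 5 and 46, which the codec table maps to "TWFu".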
 9843 
 9844     __ tbl(out0,  arrangement, codec,  4, ind0);
 9845     __ tbl(out1,  arrangement, codec,  4, ind1);
 9846     __ tbl(out2,  arrangement, codec,  4, ind2);
 9847     __ tbl(out3,  arrangement, codec,  4, ind3);
 9848 
 9849     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9850   }
 9851 
 9852    /**
 9853    *  Arguments:
 9854    *
 9855    *  Input:
 9856    *  c_rarg0   - src_start
 9857    *  c_rarg1   - src_offset
 9858    *  c_rarg2   - src_length
 9859    *  c_rarg3   - dest_start
 9860    *  c_rarg4   - dest_offset
 9861    *  c_rarg5   - isURL
 9862    *
 9863    */
 9864   address generate_base64_encodeBlock() {
 9865 
 9866     static const char toBase64[64] = {
 9867       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9868       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9869       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9870       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9871       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9872     };
 9873 
 9874     static const char toBase64URL[64] = {
 9875       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9876       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9877       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9878       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9879       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9880     };
 9881 
 9882     __ align(CodeEntryAlignment);
 9883     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
 9884     StubCodeMark mark(this, stub_id);
 9885     address start = __ pc();
 9886 
 9887     Register src   = c_rarg0;  // source array
 9888     Register soff  = c_rarg1;  // source start offset
 9889     Register send  = c_rarg2;  // source end offset
 9890     Register dst   = c_rarg3;  // dest array
 9891     Register doff  = c_rarg4;  // position for writing to dest array
 9892     Register isURL = c_rarg5;  // Base64 or URL character set
 9893 
 9894     // c_rarg6 and c_rarg7 are free to use as temps
 9895     Register codec  = c_rarg6;
 9896     Register length = c_rarg7;
 9897 
 9898     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9899 
 9900     __ add(src, src, soff);
 9901     __ add(dst, dst, doff);
 9902     __ sub(length, send, soff);
 9903 
 9904     // load the codec base address
 9905     __ lea(codec, ExternalAddress((address) toBase64));
 9906     __ cbz(isURL, ProcessData);
 9907     __ lea(codec, ExternalAddress((address) toBase64URL));
 9908 
 9909     __ BIND(ProcessData);
 9910 
    // too short to set up a SIMD loop; fall back to the byte-at-a-time path
 9912     __ cmp(length, (u1)24);
 9913     __ br(Assembler::LT, Process3B);
 9914 
 9915     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9916 
 9917     __ BIND(Process48B);
 9918     __ cmp(length, (u1)48);
 9919     __ br(Assembler::LT, Process24B);
 9920     generate_base64_encode_simdround(src, dst, v0, 16);
 9921     __ sub(length, length, 48);
 9922     __ b(Process48B);
 9923 
 9924     __ BIND(Process24B);
 9925     __ cmp(length, (u1)24);
 9926     __ br(Assembler::LT, SIMDExit);
 9927     generate_base64_encode_simdround(src, dst, v0, 8);
 9928     __ sub(length, length, 24);
 9929 
 9930     __ BIND(SIMDExit);
 9931     __ cbz(length, Exit);
 9932 
 9933     __ BIND(Process3B);
 9934     //  3 src bytes, 24 bits
 9935     __ ldrb(r10, __ post(src, 1));
 9936     __ ldrb(r11, __ post(src, 1));
 9937     __ ldrb(r12, __ post(src, 1));
 9938     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9939     __ orrw(r12, r12, r11, Assembler::LSL, 8);
 9940     // codec index
 9941     __ ubfmw(r15, r12, 18, 23);
 9942     __ ubfmw(r14, r12, 12, 17);
 9943     __ ubfmw(r13, r12, 6,  11);
 9944     __ andw(r12,  r12, 63);
 9945     // get the code based on the codec
 9946     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9947     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9948     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9949     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9950     __ strb(r15, __ post(dst, 1));
 9951     __ strb(r14, __ post(dst, 1));
 9952     __ strb(r13, __ post(dst, 1));
 9953     __ strb(r12, __ post(dst, 1));
 9954     __ sub(length, length, 3);
 9955     __ cbnz(length, Process3B);
 9956 
 9957     __ BIND(Exit);
 9958     __ ret(lr);
 9959 
 9960     return start;
 9961   }
 9962 
 9963   void generate_base64_decode_simdround(Register src, Register dst,
 9964         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9965 
 9966     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9967     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9968 
 9969     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9970     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9971 
 9972     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9973 
 9974     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9975 
 9976     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9977 
    // We need an unsigned saturating subtract so that all input values in the
    // range [0, 63] produce a 0 index for the higher-half lookup.
 9980     __ uqsubv(decH0, __ T16B, in0, v27);
 9981     __ uqsubv(decH1, __ T16B, in1, v27);
 9982     __ uqsubv(decH2, __ T16B, in2, v27);
 9983     __ uqsubv(decH3, __ T16B, in3, v27);
 9984 
 9985     // lower half lookup
 9986     __ tbl(decL0, arrangement, codecL, 4, in0);
 9987     __ tbl(decL1, arrangement, codecL, 4, in1);
 9988     __ tbl(decL2, arrangement, codecL, 4, in2);
 9989     __ tbl(decL3, arrangement, codecL, 4, in3);
 9990 
 9991     // higher half lookup
 9992     __ tbx(decH0, arrangement, codecH, 4, decH0);
 9993     __ tbx(decH1, arrangement, codecH, 4, decH1);
 9994     __ tbx(decH2, arrangement, codecH, 4, decH2);
 9995     __ tbx(decH3, arrangement, codecH, 4, decH3);
 9996 
 9997     // combine lower and higher
 9998     __ orr(decL0, arrangement, decL0, decH0);
 9999     __ orr(decL1, arrangement, decL1, decH1);
10000     __ orr(decL2, arrangement, decL2, decH2);
10001     __ orr(decL3, arrangement, decL3, decH3);
10002 
    // check for illegal inputs: any value larger than 63 (the maximum for 6 bits)
10004     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10005     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10006     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10007     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10008     __ orr(in0, arrangement, decH0, decH1);
10009     __ orr(in1, arrangement, decH2, decH3);
10010     __ orr(in2, arrangement, in0,   in1);
10011     __ umaxv(in3, arrangement, in2);
10012     __ umov(rscratch2, in3, __ B, 0);
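    // rscratch2 is now non-zero iff some lane decoded to a value greater
    // than 63, i.e. at least one input byte was not a valid Base64 character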
10013 
10014     // get the data to output
10015     __ shl(out0,  arrangement, decL0, 2);
10016     __ ushr(out1, arrangement, decL1, 4);
10017     __ orr(out0,  arrangement, out0,  out1);
10018     __ shl(out1,  arrangement, decL1, 4);
10019     __ ushr(out2, arrangement, decL2, 2);
10020     __ orr(out1,  arrangement, out1,  out2);
10021     __ shl(out2,  arrangement, decL2, 6);
10022     __ orr(out2,  arrangement, out2,  decL3);
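    // out0..out2 now hold the three decoded bytes for each group of four
    // input characters: out0 = a<<2 | b>>4, out1 = b<<4 | c>>2, out2 = c<<6 | d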
10023 
10024     __ cbz(rscratch2, NoIllegalData);
10025 
10026     // handle illegal input
10027     __ umov(r10, in2, __ D, 0);
10028     if (size == 16) {
10029       __ cbnz(r10, ErrorInLowerHalf);
10030 
10031       // illegal input is in higher half, store the lower half now.
10032       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10033 
10034       __ umov(r10, in2,  __ D, 1);
10035       __ umov(r11, out0, __ D, 1);
10036       __ umov(r12, out1, __ D, 1);
10037       __ umov(r13, out2, __ D, 1);
10038       __ b(StoreLegalData);
10039 
10040       __ BIND(ErrorInLowerHalf);
10041     }
10042     __ umov(r11, out0, __ D, 0);
10043     __ umov(r12, out1, __ D, 0);
10044     __ umov(r13, out2, __ D, 0);
10045 
10046     __ BIND(StoreLegalData);
10047     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10048     __ strb(r11, __ post(dst, 1));
10049     __ strb(r12, __ post(dst, 1));
10050     __ strb(r13, __ post(dst, 1));
10051     __ lsr(r10, r10, 8);
10052     __ lsr(r11, r11, 8);
10053     __ lsr(r12, r12, 8);
10054     __ lsr(r13, r13, 8);
10055     __ b(StoreLegalData);
10056 
10057     __ BIND(NoIllegalData);
10058     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10059   }
10060 
10061 
10062    /**
10063    *  Arguments:
10064    *
10065    *  Input:
10066    *  c_rarg0   - src_start
10067    *  c_rarg1   - src_offset
10068    *  c_rarg2   - src_length
10069    *  c_rarg3   - dest_start
10070    *  c_rarg4   - dest_offset
10071    *  c_rarg5   - isURL
10072    *  c_rarg6   - isMIME
10073    *
10074    */
10075   address generate_base64_decodeBlock() {
10076 
    // The SIMD part of this Base64 decode intrinsic is based on the algorithm
    // outlined at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords,
    // in the section titled "Base64 decoding".

    // The non-SIMD lookup tables are mostly copied from the fromBase64 array used
    // in java.util.Base64, except that the padding character '=' is also treated
    // as an illegal value in this intrinsic: java.util.Base64.fromBase64['='] is -2,
    // while fromBase(URL)64ForNoSIMD['='] is 255u here.
10084     static const uint8_t fromBase64ForNoSIMD[256] = {
10085       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10086       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10087       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10088        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10089       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10090        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
10091       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10092        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10093       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10094       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10095       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10096       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10097       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10098       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10099       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10100       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10101     };
10102 
10103     static const uint8_t fromBase64URLForNoSIMD[256] = {
10104       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10105       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10106       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10107        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10108       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10109        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
10110       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10111        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10112       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10113       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10114       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10115       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10116       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10117       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10118       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10119       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10120     };
10121 
    // A legal Base64 code value is in the range [0, 127]. We need two table
    // lookups with tbl/tbx and combine their results to get the decoded data.
    // The first lookup uses tbl: out-of-range indices produce 0 in the
    // destination. The second lookup uses tbx: out-of-range indices leave the
    // destination unchanged. Inputs in [64, 126] map to positions [65, 127] of
    // the table in the second lookup, and position 64 holds 0, so a zero there
    // tells us the first lookup already produced the decoded data.
10129     static const uint8_t fromBase64ForSIMD[128] = {
10130       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10131       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10132       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10133        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10134         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10135        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10136       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10137        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10138     };
10139 
10140     static const uint8_t fromBase64URLForSIMD[128] = {
10141       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10142       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10143       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10144        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10145         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10146        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10147        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10148        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10149     };
10150 
10151     __ align(CodeEntryAlignment);
10152     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10153     StubCodeMark mark(this, stub_id);
10154     address start = __ pc();
10155 
10156     Register src    = c_rarg0;  // source array
10157     Register soff   = c_rarg1;  // source start offset
10158     Register send   = c_rarg2;  // source end offset
10159     Register dst    = c_rarg3;  // dest array
10160     Register doff   = c_rarg4;  // position for writing to dest array
10161     Register isURL  = c_rarg5;  // Base64 or URL character set
10162     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10163 
10164     Register length = send;    // reuse send as length of source data to process
10165 
10166     Register simd_codec   = c_rarg6;
10167     Register nosimd_codec = c_rarg7;
10168 
10169     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10170 
10171     __ enter();
10172 
10173     __ add(src, src, soff);
10174     __ add(dst, dst, doff);
10175 
10176     __ mov(doff, dst);
10177 
10178     __ sub(length, send, soff);
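    // round the length down to a multiple of 4: the decoder consumes the
    // input four Base64 characters at a time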
10179     __ bfm(length, zr, 0, 1);
10180 
10181     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10182     __ cbz(isURL, ProcessData);
10183     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10184 
10185     __ BIND(ProcessData);
10186     __ mov(rscratch1, length);
10187     __ cmp(length, (u1)144); // 144 = 80 + 64
10188     __ br(Assembler::LT, Process4B);
10189 
10190     // In the MIME case, the line length cannot be more than 76
10191     // bytes (see RFC 2045). This is too short a block for SIMD
10192     // to be worthwhile, so we use non-SIMD here.
10193     __ movw(rscratch1, 79);
10194 
10195     __ BIND(Process4B);
10196     __ ldrw(r14, __ post(src, 4));
10197     __ ubfxw(r10, r14, 0,  8);
10198     __ ubfxw(r11, r14, 8,  8);
10199     __ ubfxw(r12, r14, 16, 8);
10200     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
10202     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10203     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10204     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10205     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10206     // error detection, 255u indicates an illegal input
10207     __ orrw(r14, r10, r11);
10208     __ orrw(r15, r12, r13);
10209     __ orrw(r14, r14, r15);
10210     __ tbnz(r14, 7, Exit);
10211     // recover the data
10212     __ lslw(r14, r10, 10);
10213     __ bfiw(r14, r11, 4, 6);
10214     __ bfmw(r14, r12, 2, 5);
10215     __ rev16w(r14, r14);
10216     __ bfiw(r13, r12, 6, 2);
10217     __ strh(r14, __ post(dst, 2));
10218     __ strb(r13, __ post(dst, 1));
10219     // non-simd loop
10220     __ subsw(rscratch1, rscratch1, 4);
10221     __ br(Assembler::GT, Process4B);
10222 
    // If we exited from the 80-byte MIME pre-processing pass above (where
    // rscratch1 started at 79), rscratch1 == -1; otherwise rscratch1 == 0.
10225     __ cbzw(rscratch1, Exit);
10226     __ sub(length, length, 80);
10227 
10228     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10229     __ cbz(isURL, SIMDEnter);
10230     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10231 
10232     __ BIND(SIMDEnter);
10233     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10234     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10235     __ mov(rscratch1, 63);
10236     __ dup(v27, __ T16B, rscratch1);
10237 
10238     __ BIND(Process64B);
10239     __ cmp(length, (u1)64);
10240     __ br(Assembler::LT, Process32B);
10241     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10242     __ sub(length, length, 64);
10243     __ b(Process64B);
10244 
10245     __ BIND(Process32B);
10246     __ cmp(length, (u1)32);
10247     __ br(Assembler::LT, SIMDExit);
10248     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10249     __ sub(length, length, 32);
10250     __ b(Process32B);
10251 
10252     __ BIND(SIMDExit);
10253     __ cbz(length, Exit);
10254     __ movw(rscratch1, length);
10255     __ b(Process4B);
10256 
10257     __ BIND(Exit);
10258     __ sub(c_rarg0, dst, doff);
10259 
10260     __ leave();
10261     __ ret(lr);
10262 
10263     return start;
10264   }
10265 
10266   // Support for spin waits.
10267   address generate_spin_wait() {
10268     __ align(CodeEntryAlignment);
10269     StubId stub_id = StubId::stubgen_spin_wait_id;
10270     StubCodeMark mark(this, stub_id);
10271     address start = __ pc();
10272 
10273     __ spin_wait();
10274     __ ret(lr);
10275 
10276     return start;
10277   }
10278 
10279   void generate_lookup_secondary_supers_table_stub() {
10280     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10281     StubCodeMark mark(this, stub_id);
10282 
10283     const Register
10284       r_super_klass  = r0,
10285       r_array_base   = r1,
10286       r_array_length = r2,
10287       r_array_index  = r3,
10288       r_sub_klass    = r4,
10289       r_bitmap       = rscratch2,
10290       result         = r5;
10291     const FloatRegister
10292       vtemp          = v0;
10293 
10294     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10295       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10296       Label L_success;
10297       __ enter();
10298       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10299                                              r_array_base, r_array_length, r_array_index,
10300                                              vtemp, result, slot,
10301                                              /*stub_is_near*/true);
10302       __ leave();
10303       __ ret(lr);
10304     }
10305   }
10306 
10307   // Slow path implementation for UseSecondarySupersTable.
10308   address generate_lookup_secondary_supers_table_slow_path_stub() {
10309     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10310     StubCodeMark mark(this, stub_id);
10311 
10312     address start = __ pc();
10313     const Register
10314       r_super_klass  = r0,        // argument
10315       r_array_base   = r1,        // argument
10316       temp1          = r2,        // temp
10317       r_array_index  = r3,        // argument
10318       r_bitmap       = rscratch2, // argument
10319       result         = r5;        // argument
10320 
10321     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10322     __ ret(lr);
10323 
10324     return start;
10325   }
10326 
10327 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10328 
10329   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10330   //
10331   // If LSE is in use, generate LSE versions of all the stubs. The
10332   // non-LSE versions are in atomic_aarch64.S.
10333 
10334   // class AtomicStubMark records the entry point of a stub and the
10335   // stub pointer which will point to it. The stub pointer is set to
10336   // the entry point when ~AtomicStubMark() is called, which must be
10337   // after ICache::invalidate_range. This ensures safe publication of
10338   // the generated code.
10339   class AtomicStubMark {
10340     address _entry_point;
10341     aarch64_atomic_stub_t *_stub;
10342     MacroAssembler *_masm;
10343   public:
10344     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10345       _masm = masm;
10346       __ align(32);
10347       _entry_point = __ pc();
10348       _stub = stub;
10349     }
10350     ~AtomicStubMark() {
10351       *_stub = (aarch64_atomic_stub_t)_entry_point;
10352     }
10353   };
10354 
10355   // NB: For memory_order_conservative we need a trailing membar after
10356   // LSE atomic operations but not a leading membar.
10357   //
10358   // We don't need a leading membar because a clause in the Arm ARM
10359   // says:
10360   //
10361   //   Barrier-ordered-before
10362   //
10363   //   Barrier instructions order prior Memory effects before subsequent
10364   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10368   //   instruction with both Acquire and Release semantics.
10369   //
10370   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10371   // and Release semantics, therefore we don't need a leading
10372   // barrier. However, there is no corresponding Barrier-ordered-after
10373   // relationship, therefore we need a trailing membar to prevent a
10374   // later store or load from being reordered with the store in an
10375   // atomic instruction.
10376   //
10377   // This was checked by using the herd7 consistency model simulator
10378   // (http://diy.inria.fr/) with this test case:
10379   //
10380   // AArch64 LseCas
10381   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10382   // P0 | P1;
10383   // LDR W4, [X2] | MOV W3, #0;
10384   // DMB LD       | MOV W4, #1;
10385   // LDR W3, [X1] | CASAL W3, W4, [X1];
10386   //              | DMB ISH;
10387   //              | STR W4, [X2];
10388   // exists
10389   // (0:X3=0 /\ 0:X4=1)
10390   //
10391   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10392   // with the store to x in P1. Without the DMB in P1 this may happen.
10393   //
10394   // At the time of writing we don't know of any AArch64 hardware that
10395   // reorders stores in this way, but the Reference Manual permits it.
10396 
10397   void gen_cas_entry(Assembler::operand_size size,
10398                      atomic_memory_order order) {
10399     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10400       exchange_val = c_rarg2;
10401     bool acquire, release;
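    // Map the requested memory order onto the CAS acquire/release flags:
    // relaxed sets neither, release sets release only, and anything stronger
    // (acquire, acq_rel, seq_cst, conservative) sets both.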
10402     switch (order) {
10403       case memory_order_relaxed:
10404         acquire = false;
10405         release = false;
10406         break;
10407       case memory_order_release:
10408         acquire = false;
10409         release = true;
10410         break;
10411       default:
10412         acquire = true;
10413         release = true;
10414         break;
10415     }
10416     __ mov(prev, compare_val);
10417     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10418     if (order == memory_order_conservative) {
10419       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10420     }
10421     if (size == Assembler::xword) {
10422       __ mov(r0, prev);
10423     } else {
10424       __ movw(r0, prev);
10425     }
10426     __ ret(lr);
10427   }
10428 
10429   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10430     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10431     // If not relaxed, then default to conservative.  Relaxed is the only
10432     // case we use enough to be worth specializing.
10433     if (order == memory_order_relaxed) {
10434       __ ldadd(size, incr, prev, addr);
10435     } else {
10436       __ ldaddal(size, incr, prev, addr);
10437       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10438     }
10439     if (size == Assembler::xword) {
10440       __ mov(r0, prev);
10441     } else {
10442       __ movw(r0, prev);
10443     }
10444     __ ret(lr);
10445   }
10446 
10447   void gen_swpal_entry(Assembler::operand_size size) {
10448     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10449     __ swpal(size, incr, prev, addr);
10450     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10451     if (size == Assembler::xword) {
10452       __ mov(r0, prev);
10453     } else {
10454       __ movw(r0, prev);
10455     }
10456     __ ret(lr);
10457   }
10458 
10459   void generate_atomic_entry_points() {
10460     if (! UseLSE) {
10461       return;
10462     }
10463     __ align(CodeEntryAlignment);
10464     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10465     StubCodeMark mark(this, stub_id);
10466     address first_entry = __ pc();
10467 
10468     // ADD, memory_order_conservative
10469     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10470     gen_ldadd_entry(Assembler::word, memory_order_conservative);
10471     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10472     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10473 
10474     // ADD, memory_order_relaxed
10475     AtomicStubMark mark_fetch_add_4_relaxed
10476       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10477     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10478     AtomicStubMark mark_fetch_add_8_relaxed
10479       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10480     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10481 
10482     // XCHG, memory_order_conservative
10483     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10484     gen_swpal_entry(Assembler::word);
10485     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10486     gen_swpal_entry(Assembler::xword);
10487 
10488     // CAS, memory_order_conservative
10489     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10490     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10491     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10492     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10493     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10494     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10495 
10496     // CAS, memory_order_relaxed
10497     AtomicStubMark mark_cmpxchg_1_relaxed
10498       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10499     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10500     AtomicStubMark mark_cmpxchg_4_relaxed
10501       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10502     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10503     AtomicStubMark mark_cmpxchg_8_relaxed
10504       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10505     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10506 
10507     AtomicStubMark mark_cmpxchg_4_release
10508       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10509     gen_cas_entry(MacroAssembler::word, memory_order_release);
10510     AtomicStubMark mark_cmpxchg_8_release
10511       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10512     gen_cas_entry(MacroAssembler::xword, memory_order_release);
10513 
10514     AtomicStubMark mark_cmpxchg_4_seq_cst
10515       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10516     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10517     AtomicStubMark mark_cmpxchg_8_seq_cst
10518       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10519     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10520 
10521     ICache::invalidate_range(first_entry, __ pc() - first_entry);
10522   }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
10524 
10525   address generate_cont_thaw(Continuation::thaw_kind kind) {
10526     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10527     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10528 
10529     address start = __ pc();
10530 
10531     if (return_barrier) {
10532       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10533       __ mov(sp, rscratch1);
10534     }
10535     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10536 
10537     if (return_barrier) {
10538       // preserve possible return value from a method returning to the return barrier
10539       __ fmovd(rscratch1, v0);
10540       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10541     }
10542 
10543     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10544     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10545     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10546 
10547     if (return_barrier) {
10548       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10549       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10550       __ fmovd(v0, rscratch1);
10551     }
10552     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10553 
10554 
10555     Label thaw_success;
10556     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10557     __ cbnz(rscratch2, thaw_success);
10558     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10559     __ br(rscratch1);
10560     __ bind(thaw_success);
10561 
10562     // make room for the thawed frames
10563     __ sub(rscratch1, sp, rscratch2);
10564     __ andr(rscratch1, rscratch1, -16); // align
10565     __ mov(sp, rscratch1);
10566 
10567     if (return_barrier) {
10568       // save original return value -- again
10569       __ fmovd(rscratch1, v0);
10570       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10571     }
10572 
10573     // If we want, we can templatize thaw by kind, and have three different entries
10574     __ movw(c_rarg1, (uint32_t)kind);
10575 
10576     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10577     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10578 
10579     if (return_barrier) {
10580       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10581       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10582       __ fmovd(v0, rscratch1);
10583     } else {
10584       __ mov(r0, zr); // return 0 (success) from doYield
10585     }
10586 
    // we're now on the yield frame (which is at a higher address than us because sp has been pushed down)
10588     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10589     __ mov(rfp, sp);
10590 
10591     if (return_barrier_exception) {
10592       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10593       __ authenticate_return_address(c_rarg1);
10594       __ verify_oop(r0);
10595       // save return value containing the exception oop in callee-saved R19
10596       __ mov(r19, r0);
10597 
10598       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10599 
10600       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10601       // __ reinitialize_ptrue();
10602 
10603       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10604 
10605       __ mov(r1, r0); // the exception handler
10606       __ mov(r0, r19); // restore return value containing the exception oop
10607       __ verify_oop(r0);
10608 
10609       __ leave();
10610       __ mov(r3, lr);
10611       __ br(r1); // the exception handler
10612     } else {
10613       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10614       __ leave();
10615       __ ret(lr);
10616     }
10617 
10618     return start;
10619   }
10620 
10621   address generate_cont_thaw() {
10622     if (!Continuations::enabled()) return nullptr;
10623 
10624     StubId stub_id = StubId::stubgen_cont_thaw_id;
10625     StubCodeMark mark(this, stub_id);
10626     address start = __ pc();
10627     generate_cont_thaw(Continuation::thaw_top);
10628     return start;
10629   }
10630 
10631   address generate_cont_returnBarrier() {
10632     if (!Continuations::enabled()) return nullptr;
10633 
10634     // TODO: will probably need multiple return barriers depending on return type
10635     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10636     StubCodeMark mark(this, stub_id);
10637     address start = __ pc();
10638 
10639     generate_cont_thaw(Continuation::thaw_return_barrier);
10640 
10641     return start;
10642   }
10643 
10644   address generate_cont_returnBarrier_exception() {
10645     if (!Continuations::enabled()) return nullptr;
10646 
10647     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10648     StubCodeMark mark(this, stub_id);
10649     address start = __ pc();
10650 
10651     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10652 
10653     return start;
10654   }
10655 
10656   address generate_cont_preempt_stub() {
10657     if (!Continuations::enabled()) return nullptr;
10658     StubId stub_id = StubId::stubgen_cont_preempt_id;
10659     StubCodeMark mark(this, stub_id);
10660     address start = __ pc();
10661 
10662     __ reset_last_Java_frame(true);
10663 
10664     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10665     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10666     __ mov(sp, rscratch2);
10667 
10668     Label preemption_cancelled;
10669     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10670     __ cbnz(rscratch1, preemption_cancelled);
10671 
10672     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10673     SharedRuntime::continuation_enter_cleanup(_masm);
10674     __ leave();
10675     __ ret(lr);
10676 
10677     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10678     __ bind(preemption_cancelled);
10679     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10680     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10681     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10682     __ ldr(rscratch1, Address(rscratch1));
10683     __ br(rscratch1);
10684 
10685     return start;
10686   }
10687 
10688   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10689   // are represented as long[5], with BITS_PER_LIMB = 26.
10690   // Pack five 26-bit limbs into three 64-bit registers.
10691   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10692     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10693     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10694     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10695     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10696 
10697     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10698     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10699     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10700     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10701 
10702     if (dest2->is_valid()) {
10703       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10704     } else {
10705 #ifdef ASSERT
10706       Label OK;
10707       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10708       __ br(__ EQ, OK);
10709       __ stop("high bits of Poly1305 integer should be zero");
10710       __ should_not_reach_here();
10711       __ bind(OK);
10712 #endif
10713     }
10714   }
10715 
10716   // As above, but return only a 128-bit integer, packed into two
10717   // 64-bit registers.
10718   void pack_26(Register dest0, Register dest1, Register src) {
10719     pack_26(dest0, dest1, noreg, src);
10720   }
10721 
10722   // Multiply and multiply-accumulate unsigned 64-bit registers.
10723   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10724     __ mul(prod_lo, n, m);
10725     __ umulh(prod_hi, n, m);
10726   }
10727   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10728     wide_mul(rscratch1, rscratch2, n, m);
10729     __ adds(sum_lo, sum_lo, rscratch1);
10730     __ adc(sum_hi, sum_hi, rscratch2);
10731   }
10732 
10733   // Poly1305, RFC 7539
10734 
10735   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10736   // description of the tricks used to simplify and accelerate this
10737   // computation.
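  //
  // In outline, the stub below computes the standard Poly1305 block
  // recurrence (a reference sketch only: u320, load_le128, pack26 and
  // store26 are hypothetical helpers, and the generated code keeps the
  // accumulator only partially reduced, i.e. congruent to acc mod p):
  //
  //   u320 p   = ((u320)1 << 130) - 5;        // the Poly1305 prime
  //   u320 acc = pack26(acc_start);
  //   u320 r   = pack26(r_start);             // clamped key
  //   while (length >= 16) {
  //     u320 block = load_le128(input) + ((u320)1 << 128);
  //     acc = (acc + block) * r % p;
  //     input += 16; length -= 16;
  //   }
  //   store26(acc_start, acc);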
10738 
10739   address generate_poly1305_processBlocks() {
10740     __ align(CodeEntryAlignment);
10741     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10742     StubCodeMark mark(this, stub_id);
10743     address start = __ pc();
10744     Label here;
10745     __ enter();
10746     RegSet callee_saved = RegSet::range(r19, r28);
10747     __ push(callee_saved, sp);
10748 
10749     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10750 
10751     // Arguments
10752     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10753 
10754     // R_n is the 128-bit randomly-generated key, packed into two
10755     // registers.  The caller passes this key to us as long[5], with
10756     // BITS_PER_LIMB = 26.
10757     const Register R_0 = *++regs, R_1 = *++regs;
10758     pack_26(R_0, R_1, r_start);
10759 
10760     // RR_n is (R_n >> 2) * 5
10761     const Register RR_0 = *++regs, RR_1 = *++regs;
10762     __ lsr(RR_0, R_0, 2);
10763     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10764     __ lsr(RR_1, R_1, 2);
10765     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10766 
10767     // U_n is the current checksum
10768     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10769     pack_26(U_0, U_1, U_2, acc_start);
10770 
10771     static constexpr int BLOCK_LENGTH = 16;
10772     Label DONE, LOOP;
10773 
10774     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10775     __ br(Assembler::LT, DONE); {
10776       __ bind(LOOP);
10777 
10778       // S_n is to be the sum of U_n and the next block of data
10779       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10780       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10781       __ adds(S_0, U_0, S_0);
10782       __ adcs(S_1, U_1, S_1);
10783       __ adc(S_2, U_2, zr);
10784       __ add(S_2, S_2, 1);
10785 
10786       const Register U_0HI = *++regs, U_1HI = *++regs;
10787 
10788       // NB: this logic depends on some of the special properties of
10789       // Poly1305 keys. In particular, because we know that the top
10790       // four bits of R_0 and R_1 are zero, we can add together
10791       // partial products without any risk of needing to propagate a
10792       // carry out.
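      // (Informally: clamping leaves R_0, R_1 < 2^60 and hence RR_0, RR_1 < 2^61,
      // so each 128-bit partial product is below 2^125 and the sum of three of
      // them stays below 2^126, well within 128 bits.)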
10793       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10794       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10795       __ andr(U_2, R_0, 3);
10796       __ mul(U_2, S_2, U_2);
10797 
10798       // Recycle registers S_0, S_1, S_2
10799       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10800 
10801       // Partial reduction mod 2**130 - 5
10802       __ adds(U_1, U_0HI, U_1);
10803       __ adc(U_2, U_1HI, U_2);
10804       // Sum now in U_2:U_1:U_0.
10805       // Dead: U_0HI, U_1HI.
10806       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10807 
10808       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10809 
10810       // First, U_2:U_1:U_0 += (U_2 >> 2)
10811       __ lsr(rscratch1, U_2, 2);
10812       __ andr(U_2, U_2, (u8)3);
10813       __ adds(U_0, U_0, rscratch1);
10814       __ adcs(U_1, U_1, zr);
10815       __ adc(U_2, U_2, zr);
10816       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10817       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10818       __ adcs(U_1, U_1, zr);
10819       __ adc(U_2, U_2, zr);
10820 
10821       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10822       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10823       __ br(~ Assembler::LT, LOOP);
10824     }
10825 
10826     // Further reduce modulo 2^130 - 5
10827     __ lsr(rscratch1, U_2, 2);
10828     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10829     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10830     __ adcs(U_1, U_1, zr);
10831     __ andr(U_2, U_2, (u1)3);
10832     __ adc(U_2, U_2, zr);
10833 
10834     // Unpack the sum into five 26-bit limbs and write to memory.
10835     __ ubfiz(rscratch1, U_0, 0, 26);
10836     __ ubfx(rscratch2, U_0, 26, 26);
10837     __ stp(rscratch1, rscratch2, Address(acc_start));
10838     __ ubfx(rscratch1, U_0, 52, 12);
10839     __ bfi(rscratch1, U_1, 12, 14);
10840     __ ubfx(rscratch2, U_1, 14, 26);
10841     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10842     __ ubfx(rscratch1, U_1, 40, 24);
10843     __ bfi(rscratch1, U_2, 24, 3);
10844     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10845 
10846     __ bind(DONE);
10847     __ pop(callee_saved, sp);
10848     __ leave();
10849     __ ret(lr);
10850 
10851     return start;
10852   }
10853 
10854   // exception handler for upcall stubs
10855   address generate_upcall_stub_exception_handler() {
10856     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10857     StubCodeMark mark(this, stub_id);
10858     address start = __ pc();
10859 
10860     // Native caller has no idea how to handle exceptions,
    // so we just crash here. It is up to the callee to catch exceptions.
10862     __ verify_oop(r0);
10863     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10864     __ blr(rscratch1);
10865     __ should_not_reach_here();
10866 
10867     return start;
10868   }
10869 
10870   // load Method* target of MethodHandle
10871   // j_rarg0 = jobject receiver
10872   // rmethod = result
10873   address generate_upcall_stub_load_target() {
10874     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10875     StubCodeMark mark(this, stub_id);
10876     address start = __ pc();
10877 
10878     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
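    // In Java terms, the chain of loads below is approximately:
    //   rmethod = ((MethodHandle)receiver).form.vmentry.method.vmtarget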
    // Load target method from receiver
10880     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10881     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10882     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10883     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10884                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10885                       noreg, noreg);
10886     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10887 
10888     __ ret(lr);
10889 
10890     return start;
10891   }
10892 
10893 #undef __
10894 #define __ masm->
10895 
10896   class MontgomeryMultiplyGenerator : public MacroAssembler {
10897 
10898     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10899       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10900 
10901     RegSet _toSave;
10902     bool _squaring;
10903 
10904   public:
10905     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10906       : MacroAssembler(as->code()), _squaring(squaring) {
10907 
10908       // Register allocation
10909 
10910       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10911       Pa_base = *regs;       // Argument registers
10912       if (squaring)
10913         Pb_base = Pa_base;
10914       else
10915         Pb_base = *++regs;
10916       Pn_base = *++regs;
      Rlen = *++regs;
10918       inv = *++regs;
10919       Pm_base = *++regs;
10920 
10921                           // Working registers:
10922       Ra =  *++regs;        // The current digit of a, b, n, and m.
10923       Rb =  *++regs;
10924       Rm =  *++regs;
10925       Rn =  *++regs;
10926 
10927       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10928       Pb =  *++regs;
10929       Pm =  *++regs;
10930       Pn =  *++regs;
10931 
10932       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
10934       t2 =  *++regs;
10935 
10936       Ri =  *++regs;        // Inner and outer loop indexes.
10937       Rj =  *++regs;
10938 
10939       Rhi_ab = *++regs;     // Product registers: low and high parts
10940       Rlo_ab = *++regs;     // of a*b and m*n.
10941       Rhi_mn = *++regs;
10942       Rlo_mn = *++regs;
10943 
10944       // r19 and up are callee-saved.
10945       _toSave = RegSet::range(r19, *regs) + Pm_base;
10946     }
10947 
10948   private:
10949     void save_regs() {
10950       push(_toSave, sp);
10951     }
10952 
10953     void restore_regs() {
10954       pop(_toSave, sp);
10955     }
10956 
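    // Emit the code generated by `block` so that it executes `count` times:
    // the body is unrolled twice, an odd count enters at the second copy,
    // and a zero count skips the loop entirely.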
10957     template <typename T>
10958     void unroll_2(Register count, T block) {
10959       Label loop, end, odd;
10960       tbnz(count, 0, odd);
10961       cbz(count, end);
10962       align(16);
10963       bind(loop);
10964       (this->*block)();
10965       bind(odd);
10966       (this->*block)();
10967       subs(count, count, 2);
10968       br(Assembler::GT, loop);
10969       bind(end);
10970     }
10971 
10972     template <typename T>
10973     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10974       Label loop, end, odd;
10975       tbnz(count, 0, odd);
10976       cbz(count, end);
10977       align(16);
10978       bind(loop);
10979       (this->*block)(d, s, tmp);
10980       bind(odd);
10981       (this->*block)(d, s, tmp);
10982       subs(count, count, 2);
10983       br(Assembler::GT, loop);
10984       bind(end);
10985     }
10986 
10987     void pre1(RegisterOrConstant i) {
10988       block_comment("pre1");
10989       // Pa = Pa_base;
10990       // Pb = Pb_base + i;
10991       // Pm = Pm_base;
10992       // Pn = Pn_base + i;
10993       // Ra = *Pa;
10994       // Rb = *Pb;
10995       // Rm = *Pm;
10996       // Rn = *Pn;
10997       ldr(Ra, Address(Pa_base));
10998       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10999       ldr(Rm, Address(Pm_base));
11000       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11001       lea(Pa, Address(Pa_base));
11002       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11003       lea(Pm, Address(Pm_base));
11004       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11005 
11006       // Zero the m*n result.
11007       mov(Rhi_mn, zr);
11008       mov(Rlo_mn, zr);
11009     }
11010 
11011     // The core multiply-accumulate step of a Montgomery
11012     // multiplication.  The idea is to schedule operations as a
11013     // pipeline so that instructions with long latencies (loads and
11014     // multiplies) have time to complete before their results are
11015     // used.  This most benefits in-order implementations of the
11016     // architecture but out-of-order ones also benefit.
11017     void step() {
11018       block_comment("step");
11019       // MACC(Ra, Rb, t0, t1, t2);
11020       // Ra = *++Pa;
11021       // Rb = *--Pb;
11022       umulh(Rhi_ab, Ra, Rb);
11023       mul(Rlo_ab, Ra, Rb);
11024       ldr(Ra, pre(Pa, wordSize));
11025       ldr(Rb, pre(Pb, -wordSize));
11026       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11027                                        // previous iteration.
11028       // MACC(Rm, Rn, t0, t1, t2);
11029       // Rm = *++Pm;
11030       // Rn = *--Pn;
11031       umulh(Rhi_mn, Rm, Rn);
11032       mul(Rlo_mn, Rm, Rn);
11033       ldr(Rm, pre(Pm, wordSize));
11034       ldr(Rn, pre(Pn, -wordSize));
11035       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11036     }
11037 
11038     void post1() {
11039       block_comment("post1");
11040 
11041       // MACC(Ra, Rb, t0, t1, t2);
11042       // Ra = *++Pa;
11043       // Rb = *--Pb;
11044       umulh(Rhi_ab, Ra, Rb);
11045       mul(Rlo_ab, Ra, Rb);
11046       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11047       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11048 
11049       // *Pm = Rm = t0 * inv;
11050       mul(Rm, t0, inv);
11051       str(Rm, Address(Pm));
11052 
11053       // MACC(Rm, Rn, t0, t1, t2);
11054       // t0 = t1; t1 = t2; t2 = 0;
11055       umulh(Rhi_mn, Rm, Rn);
11056 
11057 #ifndef PRODUCT
11058       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11059       {
11060         mul(Rlo_mn, Rm, Rn);
11061         add(Rlo_mn, t0, Rlo_mn);
11062         Label ok;
11063         cbz(Rlo_mn, ok); {
11064           stop("broken Montgomery multiply");
11065         } bind(ok);
11066       }
11067 #endif
11068       // We have very carefully set things up so that
11069       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11070       // the lower half of Rm * Rn because we know the result already:
11071       // it must be -t0.  t0 + (-t0) must generate a carry iff
11072       // t0 != 0.  So, rather than do a mul and an adds we just set
11073       // the carry flag iff t0 is nonzero.
11074       //
11075       // mul(Rlo_mn, Rm, Rn);
11076       // adds(zr, t0, Rlo_mn);
11077       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11078       adcs(t0, t1, Rhi_mn);
11079       adc(t1, t2, zr);
11080       mov(t2, zr);
11081     }
11082 
11083     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11084       block_comment("pre2");
11085       // Pa = Pa_base + i-len;
11086       // Pb = Pb_base + len;
11087       // Pm = Pm_base + i-len;
11088       // Pn = Pn_base + len;
11089 
11090       if (i.is_register()) {
11091         sub(Rj, i.as_register(), len);
11092       } else {
11093         mov(Rj, i.as_constant());
11094         sub(Rj, Rj, len);
11095       }
11096       // Rj == i-len
11097 
11098       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11099       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11100       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11101       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11102 
11103       // Ra = *++Pa;
11104       // Rb = *--Pb;
11105       // Rm = *++Pm;
11106       // Rn = *--Pn;
11107       ldr(Ra, pre(Pa, wordSize));
11108       ldr(Rb, pre(Pb, -wordSize));
11109       ldr(Rm, pre(Pm, wordSize));
11110       ldr(Rn, pre(Pn, -wordSize));
11111 
11112       mov(Rhi_mn, zr);
11113       mov(Rlo_mn, zr);
11114     }
11115 
11116     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11117       block_comment("post2");
11118       if (i.is_constant()) {
11119         mov(Rj, i.as_constant()-len.as_constant());
11120       } else {
11121         sub(Rj, i.as_register(), len);
11122       }
11123 
11124       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11125 
11126       // As soon as we know the least significant digit of our result,
11127       // store it.
11128       // Pm_base[i-len] = t0;
11129       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11130 
11131       // t0 = t1; t1 = t2; t2 = 0;
11132       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11133       adc(t1, t2, zr);
11134       mov(t2, zr);
11135     }
11136 
11137     // A carry in t0 after Montgomery multiplication means that we
11138     // should subtract multiples of n from our result in m.  We'll
11139     // keep doing that until there is no carry.
11140     void normalize(RegisterOrConstant len) {
11141       block_comment("normalize");
11142       // while (t0)
11143       //   t0 = sub(Pm_base, Pn_base, t0, len);
11144       Label loop, post, again;
11145       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11146       cbz(t0, post); {
11147         bind(again); {
11148           mov(i, zr);
11149           mov(cnt, len);
11150           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11151           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11152           subs(zr, zr, zr); // set carry flag, i.e. no borrow
11153           align(16);
11154           bind(loop); {
11155             sbcs(Rm, Rm, Rn);
11156             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11157             add(i, i, 1);
11158             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11159             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11160             sub(cnt, cnt, 1);
11161           } cbnz(cnt, loop);
11162           sbc(t0, t0, zr);
11163         } cbnz(t0, again);
11164       } bind(post);
11165     }
11166 
11167     // Move memory at s to d, reversing words.
11168     //    Increments d to end of copied memory
11169     //    Destroys tmp1, tmp2
11170     //    Preserves len
11171     //    Leaves s pointing to the address which was in d at start
11172     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11173       assert(tmp1->encoding() < r19->encoding(), "register corruption");
11174       assert(tmp2->encoding() < r19->encoding(), "register corruption");
11175 
11176       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11177       mov(tmp1, len);
11178       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11179       sub(s, d, len, ext::uxtw, LogBytesPerWord);
11180     }
11181     // where
11182     void reverse1(Register d, Register s, Register tmp) {
11183       ldr(tmp, pre(s, -wordSize));
11184       ror(tmp, tmp, 32);
11185       str(tmp, post(d, wordSize));
11186     }
11187 
11188     void step_squaring() {
11189       // An extra ACC
11190       step();
11191       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11192     }
11193 
11194     void last_squaring(RegisterOrConstant i) {
11195       Label dont;
11196       // if ((i & 1) == 0) {
11197       tbnz(i.as_register(), 0, dont); {
11198         // MACC(Ra, Rb, t0, t1, t2);
11199         // Ra = *++Pa;
11200         // Rb = *--Pb;
11201         umulh(Rhi_ab, Ra, Rb);
11202         mul(Rlo_ab, Ra, Rb);
11203         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11204       } bind(dont);
11205     }
11206 
11207     void extra_step_squaring() {
11208       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11209 
11210       // MACC(Rm, Rn, t0, t1, t2);
11211       // Rm = *++Pm;
11212       // Rn = *--Pn;
11213       umulh(Rhi_mn, Rm, Rn);
11214       mul(Rlo_mn, Rm, Rn);
11215       ldr(Rm, pre(Pm, wordSize));
11216       ldr(Rn, pre(Pn, -wordSize));
11217     }
11218 
11219     void post1_squaring() {
11220       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11221 
11222       // *Pm = Rm = t0 * inv;
11223       mul(Rm, t0, inv);
11224       str(Rm, Address(Pm));
11225 
11226       // MACC(Rm, Rn, t0, t1, t2);
11227       // t0 = t1; t1 = t2; t2 = 0;
11228       umulh(Rhi_mn, Rm, Rn);
11229 
11230 #ifndef PRODUCT
11231       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11232       {
11233         mul(Rlo_mn, Rm, Rn);
11234         add(Rlo_mn, t0, Rlo_mn);
11235         Label ok;
11236         cbz(Rlo_mn, ok); {
11237           stop("broken Montgomery multiply");
11238         } bind(ok);
11239       }
11240 #endif
11241       // We have very carefully set things up so that
11242       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11243       // the lower half of Rm * Rn because we know the result already:
11244       // it must be -t0.  t0 + (-t0) must generate a carry iff
11245       // t0 != 0.  So, rather than do a mul and an adds we just set
11246       // the carry flag iff t0 is nonzero.
11247       //
11248       // mul(Rlo_mn, Rm, Rn);
11249       // adds(zr, t0, Rlo_mn);
11250       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11251       adcs(t0, t1, Rhi_mn);
11252       adc(t1, t2, zr);
11253       mov(t2, zr);
11254     }
11255 
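    // t2:t1:t0 += Rhi:Rlo -- fold a 128-bit product into the
    // triple-precision accumulator.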
11256     void acc(Register Rhi, Register Rlo,
11257              Register t0, Register t1, Register t2) {
11258       adds(t0, t0, Rlo);
11259       adcs(t1, t1, Rhi);
11260       adc(t2, t2, zr);
11261     }
11262 
11263   public:
11264     /**
11265      * Fast Montgomery multiplication.  The derivation of the
11266      * algorithm is in A Cryptographic Library for the Motorola
11267      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11268      *
11269      * Arguments:
11270      *
11271      * Inputs for multiplication:
11272      *   c_rarg0   - int array elements a
11273      *   c_rarg1   - int array elements b
11274      *   c_rarg2   - int array elements n (the modulus)
11275      *   c_rarg3   - int length
11276      *   c_rarg4   - int inv
11277      *   c_rarg5   - int array elements m (the result)
11278      *
11279      * Inputs for squaring:
11280      *   c_rarg0   - int array elements a
11281      *   c_rarg1   - int array elements n (the modulus)
11282      *   c_rarg2   - int length
11283      *   c_rarg3   - int inv
11284      *   c_rarg4   - int array elements m (the result)
11285      *
11286      */
11287     address generate_multiply() {
11288       Label argh, nothing;
11289       bind(argh);
11290       stop("MontgomeryMultiply total_allocation must be <= 8192");
11291 
11292       align(CodeEntryAlignment);
11293       address entry = pc();
11294 
11295       cbzw(Rlen, nothing);
11296 
11297       enter();
11298 
11299       // Make room.
11300       cmpw(Rlen, 512);
11301       br(Assembler::HI, argh);
11302       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11303       andr(sp, Ra, -2 * wordSize);
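      // In effect: sp = align_down(sp - Rlen * 4 * sizeof(jint), 2 * wordSize),
      // i.e. room for reversed copies of a, b and n plus the result m.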
11304 
11305       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11306 
11307       {
11308         // Copy input args, reversing as we go.  We use Ra as a
11309         // temporary variable.
11310         reverse(Ra, Pa_base, Rlen, t0, t1);
11311         if (!_squaring)
11312           reverse(Ra, Pb_base, Rlen, t0, t1);
11313         reverse(Ra, Pn_base, Rlen, t0, t1);
11314       }
11315 
11316       // Push all call-saved registers and also Pm_base which we'll need
11317       // at the end.
11318       save_regs();
11319 
11320 #ifndef PRODUCT
11321       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11322       {
11323         ldr(Rn, Address(Pn_base, 0));
11324         mul(Rlo_mn, Rn, inv);
11325         subs(zr, Rlo_mn, -1);
11326         Label ok;
11327         br(EQ, ok); {
11328           stop("broken inverse in Montgomery multiply");
11329         } bind(ok);
11330       }
11331 #endif
11332 
11333       mov(Pm_base, Ra);
11334 
11335       mov(t0, zr);
11336       mov(t1, zr);
11337       mov(t2, zr);
11338 
11339       block_comment("for (int i = 0; i < len; i++) {");
11340       mov(Ri, zr); {
11341         Label loop, end;
11342         cmpw(Ri, Rlen);
11343         br(Assembler::GE, end);
11344 
11345         bind(loop);
11346         pre1(Ri);
11347 
11348         block_comment("  for (j = i; j; j--) {"); {
11349           movw(Rj, Ri);
11350           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11351         } block_comment("  } // j");
11352 
11353         post1();
11354         addw(Ri, Ri, 1);
11355         cmpw(Ri, Rlen);
11356         br(Assembler::LT, loop);
11357         bind(end);
11358         block_comment("} // i");
11359       }
11360 
11361       block_comment("for (int i = len; i < 2*len; i++) {");
11362       mov(Ri, Rlen); {
11363         Label loop, end;
11364         cmpw(Ri, Rlen, Assembler::LSL, 1);
11365         br(Assembler::GE, end);
11366 
11367         bind(loop);
11368         pre2(Ri, Rlen);
11369 
11370         block_comment("  for (j = len*2-i-1; j; j--) {"); {
11371           lslw(Rj, Rlen, 1);
11372           subw(Rj, Rj, Ri);
11373           subw(Rj, Rj, 1);
11374           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11375         } block_comment("  } // j");
11376 
11377         post2(Ri, Rlen);
11378         addw(Ri, Ri, 1);
11379         cmpw(Ri, Rlen, Assembler::LSL, 1);
11380         br(Assembler::LT, loop);
11381         bind(end);
11382       }
11383       block_comment("} // i");
11384 
11385       normalize(Rlen);
11386 
11387       mov(Ra, Pm_base);  // Save Pm_base in Ra
11388       restore_regs();  // Restore caller's Pm_base
11389 
11390       // Copy our result into caller's Pm_base
11391       reverse(Pm_base, Ra, Rlen, t0, t1);
11392 
11393       leave();
11394       bind(nothing);
11395       ret(lr);
11396 
11397       return entry;
11398     }
11399     // In C, approximately:
11400 
11401     // void
11402     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11403     //                     julong Pn_base[], julong Pm_base[],
11404     //                     julong inv, int len) {
11405     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11406     //   julong *Pa, *Pb, *Pn, *Pm;
11407     //   julong Ra, Rb, Rn, Rm;
11408 
11409     //   int i;
11410 
11411     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11412 
11413     //   for (i = 0; i < len; i++) {
11414     //     int j;
11415 
11416     //     Pa = Pa_base;
11417     //     Pb = Pb_base + i;
11418     //     Pm = Pm_base;
11419     //     Pn = Pn_base + i;
11420 
11421     //     Ra = *Pa;
11422     //     Rb = *Pb;
11423     //     Rm = *Pm;
11424     //     Rn = *Pn;
11425 
11426     //     int iters = i;
11427     //     for (j = 0; iters--; j++) {
11428     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11429     //       MACC(Ra, Rb, t0, t1, t2);
11430     //       Ra = *++Pa;
11431     //       Rb = *--Pb;
11432     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11433     //       MACC(Rm, Rn, t0, t1, t2);
11434     //       Rm = *++Pm;
11435     //       Rn = *--Pn;
11436     //     }
11437 
11438     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11439     //     MACC(Ra, Rb, t0, t1, t2);
11440     //     *Pm = Rm = t0 * inv;
11441     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11442     //     MACC(Rm, Rn, t0, t1, t2);
11443 
11444     //     assert(t0 == 0, "broken Montgomery multiply");
11445 
11446     //     t0 = t1; t1 = t2; t2 = 0;
11447     //   }
11448 
11449     //   for (i = len; i < 2*len; i++) {
11450     //     int j;
11451 
11452     //     Pa = Pa_base + i-len;
11453     //     Pb = Pb_base + len;
11454     //     Pm = Pm_base + i-len;
11455     //     Pn = Pn_base + len;
11456 
11457     //     Ra = *++Pa;
11458     //     Rb = *--Pb;
11459     //     Rm = *++Pm;
11460     //     Rn = *--Pn;
11461 
11462     //     int iters = len*2-i-1;
11463     //     for (j = i-len+1; iters--; j++) {
11464     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11465     //       MACC(Ra, Rb, t0, t1, t2);
11466     //       Ra = *++Pa;
11467     //       Rb = *--Pb;
11468     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11469     //       MACC(Rm, Rn, t0, t1, t2);
11470     //       Rm = *++Pm;
11471     //       Rn = *--Pn;
11472     //     }
11473 
11474     //     Pm_base[i-len] = t0;
11475     //     t0 = t1; t1 = t2; t2 = 0;
11476     //   }
11477 
11478     //   while (t0)
11479     //     t0 = sub(Pm_base, Pn_base, t0, len);
11480     // }
11481 
11482     /**
11483      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
11484      * multiplies than Montgomery multiplication so it should be up to
11485      * 25% faster.  However, its loop control is more complex and it
11486      * may actually run slower on some machines.
11487      *
11488      * Arguments:
11489      *
11490      * Inputs:
11491      *   c_rarg0   - int array elements a
11492      *   c_rarg1   - int array elements n (the modulus)
11493      *   c_rarg2   - int length
11494      *   c_rarg3   - int inv
11495      *   c_rarg4   - int array elements m (the result)
11496      *
11497      */
11498     address generate_square() {
11499       Label argh;
11500       bind(argh);
11501       stop("MontgomeryMultiply total_allocation must be <= 8192");
11502 
11503       align(CodeEntryAlignment);
11504       address entry = pc();
11505 
11506       enter();
11507 
11508       // Make room.
11509       cmpw(Rlen, 512);
11510       br(Assembler::HI, argh);
11511       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11512       andr(sp, Ra, -2 * wordSize);
11513 
11514       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11515 
11516       {
11517         // Copy input args, reversing as we go.  We use Ra as a
11518         // temporary variable.
11519         reverse(Ra, Pa_base, Rlen, t0, t1);
11520         reverse(Ra, Pn_base, Rlen, t0, t1);
11521       }
11522 
11523       // Push all call-saved registers and also Pm_base which we'll need
11524       // at the end.
11525       save_regs();
11526 
11527       mov(Pm_base, Ra);
11528 
11529       mov(t0, zr);
11530       mov(t1, zr);
11531       mov(t2, zr);
11532 
11533       block_comment("for (int i = 0; i < len; i++) {");
11534       mov(Ri, zr); {
11535         Label loop, end;
11536         bind(loop);
11537         cmp(Ri, Rlen);
11538         br(Assembler::GE, end);
11539 
11540         pre1(Ri);
11541 
11542         block_comment("for (j = (i+1)/2; j; j--) {"); {
11543           add(Rj, Ri, 1);
11544           lsr(Rj, Rj, 1);
11545           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11546         } block_comment("  } // j");
11547 
11548         last_squaring(Ri);
11549 
11550         block_comment("  for (j = i/2; j; j--) {"); {
11551           lsr(Rj, Ri, 1);
11552           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11553         } block_comment("  } // j");
11554 
11555         post1_squaring();
11556         add(Ri, Ri, 1);
11557         cmp(Ri, Rlen);
11558         br(Assembler::LT, loop);
11559 
11560         bind(end);
11561         block_comment("} // i");
11562       }
11563 
11564       block_comment("for (int i = len; i < 2*len; i++) {");
11565       mov(Ri, Rlen); {
11566         Label loop, end;
11567         bind(loop);
11568         cmp(Ri, Rlen, Assembler::LSL, 1);
11569         br(Assembler::GE, end);
11570 
11571         pre2(Ri, Rlen);
11572 
11573         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11574           lsl(Rj, Rlen, 1);
11575           sub(Rj, Rj, Ri);
11576           sub(Rj, Rj, 1);
11577           lsr(Rj, Rj, 1);
11578           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11579         } block_comment("  } // j");
11580 
11581         last_squaring(Ri);
11582 
11583         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11584           lsl(Rj, Rlen, 1);
11585           sub(Rj, Rj, Ri);
11586           lsr(Rj, Rj, 1);
11587           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11588         } block_comment("  } // j");
11589 
11590         post2(Ri, Rlen);
11591         add(Ri, Ri, 1);
11592         cmp(Ri, Rlen, Assembler::LSL, 1);
11593 
11594         br(Assembler::LT, loop);
11595         bind(end);
11596         block_comment("} // i");
11597       }
11598 
11599       normalize(Rlen);
11600 
11601       mov(Ra, Pm_base);  // Save Pm_base in Ra
11602       restore_regs();  // Restore caller's Pm_base
11603 
11604       // Copy our result into caller's Pm_base
11605       reverse(Pm_base, Ra, Rlen, t0, t1);
11606 
11607       leave();
11608       ret(lr);
11609 
11610       return entry;
11611     }
11612     // In C, approximately:
11613 
11614     // void
11615     // montgomery_square(julong Pa_base[], julong Pn_base[],
11616     //                   julong Pm_base[], julong inv, int len) {
11617     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11618     //   julong *Pa, *Pb, *Pn, *Pm;
11619     //   julong Ra, Rb, Rn, Rm;
11620 
11621     //   int i;
11622 
11623     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11624 
11625     //   for (i = 0; i < len; i++) {
11626     //     int j;
11627 
11628     //     Pa = Pa_base;
11629     //     Pb = Pa_base + i;
11630     //     Pm = Pm_base;
11631     //     Pn = Pn_base + i;
11632 
11633     //     Ra = *Pa;
11634     //     Rb = *Pb;
11635     //     Rm = *Pm;
11636     //     Rn = *Pn;
11637 
11638     //     int iters = (i+1)/2;
11639     //     for (j = 0; iters--; j++) {
11640     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11641     //       MACC2(Ra, Rb, t0, t1, t2);
11642     //       Ra = *++Pa;
11643     //       Rb = *--Pb;
11644     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11645     //       MACC(Rm, Rn, t0, t1, t2);
11646     //       Rm = *++Pm;
11647     //       Rn = *--Pn;
11648     //     }
11649     //     if ((i & 1) == 0) {
11650     //       assert(Ra == Pa_base[j], "must be");
11651     //       MACC(Ra, Ra, t0, t1, t2);
11652     //     }
11653     //     iters = i/2;
11654     //     assert(iters == i-j, "must be");
11655     //     for (; iters--; j++) {
11656     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11657     //       MACC(Rm, Rn, t0, t1, t2);
11658     //       Rm = *++Pm;
11659     //       Rn = *--Pn;
11660     //     }
11661 
11662     //     *Pm = Rm = t0 * inv;
11663     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11664     //     MACC(Rm, Rn, t0, t1, t2);
11665 
11666     //     assert(t0 == 0, "broken Montgomery multiply");
11667 
11668     //     t0 = t1; t1 = t2; t2 = 0;
11669     //   }
11670 
11671     //   for (i = len; i < 2*len; i++) {
11672     //     int start = i-len+1;
11673     //     int end = start + (len - start)/2;
11674     //     int j;
11675 
11676     //     Pa = Pa_base + i-len;
11677     //     Pb = Pa_base + len;
11678     //     Pm = Pm_base + i-len;
11679     //     Pn = Pn_base + len;
11680 
11681     //     Ra = *++Pa;
11682     //     Rb = *--Pb;
11683     //     Rm = *++Pm;
11684     //     Rn = *--Pn;
11685 
11686     //     int iters = (2*len-i-1)/2;
11687     //     assert(iters == end-start, "must be");
11688     //     for (j = start; iters--; j++) {
11689     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11690     //       MACC2(Ra, Rb, t0, t1, t2);
11691     //       Ra = *++Pa;
11692     //       Rb = *--Pb;
11693     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11694     //       MACC(Rm, Rn, t0, t1, t2);
11695     //       Rm = *++Pm;
11696     //       Rn = *--Pn;
11697     //     }
11698     //     if ((i & 1) == 0) {
11699     //       assert(Ra == Pa_base[j], "must be");
11700     //       MACC(Ra, Ra, t0, t1, t2);
11701     //     }
11702     //     iters =  (2*len-i)/2;
11703     //     assert(iters == len-j, "must be");
11704     //     for (; iters--; j++) {
11705     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11706     //       MACC(Rm, Rn, t0, t1, t2);
11707     //       Rm = *++Pm;
11708     //       Rn = *--Pn;
11709     //     }
11710     //     Pm_base[i-len] = t0;
11711     //     t0 = t1; t1 = t2; t2 = 0;
11712     //   }
11713 
11714     //   while (t0)
11715     //     t0 = sub(Pm_base, Pn_base, t0, len);
11716     // }
11717   };
11718 
11719   // Initialization
11720   void generate_preuniverse_stubs() {
11721     // preuniverse stubs are not needed for aarch64
11722   }
11723 
11724   void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.
11732 
11733     StubRoutines::_forward_exception_entry = generate_forward_exception();
11734 
11735     StubRoutines::_call_stub_entry =
11736       generate_call_stub(StubRoutines::_call_stub_return_address);
11737 
11738     // is referenced by megamorphic call
11739     StubRoutines::_catch_exception_entry = generate_catch_exception();
11740 
11741     // Initialize table for copy memory (arraycopy) check.
11742     if (UnsafeMemoryAccess::_table == nullptr) {
11743       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11744     }
11745 
11746     if (UseCRC32Intrinsics) {
11747       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11748     }
11749 
11750     if (UseCRC32CIntrinsics) {
11751       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11752     }
11753 
11754     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11755       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11756     }
11757 
11758     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11759       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11760     }
11761 
11762     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11763         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11764       StubRoutines::_hf2f = generate_float16ToFloat();
11765       StubRoutines::_f2hf = generate_floatToFloat16();
11766     }
11767   }
11768 
11769   void generate_continuation_stubs() {
11770     // Continuation stubs:
11771     StubRoutines::_cont_thaw          = generate_cont_thaw();
11772     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11773     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11774     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11775   }
11776 
11777   void generate_final_stubs() {
11778     // support for verify_oop (must happen after universe_init)
11779     if (VerifyOops) {
11780       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11781     }
11782 
11783     // arraycopy stubs used by compilers
11784     generate_arraycopy_stubs();
11785 
11786     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11787 
11788     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11789 
11790     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11791     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11792 
11793 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11794 
11795     generate_atomic_entry_points();
11796 
11797 #endif // LINUX
11798 
11799 #ifdef COMPILER2
11800     if (UseSecondarySupersTable) {
11801       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11802       if (! InlineSecondarySupersTest) {
11803         generate_lookup_secondary_supers_table_stub();
11804       }
11805     }
11806 #endif
11807 
11808     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11809 
    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11811   }
11812 
11813   void generate_compiler_stubs() {
11814 #if COMPILER2_OR_JVMCI
11815 
11816     if (UseSVE == 0) {
11817       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11818     }
11819 
11820     // array equals stub for large arrays.
11821     if (!UseSimpleArrayEquals) {
11822       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11823     }
11824 
    // arrays_hashcode stub for large arrays.
11826     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11827     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11828     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11829     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11830     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11831 
11832     // byte_array_inflate stub for large arrays.
11833     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11834 
11835     // countPositives stub for large arrays.
11836     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11837 
11838     generate_compare_long_strings();
11839 
11840     generate_string_indexof_stubs();
11841 
11842 #ifdef COMPILER2
11843     if (UseMultiplyToLenIntrinsic) {
11844       StubRoutines::_multiplyToLen = generate_multiplyToLen();
11845     }
11846 
11847     if (UseSquareToLenIntrinsic) {
11848       StubRoutines::_squareToLen = generate_squareToLen();
11849     }
11850 
11851     if (UseMulAddIntrinsic) {
11852       StubRoutines::_mulAdd = generate_mulAdd();
11853     }
11854 
11855     if (UseSIMDForBigIntegerShiftIntrinsics) {
11856       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11857       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
11858     }
11859 
11860     if (UseMontgomeryMultiplyIntrinsic) {
11861       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
11862       StubCodeMark mark(this, stub_id);
11863       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11864       StubRoutines::_montgomeryMultiply = g.generate_multiply();
11865     }
11866 
11867     if (UseMontgomerySquareIntrinsic) {
11868       StubId stub_id = StubId::stubgen_montgomerySquare_id;
11869       StubCodeMark mark(this, stub_id);
11870       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11871       // We use generate_multiply() rather than generate_square()
11872       // because it's faster for the sizes of modulus we care about.
11873       StubRoutines::_montgomerySquare = g.generate_multiply();
11874     }
11875 
11876 #endif // COMPILER2
11877 
11878     if (UseChaCha20Intrinsics) {
11879       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11880     }
11881 
11882     if (UseKyberIntrinsics) {
11883       StubRoutines::_kyberNtt = generate_kyberNtt();
11884       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11885       StubRoutines::_kyberNttMult = generate_kyberNttMult();
11886       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11887       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11888       StubRoutines::_kyber12To16 = generate_kyber12To16();
11889       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11890     }
11891 
11892     if (UseDilithiumIntrinsics) {
11893       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11894       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11895       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11896       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11897       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11898     }
11899 
11900     if (UseBASE64Intrinsics) {
11901         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11902         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11903     }
11904 
11905     // data cache line writeback
11906     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11907     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11908 
11909     if (UseAESIntrinsics) {
11910       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11911       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11912       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11913       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11914       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11915     }
11916     if (UseGHASHIntrinsics) {
11917       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11918       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11919     }
11920     if (UseAESIntrinsics && UseGHASHIntrinsics) {
11921       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11922     }
11923 
11924     if (UseMD5Intrinsics) {
11925       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
11926       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
11927     }
11928     if (UseSHA1Intrinsics) {
11929       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
11930       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
11931     }
11932     if (UseSHA256Intrinsics) {
11933       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
11934       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
11935     }
11936     if (UseSHA512Intrinsics) {
11937       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
11938       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
11939     }
11940     if (UseSHA3Intrinsics) {
11941 
11942       StubRoutines::_double_keccak         = generate_double_keccak();
11943       if (UseSIMDForSHA3Intrinsic) {
11944          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
11945          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
11946       } else {
11947          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
11948          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
11949       }
11950     }
11951 
11952     if (UsePoly1305Intrinsics) {
11953       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11954     }
11955 
11956     // generate Adler32 intrinsics code
11957     if (UseAdler32Intrinsics) {
11958       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11959     }
11960 
11961 #endif // COMPILER2_OR_JVMCI
11962   }
11963 
11964  public:
11965   StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
11966     switch(blob_id) {
11967     case BlobId::stubgen_preuniverse_id:
11968       generate_preuniverse_stubs();
11969       break;
11970     case BlobId::stubgen_initial_id:
11971       generate_initial_stubs();
11972       break;
    case BlobId::stubgen_continuation_id:
11974       generate_continuation_stubs();
11975       break;
11976     case BlobId::stubgen_compiler_id:
11977       generate_compiler_stubs();
11978       break;
11979     case BlobId::stubgen_final_id:
11980       generate_final_stubs();
11981       break;
11982     default:
11983       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
11984       break;
11985     };
11986   }
11987 }; // end class declaration
11988 
11989 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
11990   StubGenerator g(code, blob_id);
11991 }
11992 
11993 
11994 #if defined (LINUX)
11995 
11996 // Define pointers to atomic stubs and initialize them to point to the
11997 // code in atomic_aarch64.S.
11998 
11999 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
12000   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12001     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
12002   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12003     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
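
// For example, DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) expands to:
//
//   extern "C" uint64_t aarch64_atomic_cmpxchg_4_release_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_release_impl
//     = aarch64_atomic_cmpxchg_4_release_default_impl;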
12004 
12005 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12006 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12007 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12008 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12009 DEFAULT_ATOMIC_OP(xchg, 4, )
12010 DEFAULT_ATOMIC_OP(xchg, 8, )
12011 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12012 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12013 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12014 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12015 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12016 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12017 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12018 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12019 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12020 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12021 
12022 #undef DEFAULT_ATOMIC_OP
12023 
12024 #endif // LINUX