1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomicAccess.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
   87 #ifdef PRODUCT
   88 #define inc_counter_np(counter) ((void)0)
   89 #else
   90   void inc_counter_np_(uint& counter) {
   91     __ incrementw(ExternalAddress((address)&counter));
   92   }
   93 #define inc_counter_np(counter) \
   94   BLOCK_COMMENT("inc_counter " #counter); \
   95   inc_counter_np_(counter);
   96 #endif
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
  112   //
  113   // we save r30 (lr) as the return PC at the base of the frame and
   114   // link r29 (fp) below it as the frame pointer, then install sp (r31)
  115   // into fp.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  124   // we don't need to save r8 which C uses as an indirect result location
  125   // return register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
   130   // we don't need to save r16-r18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
   144   // -28 [ saved Floating-point Control Register ] <--- sp_after_call
   145   // -26 [ saved v15            ]
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
  173 
  174   // Call stub stack layout word offsets from fp
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
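         // All offsets above are in words relative to rfp; e.g. the saved thread slot
         // lives at rfp + thread_off * wordSize = rfp - 8 and the saved lr at rfp + 8.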
  199 
  200   address generate_call_stub(address& return_address) {
  201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  203            "adjust this code");
  204 
  205     StubId stub_id = StubId::stubgen_call_stub_id;
  206     StubCodeMark mark(this, stub_id);
  207     address start = __ pc();
  208 
  209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  210 
  211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  213     const Address result        (rfp, result_off         * wordSize);
  214     const Address result_type   (rfp, result_type_off    * wordSize);
  215     const Address method        (rfp, method_off         * wordSize);
  216     const Address entry_point   (rfp, entry_point_off    * wordSize);
  217     const Address parameter_size(rfp, parameter_size_off * wordSize);
  218 
  219     const Address thread        (rfp, thread_off         * wordSize);
  220 
  221     const Address d15_save      (rfp, d15_off * wordSize);
  222     const Address d13_save      (rfp, d13_off * wordSize);
  223     const Address d11_save      (rfp, d11_off * wordSize);
  224     const Address d9_save       (rfp, d9_off * wordSize);
  225 
  226     const Address r28_save      (rfp, r28_off * wordSize);
  227     const Address r26_save      (rfp, r26_off * wordSize);
  228     const Address r24_save      (rfp, r24_off * wordSize);
  229     const Address r22_save      (rfp, r22_off * wordSize);
  230     const Address r20_save      (rfp, r20_off * wordSize);
  231 
  232     // stub code
  233 
  234     address aarch64_entry = __ pc();
  235 
  236     // set up frame and move sp to end of save area
  237     __ enter();
  238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
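           // sp_after_call_off is negative, so this leaves sp at rfp - 28 words,
           // i.e. at the bottom of the register save area laid out above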
  239 
  240     // save register parameters and Java scratch/global registers
  241     // n.b. we save thread even though it gets installed in
  242     // rthread because we want to sanity check rthread later
  243     __ str(c_rarg7,  thread);
  244     __ strw(c_rarg6, parameter_size);
  245     __ stp(c_rarg4, c_rarg5,  entry_point);
  246     __ stp(c_rarg2, c_rarg3,  result_type);
  247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  248 
  249     __ stp(r20, r19,   r20_save);
  250     __ stp(r22, r21,   r22_save);
  251     __ stp(r24, r23,   r24_save);
  252     __ stp(r26, r25,   r26_save);
  253     __ stp(r28, r27,   r28_save);
  254 
  255     __ stpd(v9,  v8,   d9_save);
  256     __ stpd(v11, v10,  d11_save);
  257     __ stpd(v13, v12,  d13_save);
  258     __ stpd(v15, v14,  d15_save);
  259 
  260     __ get_fpcr(rscratch1);
  261     __ str(rscratch1, fpcr_save);
  262     // Set FPCR to the state we need. We do want Round to Nearest. We
  263     // don't want non-IEEE rounding modes or floating-point traps.
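           // (in the FPCR, DN is bit 25, FZ bit 24, RMode bits 23:22, and the
           // trap-enable bits IOE..IXE occupy bits 12:8)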
  264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  266     __ set_fpcr(rscratch1);
  267 
  268     // install Java thread in global register now we have saved
  269     // whatever value it held
  270     __ mov(rthread, c_rarg7);
  271     // And method
  272     __ mov(rmethod, c_rarg3);
  273 
  274     // set up the heapbase register
  275     __ reinit_heapbase();
  276 
  277 #ifdef ASSERT
  278     // make sure we have no pending exceptions
  279     {
  280       Label L;
  281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  282       __ cmp(rscratch1, (u1)NULL_WORD);
  283       __ br(Assembler::EQ, L);
  284       __ stop("StubRoutines::call_stub: entered with pending exception");
  285       __ BIND(L);
  286     }
  287 #endif
  288     // pass parameters if any
  289     __ mov(esp, sp);
  290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  291     __ andr(sp, rscratch1, -2 * wordSize);
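           // the AArch64 ABI requires sp to stay 16-byte aligned, hence the round
           // down to a multiple of 2 * wordSize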
  292 
  293     BLOCK_COMMENT("pass parameters if any");
  294     Label parameters_done;
  295     // parameter count is still in c_rarg6
  296     // and parameter pointer identifying param 1 is in c_rarg5
  297     __ cbzw(c_rarg6, parameters_done);
  298 
  299     address loop = __ pc();
  300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  301     __ subsw(c_rarg6, c_rarg6, 1);
  302     __ push(rscratch1);
  303     __ br(Assembler::GT, loop);
  304 
  305     __ BIND(parameters_done);
  306 
   307     // call Java entry -- passing Method*, and current sp
  308     //      rmethod: Method*
  309     //      r19_sender_sp: sender sp
  310     BLOCK_COMMENT("call Java function");
  311     __ mov(r19_sender_sp, sp);
  312     __ blr(c_rarg4);
  313 
  314     // we do this here because the notify will already have been done
  315     // if we get to the next instruction via an exception
  316     //
  317     // n.b. adding this instruction here affects the calculation of
  318     // whether or not a routine returns to the call stub (used when
  319     // doing stack walks) since the normal test is to check the return
  320     // pc against the address saved below. so we may need to allow for
  321     // this extra instruction in the check.
  322 
  323     // save current address for use by exception handling code
  324 
  325     return_address = __ pc();
  326 
  327     // store result depending on type (everything that is not
  328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  329     // n.b. this assumes Java returns an integral result in r0
  330     // and a floating result in j_farg0
  331     __ ldr(j_rarg2, result);
  332     Label is_long, is_float, is_double, exit;
  333     __ ldr(j_rarg1, result_type);
  334     __ cmp(j_rarg1, (u1)T_OBJECT);
  335     __ br(Assembler::EQ, is_long);
  336     __ cmp(j_rarg1, (u1)T_LONG);
  337     __ br(Assembler::EQ, is_long);
  338     __ cmp(j_rarg1, (u1)T_FLOAT);
  339     __ br(Assembler::EQ, is_float);
  340     __ cmp(j_rarg1, (u1)T_DOUBLE);
  341     __ br(Assembler::EQ, is_double);
  342 
  343     // handle T_INT case
  344     __ strw(r0, Address(j_rarg2));
  345 
  346     __ BIND(exit);
  347 
  348     // pop parameters
  349     __ sub(esp, rfp, -sp_after_call_off * wordSize);
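           // i.e. esp = sp_after_call again, dropping the arguments pushed for the Java call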
  350 
  351 #ifdef ASSERT
  352     // verify that threads correspond
  353     {
  354       Label L, S;
  355       __ ldr(rscratch1, thread);
  356       __ cmp(rthread, rscratch1);
  357       __ br(Assembler::NE, S);
  358       __ get_thread(rscratch1);
  359       __ cmp(rthread, rscratch1);
  360       __ br(Assembler::EQ, L);
  361       __ BIND(S);
  362       __ stop("StubRoutines::call_stub: threads must correspond");
  363       __ BIND(L);
  364     }
  365 #endif
  366 
  367     __ pop_cont_fastpath(rthread);
  368 
  369     // restore callee-save registers
  370     __ ldpd(v15, v14,  d15_save);
  371     __ ldpd(v13, v12,  d13_save);
  372     __ ldpd(v11, v10,  d11_save);
  373     __ ldpd(v9,  v8,   d9_save);
  374 
  375     __ ldp(r28, r27,   r28_save);
  376     __ ldp(r26, r25,   r26_save);
  377     __ ldp(r24, r23,   r24_save);
  378     __ ldp(r22, r21,   r22_save);
  379     __ ldp(r20, r19,   r20_save);
  380 
  381     // restore fpcr
  382     __ ldr(rscratch1,  fpcr_save);
  383     __ set_fpcr(rscratch1);
  384 
  385     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  386     __ ldrw(c_rarg2, result_type);
  387     __ ldr(c_rarg3,  method);
  388     __ ldp(c_rarg4, c_rarg5,  entry_point);
  389     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  390 
  391     // leave frame and return to caller
  392     __ leave();
  393     __ ret(lr);
  394 
  395     // handle return types different from T_INT
  396 
  397     __ BIND(is_long);
  398     __ str(r0, Address(j_rarg2, 0));
  399     __ br(Assembler::AL, exit);
  400 
  401     __ BIND(is_float);
  402     __ strs(j_farg0, Address(j_rarg2, 0));
  403     __ br(Assembler::AL, exit);
  404 
  405     __ BIND(is_double);
  406     __ strd(j_farg0, Address(j_rarg2, 0));
  407     __ br(Assembler::AL, exit);
  408 
  409     return start;
  410   }
  411 
  412   // Return point for a Java call if there's an exception thrown in
  413   // Java code.  The exception is caught and transformed into a
  414   // pending exception stored in JavaThread that can be tested from
  415   // within the VM.
  416   //
  417   // Note: Usually the parameters are removed by the callee. In case
  418   // of an exception crossing an activation frame boundary, that is
   419   // not the case if the callee is compiled code => need to set up
   420   // sp.
  421   //
  422   // r0: exception oop
  423 
  424   address generate_catch_exception() {
  425     StubId stub_id = StubId::stubgen_catch_exception_id;
  426     StubCodeMark mark(this, stub_id);
  427     address start = __ pc();
  428 
  429     // same as in generate_call_stub():
  430     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  431     const Address thread        (rfp, thread_off         * wordSize);
  432 
  433 #ifdef ASSERT
  434     // verify that threads correspond
  435     {
  436       Label L, S;
  437       __ ldr(rscratch1, thread);
  438       __ cmp(rthread, rscratch1);
  439       __ br(Assembler::NE, S);
  440       __ get_thread(rscratch1);
  441       __ cmp(rthread, rscratch1);
  442       __ br(Assembler::EQ, L);
  443       __ bind(S);
  444       __ stop("StubRoutines::catch_exception: threads must correspond");
  445       __ bind(L);
  446     }
  447 #endif
  448 
  449     // set pending exception
  450     __ verify_oop(r0);
  451 
  452     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  453     __ mov(rscratch1, (address)__FILE__);
  454     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  455     __ movw(rscratch1, (int)__LINE__);
  456     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  457 
  458     // complete return to VM
  459     assert(StubRoutines::_call_stub_return_address != nullptr,
  460            "_call_stub_return_address must have been generated before");
  461     __ b(StubRoutines::_call_stub_return_address);
  462 
  463     return start;
  464   }
  465 
  466   // Continuation point for runtime calls returning with a pending
  467   // exception.  The pending exception check happened in the runtime
  468   // or native call stub.  The pending exception in Thread is
  469   // converted into a Java-level exception.
  470   //
  471   // Contract with Java-level exception handlers:
  472   // r0: exception
  473   // r3: throwing pc
  474   //
  475   // NOTE: At entry of this stub, exception-pc must be in LR !!
  476 
  477   // NOTE: this is always used as a jump target within generated code
   478   // so it just needs to be generated code with no prolog
  479 
  480   address generate_forward_exception() {
  481     StubId stub_id = StubId::stubgen_forward_exception_id;
  482     StubCodeMark mark(this, stub_id);
  483     address start = __ pc();
  484 
  485     // Upon entry, LR points to the return address returning into
  486     // Java (interpreted or compiled) code; i.e., the return address
  487     // becomes the throwing pc.
  488     //
  489     // Arguments pushed before the runtime call are still on the stack
  490     // but the exception handler will reset the stack pointer ->
  491     // ignore them.  A potential result in registers can be ignored as
  492     // well.
  493 
  494 #ifdef ASSERT
  495     // make sure this code is only executed if there is a pending exception
  496     {
  497       Label L;
  498       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  499       __ cbnz(rscratch1, L);
  500       __ stop("StubRoutines::forward exception: no pending exception (1)");
  501       __ bind(L);
  502     }
  503 #endif
  504 
  505     // compute exception handler into r19
  506 
  507     // call the VM to find the handler address associated with the
  508     // caller address. pass thread in r0 and caller pc (ret address)
  509     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  510     // the stack.
  511     __ mov(c_rarg1, lr);
  512     // lr will be trashed by the VM call so we move it to R19
  513     // (callee-saved) because we also need to pass it to the handler
  514     // returned by this call.
  515     __ mov(r19, lr);
  516     BLOCK_COMMENT("call exception_handler_for_return_address");
  517     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  518                          SharedRuntime::exception_handler_for_return_address),
  519                     rthread, c_rarg1);
  520     // Reinitialize the ptrue predicate register, in case the external runtime
  521     // call clobbers ptrue reg, as we may return to SVE compiled code.
  522     __ reinitialize_ptrue();
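           // the handler address comes back in r0; it is saved in r19 below and
           // branched to once r0/r3 have been set up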
  523 
  524     // we should not really care that lr is no longer the callee
  525     // address. we saved the value the handler needs in r19 so we can
  526     // just copy it to r3. however, the C2 handler will push its own
   527     // frame and then call into the VM, and the VM code asserts that
  528     // the PC for the frame above the handler belongs to a compiled
  529     // Java method. So, we restore lr here to satisfy that assert.
  530     __ mov(lr, r19);
  531     // setup r0 & r3 & clear pending exception
  532     __ mov(r3, r19);
  533     __ mov(r19, r0);
  534     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  535     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  536 
  537 #ifdef ASSERT
  538     // make sure exception is set
  539     {
  540       Label L;
  541       __ cbnz(r0, L);
  542       __ stop("StubRoutines::forward exception: no pending exception (2)");
  543       __ bind(L);
  544     }
  545 #endif
  546 
  547     // continue at exception handler
  548     // r0: exception
  549     // r3: throwing pc
  550     // r19: exception handler
  551     __ verify_oop(r0);
  552     __ br(r19);
  553 
  554     return start;
  555   }
  556 
  557   // Non-destructive plausibility checks for oops
  558   //
  559   // Arguments:
  560   //    r0: oop to verify
  561   //    rscratch1: error message
  562   //
  563   // Stack after saving c_rarg3:
  564   //    [tos + 0]: saved c_rarg3
  565   //    [tos + 1]: saved c_rarg2
  566   //    [tos + 2]: saved lr
  567   //    [tos + 3]: saved rscratch2
  568   //    [tos + 4]: saved r0
  569   //    [tos + 5]: saved rscratch1
  570   address generate_verify_oop() {
  571     StubId stub_id = StubId::stubgen_verify_oop_id;
  572     StubCodeMark mark(this, stub_id);
  573     address start = __ pc();
  574 
  575     Label exit, error;
  576 
  577     // save c_rarg2 and c_rarg3
  578     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  579 
  580     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  581     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  582     __ ldr(c_rarg3, Address(c_rarg2));
  583     __ add(c_rarg3, c_rarg3, 1);
  584     __ str(c_rarg3, Address(c_rarg2));
  585 
  586     // object is in r0
  587     // make sure object is 'reasonable'
  588     __ cbz(r0, exit); // if obj is null it is OK
  589 
  590     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  591     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
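           // check_oop branches to error if the oop fails the plausibility checks,
           // otherwise control falls through to exit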
  592 
  593     // return if everything seems ok
  594     __ bind(exit);
  595 
  596     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  597     __ ret(lr);
  598 
  599     // handle errors
  600     __ bind(error);
  601     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  602 
  603     __ push(RegSet::range(r0, r29), sp);
  604     // debug(char* msg, int64_t pc, int64_t regs[])
  605     __ mov(c_rarg0, rscratch1);      // pass address of error message
  606     __ mov(c_rarg1, lr);             // pass return address
  607     __ mov(c_rarg2, sp);             // pass address of regs on stack
  608 #ifndef PRODUCT
  609     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  610 #endif
  611     BLOCK_COMMENT("call MacroAssembler::debug");
  612     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  613     __ blr(rscratch1);
  614     __ hlt(0);
  615 
  616     return start;
  617   }
  618 
  619   // Generate indices for iota vector.
  620   address generate_iota_indices(StubId stub_id) {
  621     __ align(CodeEntryAlignment);
  622     StubCodeMark mark(this, stub_id);
  623     address start = __ pc();
  624     // B
  625     __ emit_data64(0x0706050403020100, relocInfo::none);
  626     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  627     // H
  628     __ emit_data64(0x0003000200010000, relocInfo::none);
  629     __ emit_data64(0x0007000600050004, relocInfo::none);
  630     // S
  631     __ emit_data64(0x0000000100000000, relocInfo::none);
  632     __ emit_data64(0x0000000300000002, relocInfo::none);
  633     // D
  634     __ emit_data64(0x0000000000000000, relocInfo::none);
  635     __ emit_data64(0x0000000000000001, relocInfo::none);
  636     // S - FP
  637     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  638     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  639     // D - FP
  640     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  641     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  642     return start;
  643   }
  644 
  645   // The inner part of zero_words().  This is the bulk operation,
  646   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  647   // caller is responsible for zeroing the last few words.
  648   //
  649   // Inputs:
  650   // r10: the HeapWord-aligned base address of an array to zero.
  651   // r11: the count in HeapWords, r11 > 0.
  652   //
  653   // Returns r10 and r11, adjusted for the caller to clear.
  654   // r10: the base address of the tail of words left to clear.
  655   // r11: the number of words in the tail.
  656   //      r11 < MacroAssembler::zero_words_block_size.
  657 
  658   address generate_zero_blocks() {
  659     Label done;
  660     Label base_aligned;
  661 
  662     Register base = r10, cnt = r11;
  663 
  664     __ align(CodeEntryAlignment);
  665     StubId stub_id = StubId::stubgen_zero_blocks_id;
  666     StubCodeMark mark(this, stub_id);
  667     address start = __ pc();
  668 
  669     if (UseBlockZeroing) {
  670       int zva_length = VM_Version::zva_length();
  671 
  672       // Ensure ZVA length can be divided by 16. This is required by
  673       // the subsequent operations.
  674       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  675 
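             // base is HeapWord (8-byte) aligned on entry; if bit 3 is set it is not
             // 16-byte aligned, so zero one leading word to make it so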
  676       __ tbz(base, 3, base_aligned);
  677       __ str(zr, Address(__ post(base, 8)));
  678       __ sub(cnt, cnt, 1);
  679       __ bind(base_aligned);
  680 
  681       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  682       // alignment.
  683       Label small;
  684       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
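             // low_limit is in bytes while cnt is in words, hence the low_limit >> 3 below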
  685       __ subs(rscratch1, cnt, low_limit >> 3);
  686       __ br(Assembler::LT, small);
  687       __ zero_dcache_blocks(base, cnt);
  688       __ bind(small);
  689     }
  690 
  691     {
  692       // Number of stp instructions we'll unroll
  693       const int unroll =
  694         MacroAssembler::zero_words_block_size / 2;
  695       // Clear the remaining blocks.
  696       Label loop;
  697       __ subs(cnt, cnt, unroll * 2);
  698       __ br(Assembler::LT, done);
  699       __ bind(loop);
  700       for (int i = 0; i < unroll; i++)
  701         __ stp(zr, zr, __ post(base, 16));
  702       __ subs(cnt, cnt, unroll * 2);
  703       __ br(Assembler::GE, loop);
  704       __ bind(done);
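             // restore cnt to the number of words still left for the caller to zero
             // (guaranteed < zero_words_block_size)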
  705       __ add(cnt, cnt, unroll * 2);
  706     }
  707 
  708     __ ret(lr);
  709 
  710     return start;
  711   }
  712 
  713 
  714   typedef enum {
  715     copy_forwards = 1,
  716     copy_backwards = -1
  717   } copy_direction;
  718 
  719   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  720   // for arraycopy stubs.
  721   class ArrayCopyBarrierSetHelper : StackObj {
  722     BarrierSetAssembler* _bs_asm;
  723     MacroAssembler* _masm;
  724     DecoratorSet _decorators;
  725     BasicType _type;
  726     Register _gct1;
  727     Register _gct2;
  728     Register _gct3;
  729     FloatRegister _gcvt1;
  730     FloatRegister _gcvt2;
  731     FloatRegister _gcvt3;
  732 
  733   public:
  734     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  735                               DecoratorSet decorators,
  736                               BasicType type,
  737                               Register gct1,
  738                               Register gct2,
  739                               Register gct3,
  740                               FloatRegister gcvt1,
  741                               FloatRegister gcvt2,
  742                               FloatRegister gcvt3)
  743       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  744         _masm(masm),
  745         _decorators(decorators),
  746         _type(type),
  747         _gct1(gct1),
  748         _gct2(gct2),
  749         _gct3(gct3),
  750         _gcvt1(gcvt1),
  751         _gcvt2(gcvt2),
  752         _gcvt3(gcvt3) {
  753     }
  754 
  755     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  756       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  757                             dst1, dst2, src,
  758                             _gct1, _gct2, _gcvt1);
  759     }
  760 
  761     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  762       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  763                              dst, src1, src2,
  764                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  765     }
  766 
  767     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  768       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  769                             dst1, dst2, src,
  770                             _gct1);
  771     }
  772 
  773     void copy_store_at_16(Address dst, Register src1, Register src2) {
  774       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  775                              dst, src1, src2,
  776                              _gct1, _gct2, _gct3);
  777     }
  778 
  779     void copy_load_at_8(Register dst, Address src) {
  780       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  781                             dst, noreg, src,
  782                             _gct1);
  783     }
  784 
  785     void copy_store_at_8(Address dst, Register src) {
  786       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  787                              dst, src, noreg,
  788                              _gct1, _gct2, _gct3);
  789     }
  790   };
  791 
  792   // Bulk copy of blocks of 8 words.
  793   //
  794   // count is a count of words.
  795   //
  796   // Precondition: count >= 8
  797   //
  798   // Postconditions:
  799   //
  800   // The least significant bit of count contains the remaining count
  801   // of words to copy.  The rest of count is trash.
  802   //
  803   // s and d are adjusted to point to the remaining words to copy
  804   //
  805   address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
  806     BasicType type;
  807     copy_direction direction;
  808 
  809     switch (stub_id) {
  810     case StubId::stubgen_copy_byte_f_id:
  811       direction = copy_forwards;
  812       type = T_BYTE;
  813       break;
  814     case StubId::stubgen_copy_byte_b_id:
  815       direction = copy_backwards;
  816       type = T_BYTE;
  817       break;
  818     case StubId::stubgen_copy_oop_f_id:
  819       direction = copy_forwards;
  820       type = T_OBJECT;
  821       break;
  822     case StubId::stubgen_copy_oop_b_id:
  823       direction = copy_backwards;
  824       type = T_OBJECT;
  825       break;
  826     case StubId::stubgen_copy_oop_uninit_f_id:
  827       direction = copy_forwards;
  828       type = T_OBJECT;
  829       break;
  830     case StubId::stubgen_copy_oop_uninit_b_id:
  831       direction = copy_backwards;
  832       type = T_OBJECT;
  833       break;
  834     default:
  835       ShouldNotReachHere();
  836     }
  837 
  838     int unit = wordSize * direction;
  839     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
  840 
  841     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  842       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  843     const Register stride = r14;
  844     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  845     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  846     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  847 
  848     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  849     assert_different_registers(s, d, count, rscratch1, rscratch2);
  850 
  851     Label again, drain;
  852 
  853     __ align(CodeEntryAlignment);
  854 
  855     StubCodeMark mark(this, stub_id);
  856 
  857     address start = __ pc();
  858 
  859     Label unaligned_copy_long;
  860     if (AvoidUnalignedAccesses) {
  861       __ tbnz(d, 3, unaligned_copy_long);
  862     }
  863 
  864     if (direction == copy_forwards) {
  865       __ sub(s, s, bias);
  866       __ sub(d, d, bias);
  867     }
  868 
  869 #ifdef ASSERT
  870     // Make sure we are never given < 8 words
  871     {
  872       Label L;
  873       __ cmp(count, (u1)8);
  874       __ br(Assembler::GE, L);
   875       __ stop("generate_copy_longs called with < 8 words");
  876       __ bind(L);
  877     }
  878 #endif
  879 
  880     // Fill 8 registers
  881     if (UseSIMDForMemoryOps) {
  882       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  883       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  884     } else {
  885       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  886       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  887       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  888       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  889     }
  890 
  891     __ subs(count, count, 16);
  892     __ br(Assembler::LO, drain);
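           // fewer than 8 unread words remain beyond the 8 just loaded, so skip the
           // main loop and go straight to the drain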
  893 
  894     int prefetch = PrefetchCopyIntervalInBytes;
  895     bool use_stride = false;
  896     if (direction == copy_backwards) {
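             // a prefetch offset more negative than -256 cannot be encoded as a prfm
             // immediate, so keep it in a register instead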
  897       use_stride = prefetch > 256;
  898       prefetch = -prefetch;
  899       if (use_stride) __ mov(stride, prefetch);
  900     }
  901 
  902     __ bind(again);
  903 
  904     if (PrefetchCopyIntervalInBytes > 0)
  905       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  906 
  907     if (UseSIMDForMemoryOps) {
  908       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  909       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  910       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  911       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  912     } else {
  913       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  914       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  915       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  916       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  917       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  918       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  919       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  920       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  921     }
  922 
  923     __ subs(count, count, 8);
  924     __ br(Assembler::HS, again);
  925 
  926     // Drain
  927     __ bind(drain);
  928     if (UseSIMDForMemoryOps) {
  929       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  930       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  931     } else {
  932       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  933       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  934       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936     }
  937 
  938     {
  939       Label L1, L2;
  940       __ tbz(count, exact_log2(4), L1);
  941       if (UseSIMDForMemoryOps) {
  942         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  943         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  944       } else {
  945         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  946         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  947         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  948         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  949       }
  950       __ bind(L1);
  951 
  952       if (direction == copy_forwards) {
  953         __ add(s, s, bias);
  954         __ add(d, d, bias);
  955       }
  956 
  957       __ tbz(count, 1, L2);
  958       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  959       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  960       __ bind(L2);
  961     }
  962 
  963     __ ret(lr);
  964 
  965     if (AvoidUnalignedAccesses) {
  966       Label drain, again;
  967       // Register order for storing. Order is different for backward copy.
  968 
  969       __ bind(unaligned_copy_long);
  970 
   971       // source address is even (16-byte) aligned, target only odd (8-byte) aligned
  972       //
  973       // when forward copying word pairs we read long pairs at offsets
  974       // {0, 2, 4, 6} (in long words). when backwards copying we read
  975       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  976       // address by -2 in the forwards case so we can compute the
  977       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  978       // or -1.
  979       //
  980       // when forward copying we need to store 1 word, 3 pairs and
  981       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
   982       // zero offset, we adjust the destination by -1, which means we
  983       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
  984       //
   985       // When backwards copying we need to store 1 word, 3 pairs and
  986       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
  987       // offsets {1, 3, 5, 7, 8} * unit.
  988 
  989       if (direction == copy_forwards) {
  990         __ sub(s, s, 16);
  991         __ sub(d, d, 8);
  992       }
  993 
  994       // Fill 8 registers
  995       //
  996       // for forwards copy s was offset by -16 from the original input
  997       // value of s so the register contents are at these offsets
   998       // relative to the 64 byte block addressed by that original input
  999       // and so on for each successive 64 byte block when s is updated
 1000       //
 1001       // t0 at offset 0,  t1 at offset 8
 1002       // t2 at offset 16, t3 at offset 24
 1003       // t4 at offset 32, t5 at offset 40
 1004       // t6 at offset 48, t7 at offset 56
 1005 
 1006       // for backwards copy s was not offset so the register contents
 1007       // are at these offsets into the preceding 64 byte block
 1008       // relative to that original input and so on for each successive
 1009       // preceding 64 byte block when s is updated. this explains the
 1010       // slightly counter-intuitive looking pattern of register usage
 1011       // in the stp instructions for backwards copy.
 1012       //
 1013       // t0 at offset -16, t1 at offset -8
 1014       // t2 at offset -32, t3 at offset -24
 1015       // t4 at offset -48, t5 at offset -40
 1016       // t6 at offset -64, t7 at offset -56
 1017 
 1018       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1019       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1020       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1021       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1022 
 1023       __ subs(count, count, 16);
 1024       __ br(Assembler::LO, drain);
 1025 
 1026       int prefetch = PrefetchCopyIntervalInBytes;
 1027       bool use_stride = false;
 1028       if (direction == copy_backwards) {
 1029         use_stride = prefetch > 256;
 1030         prefetch = -prefetch;
 1031         if (use_stride) __ mov(stride, prefetch);
 1032       }
 1033 
 1034       __ bind(again);
 1035 
 1036       if (PrefetchCopyIntervalInBytes > 0)
 1037         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1038 
 1039       if (direction == copy_forwards) {
 1040         // allowing for the offset of -8 the store instructions place
  1041         // registers into the target 64 byte block at the following
 1042         // offsets
 1043         //
 1044         // t0 at offset 0
 1045         // t1 at offset 8,  t2 at offset 16
 1046         // t3 at offset 24, t4 at offset 32
 1047         // t5 at offset 40, t6 at offset 48
 1048         // t7 at offset 56
 1049 
 1050         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1051         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1052         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1053         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1054         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1055         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1056         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1057         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1058         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1059       } else {
 1060         // d was not offset when we started so the registers are
  1061         // written into the 64 byte block preceding d with the following
 1062         // offsets
 1063         //
 1064         // t1 at offset -8
 1065         // t3 at offset -24, t0 at offset -16
  1066         // t5 at offset -40, t2 at offset -32
 1067         // t7 at offset -56, t4 at offset -48
 1068         //                   t6 at offset -64
 1069         //
 1070         // note that this matches the offsets previously noted for the
 1071         // loads
 1072 
 1073         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1074         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1075         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1076         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1077         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1078         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1079         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1080         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1081         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1082       }
 1083 
 1084       __ subs(count, count, 8);
 1085       __ br(Assembler::HS, again);
 1086 
 1087       // Drain
 1088       //
 1089       // this uses the same pattern of offsets and register arguments
 1090       // as above
 1091       __ bind(drain);
 1092       if (direction == copy_forwards) {
 1093         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1094         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1095         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1096         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1097         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1098       } else {
 1099         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1100         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1101         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1102         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1103         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1104       }
 1105       // now we need to copy any remaining part block which may
  1106       // include a 4 word subblock and/or a 2 word subblock.
 1107       // bits 2 and 1 in the count are the tell-tale for whether we
 1108       // have each such subblock
 1109       {
 1110         Label L1, L2;
 1111         __ tbz(count, exact_log2(4), L1);
 1112         // this is the same as above but copying only 4 longs hence
 1113         // with only one intervening stp between the str instructions
 1114         // but note that the offsets and registers still follow the
 1115         // same pattern
 1116         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1117         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1118         if (direction == copy_forwards) {
 1119           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1120           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1121           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1122         } else {
 1123           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1124           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1125           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1126         }
 1127         __ bind(L1);
 1128 
 1129         __ tbz(count, 1, L2);
 1130         // this is the same as above but copying only 2 longs hence
 1131         // there is no intervening stp between the str instructions
 1132         // but note that the offset and register patterns are still
 1133         // the same
 1134         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1135         if (direction == copy_forwards) {
 1136           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1137           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1141         }
 1142         __ bind(L2);
 1143 
 1144         // for forwards copy we need to re-adjust the offsets we
  1145         // applied so that s and d follow the last words written
 1146 
 1147         if (direction == copy_forwards) {
 1148           __ add(s, s, 16);
 1149           __ add(d, d, 8);
 1150         }
 1151 
 1152       }
 1153 
 1154       __ ret(lr);
 1155     }
 1156 
 1157     return start;
 1158   }
 1159 
 1160   // Small copy: less than 16 bytes.
 1161   //
 1162   // NB: Ignores all of the bits of count which represent more than 15
 1163   // bytes, so a caller doesn't have to mask them.
 1164 
 1165   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1166     bool is_backwards = step < 0;
 1167     size_t granularity = g_uabs(step);
 1168     int direction = is_backwards ? -1 : 1;
 1169 
 1170     Label Lword, Lint, Lshort, Lbyte;
 1171 
 1172     assert(granularity
 1173            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1174 
 1175     const Register t0 = r3;
 1176     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1177     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1178 
 1179     // ??? I don't know if this bit-test-and-branch is the right thing
 1180     // to do.  It does a lot of jumping, resulting in several
 1181     // mispredicted branches.  It might make more sense to do this
 1182     // with something like Duff's device with a single computed branch.
 1183 
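           // count is in units of granularity bytes; bit (k - log2(granularity)) of
           // count says whether a 2^k-byte chunk remains to be copied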
 1184     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1185     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1186     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1187     __ bind(Lword);
 1188 
 1189     if (granularity <= sizeof (jint)) {
 1190       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1191       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1192       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1193       __ bind(Lint);
 1194     }
 1195 
 1196     if (granularity <= sizeof (jshort)) {
 1197       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1198       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1199       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1200       __ bind(Lshort);
 1201     }
 1202 
 1203     if (granularity <= sizeof (jbyte)) {
 1204       __ tbz(count, 0, Lbyte);
 1205       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1206       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1207       __ bind(Lbyte);
 1208     }
 1209   }
 1210 
 1211   // All-singing all-dancing memory copy.
 1212   //
 1213   // Copy count units of memory from s to d.  The size of a unit is
 1214   // step, which can be positive or negative depending on the direction
 1215   // of copy.  If is_aligned is false, we align the source address.
 1216   //
 1217 
 1218   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1219                    Register s, Register d, Register count, int step) {
 1220     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1221     bool is_backwards = step < 0;
 1222     unsigned int granularity = g_uabs(step);
 1223     const Register t0 = r3, t1 = r4;
 1224 
  1225     // <= 80 (or 96 for SIMD) bytes are copied inline. Direction doesn't matter because we always
 1226     // load all the data before writing anything
 1227     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1228     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1229     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1230     const Register send = r17, dend = r16;
 1231     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1232     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1233     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1234 
 1235     if (PrefetchCopyIntervalInBytes > 0)
 1236       __ prfm(Address(s, 0), PLDL1KEEP);
 1237     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1238     __ br(Assembler::HI, copy_big);
 1239 
 1240     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1241     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
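           // send/dend point just past the last source/destination element; the
           // overlapping accesses below use negative offsets from them to cover the tail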
 1242 
 1243     __ cmp(count, u1(16/granularity));
 1244     __ br(Assembler::LS, copy16);
 1245 
 1246     __ cmp(count, u1(64/granularity));
 1247     __ br(Assembler::HI, copy80);
 1248 
 1249     __ cmp(count, u1(32/granularity));
 1250     __ br(Assembler::LS, copy32);
 1251 
 1252     // 33..64 bytes
 1253     if (UseSIMDForMemoryOps) {
 1254       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1255       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1256       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1257       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1258     } else {
 1259       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1260       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1261       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1262       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1263 
 1264       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1265       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1266       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1267       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1268     }
 1269     __ b(finish);
 1270 
 1271     // 17..32 bytes
 1272     __ bind(copy32);
 1273     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1274     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1275 
 1276     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1277     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1278     __ b(finish);
 1279 
 1280     // 65..80/96 bytes
  1281     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1282     __ bind(copy80);
 1283     if (UseSIMDForMemoryOps) {
 1284       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1285       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1286       // Unaligned pointers can be an issue for copying.
  1287       // The issue is more likely to occur when the granularity of the data
  1288       // is less than 4 (sizeof(jint)) bytes. Pointers for arrays of jint are at least
 1289       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1290       // The most performance drop has been seen for the range 65-80 bytes.
 1291       // For such cases using the pair of ldp/stp instead of the third pair of
 1292       // ldpq/stpq fixes the performance issue.
 1293       if (granularity < sizeof (jint)) {
 1294         Label copy96;
 1295         __ cmp(count, u1(80/granularity));
 1296         __ br(Assembler::HI, copy96);
 1297         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1298 
 1299         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1300         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1301 
 1302         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1303         __ b(finish);
 1304 
 1305         __ bind(copy96);
 1306       }
 1307       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1308 
 1309       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1310       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1311 
 1312       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1313     } else {
 1314       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1315       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1316       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1317       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1318       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1319 
 1320       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1321       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1322       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1323       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1324       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1325     }
 1326     __ b(finish);
 1327 
 1328     // 0..16 bytes
 1329     __ bind(copy16);
 1330     __ cmp(count, u1(8/granularity));
 1331     __ br(Assembler::LO, copy8);
 1332 
 1333     // 8..16 bytes
 1334     bs.copy_load_at_8(t0, Address(s, 0));
 1335     bs.copy_load_at_8(t1, Address(send, -8));
 1336     bs.copy_store_at_8(Address(d, 0), t0);
 1337     bs.copy_store_at_8(Address(dend, -8), t1);
 1338     __ b(finish);
 1339 
 1340     if (granularity < 8) {
 1341       // 4..7 bytes
 1342       __ bind(copy8);
 1343       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1344       __ ldrw(t0, Address(s, 0));
 1345       __ ldrw(t1, Address(send, -4));
 1346       __ strw(t0, Address(d, 0));
 1347       __ strw(t1, Address(dend, -4));
 1348       __ b(finish);
 1349       if (granularity < 4) {
 1350         // 0..3 bytes
 1351         __ bind(copy4);
 1352         __ cbz(count, finish); // get rid of 0 case
 1353         if (granularity == 2) {
 1354           __ ldrh(t0, Address(s, 0));
 1355           __ strh(t0, Address(d, 0));
 1356         } else { // granularity == 1
 1357           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1358           // the first and last byte.
 1359           // Handle the 3 byte case by loading and storing base + count/2
 1360           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
  1361           // This does mean that in the 1 byte case we load/store the same
 1362           // byte 3 times.
 1363           __ lsr(count, count, 1);
 1364           __ ldrb(t0, Address(s, 0));
 1365           __ ldrb(t1, Address(send, -1));
 1366           __ ldrb(t2, Address(s, count));
 1367           __ strb(t0, Address(d, 0));
 1368           __ strb(t1, Address(dend, -1));
 1369           __ strb(t2, Address(d, count));
 1370         }
 1371         __ b(finish);
 1372       }
 1373     }
 1374 
 1375     __ bind(copy_big);
 1376     if (is_backwards) {
 1377       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1378       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1379     }
 1380 
  1381     // Now that we've got the small case out of the way we can align the
 1382     // source address on a 2-word boundary.
 1383 
 1384     // Here we will materialize a count in r15, which is used by copy_memory_small
 1385     // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
 1386     // Up until here, we have used t9, which aliases r15, but from here on, that register
 1387     // can not be used as a temp register, as it contains the count.
 1388 
 1389     Label aligned;
 1390 
 1391     if (is_aligned) {
 1392       // We may have to adjust by 1 word to get s 2-word-aligned.
 1393       __ tbz(s, exact_log2(wordSize), aligned);
 1394       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1395       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1396       __ sub(count, count, wordSize/granularity);
 1397     } else {
 1398       if (is_backwards) {
 1399         __ andr(r15, s, 2 * wordSize - 1);
 1400       } else {
 1401         __ neg(r15, s);
 1402         __ andr(r15, r15, 2 * wordSize - 1);
 1403       }
 1404       // r15 is the byte adjustment needed to align s.
 1405       __ cbz(r15, aligned);
 1406       int shift = exact_log2(granularity);
 1407       if (shift > 0) {
 1408         __ lsr(r15, r15, shift);
 1409       }
 1410       __ sub(count, count, r15);
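      // Illustrative example: a forward jshort copy whose source address ends
      // in 0x6 gives r15 = (-s) & 15 = 10 bytes = 5 elements; those 5 elements
      // are copied by copy_memory_small below, leaving s 16-byte aligned.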
 1411 
 1412 #if 0
 1413       // ?? This code is only correct for a disjoint copy.  It may or
 1414       // may not make sense to use it in that case.
 1415 
 1416       // Copy the first pair; s and d may not be aligned.
 1417       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1418       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1419 
 1420       // Align s and d, adjust count
 1421       if (is_backwards) {
 1422         __ sub(s, s, r15);
 1423         __ sub(d, d, r15);
 1424       } else {
 1425         __ add(s, s, r15);
 1426         __ add(d, d, r15);
 1427       }
 1428 #else
 1429       copy_memory_small(decorators, type, s, d, r15, step);
 1430 #endif
 1431     }
 1432 
 1433     __ bind(aligned);
 1434 
 1435     // s is now 2-word-aligned.
 1436 
    // We have a count of units and some trailing bytes. Adjust the
    // count and do a bulk copy of words. If the shift is zero, use a
    // plain register move instead to benefit from zero-latency moves.
 1440     int shift = exact_log2(wordSize/granularity);
 1441     if (shift > 0) {
 1442       __ lsr(r15, count, shift);
 1443     } else {
 1444       __ mov(r15, count);
 1445     }
 1446     if (direction == copy_forwards) {
 1447       if (type != T_OBJECT) {
 1448         __ bl(StubRoutines::aarch64::copy_byte_f());
 1449       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1450         __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
 1451       } else {
 1452         __ bl(StubRoutines::aarch64::copy_oop_f());
 1453       }
 1454     } else {
 1455       if (type != T_OBJECT) {
 1456         __ bl(StubRoutines::aarch64::copy_byte_b());
 1457       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1458         __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
 1459       } else {
 1460         __ bl(StubRoutines::aarch64::copy_oop_b());
 1461       }
 1462     }
 1463 
 1464     // And the tail.
 1465     copy_memory_small(decorators, type, s, d, count, step);
 1466 
 1467     if (granularity >= 8) __ bind(copy8);
 1468     if (granularity >= 4) __ bind(copy4);
 1469     __ bind(finish);
 1470   }
 1471 
 1472 
 1473   void clobber_registers() {
 1474 #ifdef ASSERT
 1475     RegSet clobbered
 1476       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1477     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1478     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1479     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1480       __ mov(*it, rscratch1);
 1481     }
 1482 #endif
 1483 
 1484   }
 1485 
  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count; clobbers temp, rscratch1 and rscratch2.
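  // Roughly equivalent to:
  //   for (i = 0; i < count; i++)
  //     verify_oop(a[i]);   // decoding the narrow oop first if necessary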
 1488   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1489     Label loop, end;
 1490     __ mov(rscratch1, a);
 1491     __ mov(rscratch2, zr);
 1492     __ bind(loop);
 1493     __ cmp(rscratch2, count);
 1494     __ br(Assembler::HS, end);
 1495     if (size == wordSize) {
 1496       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1497       __ verify_oop(temp);
 1498     } else {
 1499       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1500       __ decode_heap_oop(temp); // calls verify_oop
 1501     }
 1502     __ add(rscratch2, rscratch2, 1);
 1503     __ b(loop);
 1504     __ bind(end);
 1505   }
 1506 
 1507   // Arguments:
 1508   //   stub_id - is used to name the stub and identify all details of
 1509   //             how to perform the copy.
 1510   //
  //   nopush_entry - is assigned to the stub's post push entry point
  //                  unless it is null
 1513   //
 1514   // Inputs:
 1515   //   c_rarg0   - source array address
 1516   //   c_rarg1   - destination array address
 1517   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1518   //
 1519   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1520   // the hardware handle it.  The two dwords within qwords that span
 1521   // cache line boundaries will still be loaded and stored atomically.
 1522   //
 1523   // Side Effects: nopush_entry is set to the (post push) entry point
 1524   //               so it can be used by the corresponding conjoint
 1525   //               copy method
 1526   //
 1527   address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
 1528     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1529     RegSet saved_reg = RegSet::of(s, d, count);
 1530     int size;
 1531     bool aligned;
 1532     bool is_oop;
 1533     bool dest_uninitialized;
 1534     switch (stub_id) {
 1535     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1536       size = sizeof(jbyte);
 1537       aligned = false;
 1538       is_oop = false;
 1539       dest_uninitialized = false;
 1540       break;
 1541     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1542       size = sizeof(jbyte);
 1543       aligned = true;
 1544       is_oop = false;
 1545       dest_uninitialized = false;
 1546       break;
 1547     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1548       size = sizeof(jshort);
 1549       aligned = false;
 1550       is_oop = false;
 1551       dest_uninitialized = false;
 1552       break;
 1553     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1554       size = sizeof(jshort);
 1555       aligned = true;
 1556       is_oop = false;
 1557       dest_uninitialized = false;
 1558       break;
 1559     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1560       size = sizeof(jint);
 1561       aligned = false;
 1562       is_oop = false;
 1563       dest_uninitialized = false;
 1564       break;
 1565     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1566       size = sizeof(jint);
 1567       aligned = true;
 1568       is_oop = false;
 1569       dest_uninitialized = false;
 1570       break;
 1571     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1572       // since this is always aligned we can (should!) use the same
 1573       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1574       ShouldNotReachHere();
 1575       break;
 1576     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1577       size = sizeof(jlong);
 1578       aligned = true;
 1579       is_oop = false;
 1580       dest_uninitialized = false;
 1581       break;
 1582     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1583       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1584       aligned = !UseCompressedOops;
 1585       is_oop = true;
 1586       dest_uninitialized = false;
 1587       break;
 1588     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1589       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1590       aligned = !UseCompressedOops;
 1591       is_oop = true;
 1592       dest_uninitialized = false;
 1593       break;
 1594     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1595       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1596       aligned = !UseCompressedOops;
 1597       is_oop = true;
 1598       dest_uninitialized = true;
 1599       break;
 1600     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1601       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1602       aligned = !UseCompressedOops;
 1603       is_oop = true;
 1604       dest_uninitialized = true;
 1605       break;
 1606     default:
 1607       ShouldNotReachHere();
 1608       break;
 1609     }
 1610 
 1611     __ align(CodeEntryAlignment);
 1612     StubCodeMark mark(this, stub_id);
 1613     address start = __ pc();
 1614     __ enter();
 1615 
 1616     if (nopush_entry != nullptr) {
 1617       *nopush_entry = __ pc();
 1618       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1619       BLOCK_COMMENT("Entry:");
 1620     }
 1621 
 1622     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1623     if (dest_uninitialized) {
 1624       decorators |= IS_DEST_UNINITIALIZED;
 1625     }
 1626     if (aligned) {
 1627       decorators |= ARRAYCOPY_ALIGNED;
 1628     }
 1629 
 1630     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1631     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1632 
 1633     if (is_oop) {
 1634       // save regs before copy_memory
 1635       __ push(RegSet::of(d, count), sp);
 1636     }
 1637     {
 1638       // UnsafeMemoryAccess page error: continue after unsafe access
 1639       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1640       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1641       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1642     }
 1643 
 1644     if (is_oop) {
 1645       __ pop(RegSet::of(d, count), sp);
 1646       if (VerifyOops)
 1647         verify_oop_array(size, d, count, r16);
 1648     }
 1649 
 1650     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1651 
 1652     __ leave();
 1653     __ mov(r0, zr); // return 0
 1654     __ ret(lr);
 1655     return start;
 1656   }
 1657 
 1658   // Arguments:
 1659   //   stub_id - is used to name the stub and identify all details of
 1660   //             how to perform the copy.
 1661   //
  //   nooverlap_target - identifies the (post push) entry for the
 1663   //             corresponding disjoint copy routine which can be
 1664   //             jumped to if the ranges do not actually overlap
 1665   //
  //   nopush_entry - is assigned to the stub's post push entry point
  //                  unless it is null
  //
 1670   // Inputs:
 1671   //   c_rarg0   - source array address
 1672   //   c_rarg1   - destination array address
 1673   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1674   //
 1675   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1676   // the hardware handle it.  The two dwords within qwords that span
 1677   // cache line boundaries will still be loaded and stored atomically.
 1678   //
 1679   // Side Effects:
  //   nopush_entry is set to the (post push) entry point so it can be
  //   used by the generic and unsafe copy stubs
 1682   //
 1683   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
 1684     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1685     RegSet saved_regs = RegSet::of(s, d, count);
 1686     int size;
 1687     bool aligned;
 1688     bool is_oop;
 1689     bool dest_uninitialized;
 1690     switch (stub_id) {
 1691     case StubId::stubgen_jbyte_arraycopy_id:
 1692       size = sizeof(jbyte);
 1693       aligned = false;
 1694       is_oop = false;
 1695       dest_uninitialized = false;
 1696       break;
 1697     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 1698       size = sizeof(jbyte);
 1699       aligned = true;
 1700       is_oop = false;
 1701       dest_uninitialized = false;
 1702       break;
 1703     case StubId::stubgen_jshort_arraycopy_id:
 1704       size = sizeof(jshort);
 1705       aligned = false;
 1706       is_oop = false;
 1707       dest_uninitialized = false;
 1708       break;
 1709     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 1710       size = sizeof(jshort);
 1711       aligned = true;
 1712       is_oop = false;
 1713       dest_uninitialized = false;
 1714       break;
 1715     case StubId::stubgen_jint_arraycopy_id:
 1716       size = sizeof(jint);
 1717       aligned = false;
 1718       is_oop = false;
 1719       dest_uninitialized = false;
 1720       break;
 1721     case StubId::stubgen_arrayof_jint_arraycopy_id:
 1722       size = sizeof(jint);
 1723       aligned = true;
 1724       is_oop = false;
 1725       dest_uninitialized = false;
 1726       break;
 1727     case StubId::stubgen_jlong_arraycopy_id:
 1728       // since this is always aligned we can (should!) use the same
      // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
 1730       ShouldNotReachHere();
 1731       break;
 1732     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 1733       size = sizeof(jlong);
 1734       aligned = true;
 1735       is_oop = false;
 1736       dest_uninitialized = false;
 1737       break;
 1738     case StubId::stubgen_oop_arraycopy_id:
 1739       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1740       aligned = !UseCompressedOops;
 1741       is_oop = true;
 1742       dest_uninitialized = false;
 1743       break;
 1744     case StubId::stubgen_arrayof_oop_arraycopy_id:
 1745       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1746       aligned = !UseCompressedOops;
 1747       is_oop = true;
 1748       dest_uninitialized = false;
 1749       break;
 1750     case StubId::stubgen_oop_arraycopy_uninit_id:
 1751       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1752       aligned = !UseCompressedOops;
 1753       is_oop = true;
 1754       dest_uninitialized = true;
 1755       break;
 1756     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 1757       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1758       aligned = !UseCompressedOops;
 1759       is_oop = true;
 1760       dest_uninitialized = true;
 1761       break;
 1762     default:
 1763       ShouldNotReachHere();
 1764     }
 1765 
 1766     StubCodeMark mark(this, stub_id);
 1767     address start = __ pc();
 1768     __ enter();
 1769 
 1770     if (nopush_entry != nullptr) {
 1771       *nopush_entry = __ pc();
 1772       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1773       BLOCK_COMMENT("Entry:");
 1774     }
 1775 
 1776     // use fwd copy when (d-s) above_equal (count*size)
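    // For example, with s = 0x1000, d = 0x1010 and 0x20 bytes to move,
    // d - s = 0x10 < 0x20, so we fall through to the overlapping (backward)
    // copy; with d = 0x1040 the difference is 0x40 >= 0x20 and we tail-call
    // the disjoint stub. If d precedes s the unsigned subtraction wraps to a
    // large value, so the forward copy is (correctly) chosen as well.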
 1777     Label L_overlapping;
 1778     __ sub(rscratch1, d, s);
 1779     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1780     __ br(Assembler::LO, L_overlapping);
 1781     __ b(RuntimeAddress(nooverlap_target));
 1782     __ bind(L_overlapping);
 1783 
 1784     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1785     if (dest_uninitialized) {
 1786       decorators |= IS_DEST_UNINITIALIZED;
 1787     }
 1788     if (aligned) {
 1789       decorators |= ARRAYCOPY_ALIGNED;
 1790     }
 1791 
 1792     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1793     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1794 
 1795     if (is_oop) {
 1796       // save regs before copy_memory
 1797       __ push(RegSet::of(d, count), sp);
 1798     }
 1799     {
 1800       // UnsafeMemoryAccess page error: continue after unsafe access
 1801       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1802       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1803       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1804     }
 1805     if (is_oop) {
 1806       __ pop(RegSet::of(d, count), sp);
 1807       if (VerifyOops)
 1808         verify_oop_array(size, d, count, r16);
 1809     }
 1810     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
 1811     __ leave();
 1812     __ mov(r0, zr); // return 0
 1813     __ ret(lr);
 1814     return start;
 1815   }
 1816 
 1817   // Helper for generating a dynamic type check.
 1818   // Smashes rscratch1, rscratch2.
 1819   void generate_type_check(Register sub_klass,
 1820                            Register super_check_offset,
 1821                            Register super_klass,
 1822                            Register temp1,
 1823                            Register temp2,
 1824                            Register result,
 1825                            Label& L_success) {
 1826     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1827 
 1828     BLOCK_COMMENT("type_check:");
 1829 
 1830     Label L_miss;
 1831 
 1832     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1833                                      super_check_offset);
 1834     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1835 
 1836     // Fall through on failure!
 1837     __ BIND(L_miss);
 1838   }
 1839 
 1840   //
 1841   //  Generate checkcasting array copy stub
 1842   //
 1843   //  Input:
 1844   //    c_rarg0   - source array address
 1845   //    c_rarg1   - destination array address
 1846   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1847   //    c_rarg3   - size_t ckoff (super_check_offset)
 1848   //    c_rarg4   - oop ckval (super_klass)
 1849   //
 1850   //  Output:
 1851   //    r0 ==  0  -  success
 1852   //    r0 == -1^K - failure, where K is partial transfer count
 1853   //
 1854   address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
 1855     bool dest_uninitialized;
 1856     switch (stub_id) {
 1857     case StubId::stubgen_checkcast_arraycopy_id:
 1858       dest_uninitialized = false;
 1859       break;
 1860     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 1861       dest_uninitialized = true;
 1862       break;
 1863     default:
 1864       ShouldNotReachHere();
 1865     }
 1866 
 1867     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1868 
 1869     // Input registers (after setup_arg_regs)
 1870     const Register from        = c_rarg0;   // source array address
 1871     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
 1873     const Register ckoff       = c_rarg3;   // super_check_offset
 1874     const Register ckval       = c_rarg4;   // super_klass
 1875 
 1876     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1877 
 1878     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1879     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
 1881     const Register start_to    = r20;       // destination array start address
 1882     const Register r19_klass   = r19;       // oop._klass
 1883 
 1884     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1885     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1886 
 1887     //---------------------------------------------------------------
 1888     // Assembler stub will be used for this call to arraycopy
 1889     // if the two arrays are subtypes of Object[] but the
 1890     // destination array type is not equal to or a supertype
 1891     // of the source type.  Each element must be separately
 1892     // checked.
 1893 
 1894     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1895                                copied_oop, r19_klass, count_save);
 1896 
 1897     __ align(CodeEntryAlignment);
 1898     StubCodeMark mark(this, stub_id);
 1899     address start = __ pc();
 1900 
 1901     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1902 
 1903 #ifdef ASSERT
 1904     // caller guarantees that the arrays really are different
 1905     // otherwise, we would have to make conjoint checks
 1906     { Label L;
 1907       __ b(L);                  // conjoint check not yet implemented
 1908       __ stop("checkcast_copy within a single array");
 1909       __ bind(L);
 1910     }
 1911 #endif //ASSERT
 1912 
 1913     // Caller of this entry point must set up the argument registers.
 1914     if (nopush_entry != nullptr) {
 1915       *nopush_entry = __ pc();
 1916       BLOCK_COMMENT("Entry:");
 1917     }
 1918 
 1919      // Empty array:  Nothing to do.
 1920     __ cbz(count, L_done);
 1921     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1922 
 1923 #ifdef ASSERT
 1924     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1925     // The ckoff and ckval must be mutually consistent,
 1926     // even though caller generates both.
 1927     { Label L;
 1928       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1929       __ ldrw(start_to, Address(ckval, sco_offset));
 1930       __ cmpw(ckoff, start_to);
 1931       __ br(Assembler::EQ, L);
 1932       __ stop("super_check_offset inconsistent");
 1933       __ bind(L);
 1934     }
 1935 #endif //ASSERT
 1936 
 1937     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1938     bool is_oop = true;
 1939     int element_size = UseCompressedOops ? 4 : 8;
 1940     if (dest_uninitialized) {
 1941       decorators |= IS_DEST_UNINITIALIZED;
 1942     }
 1943 
 1944     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1945     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1946 
 1947     // save the original count
 1948     __ mov(count_save, count);
 1949 
 1950     // Copy from low to high addresses
 1951     __ mov(start_to, to);              // Save destination array start address
 1952     __ b(L_load_element);
 1953 
 1954     // ======== begin loop ========
 1955     // (Loop is rotated; its entry is L_load_element.)
 1956     // Loop control:
 1957     //   for (; count != 0; count--) {
 1958     //     copied_oop = load_heap_oop(from++);
 1959     //     ... generate_type_check ...;
 1960     //     store_heap_oop(to++, copied_oop);
 1961     //   }
 1962     __ align(OptoLoopAlignment);
 1963 
 1964     __ BIND(L_store_element);
 1965     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1966                       __ post(to, element_size), copied_oop, noreg,
 1967                       gct1, gct2, gct3);
 1968     __ sub(count, count, 1);
 1969     __ cbz(count, L_do_card_marks);
 1970 
 1971     // ======== loop entry is here ========
 1972     __ BIND(L_load_element);
 1973     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1974                      copied_oop, noreg, __ post(from, element_size),
 1975                      gct1);
 1976     __ cbz(copied_oop, L_store_element);
 1977 
 1978     __ load_klass(r19_klass, copied_oop);// query the object klass
 1979 
 1980     BLOCK_COMMENT("type_check:");
 1981     generate_type_check(/*sub_klass*/r19_klass,
 1982                         /*super_check_offset*/ckoff,
 1983                         /*super_klass*/ckval,
 1984                         /*r_array_base*/gct1,
 1985                         /*temp2*/gct2,
 1986                         /*result*/r10, L_store_element);
 1987 
 1988     // Fall through on failure!
 1989 
 1990     // ======== end loop ========
 1991 
 1992     // It was a real error; we must depend on the caller to finish the job.
 1993     // Register count = remaining oops, count_orig = total oops.
 1994     // Emit GC store barriers for the oops we have copied and report
 1995     // their number to the caller.
 1996 
 1997     __ subs(count, count_save, count);     // K = partially copied oop count
 1998     __ eon(count, count, zr);              // report (-1^K) to caller
 1999     __ br(Assembler::EQ, L_done_pop);
 2000 
 2001     __ BIND(L_do_card_marks);
 2002     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
 2003 
 2004     __ bind(L_done_pop);
 2005     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2006     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2007 
 2008     __ bind(L_done);
 2009     __ mov(r0, count);
 2010     __ leave();
 2011     __ ret(lr);
 2012 
 2013     return start;
 2014   }
 2015 
 2016   // Perform range checks on the proposed arraycopy.
 2017   // Kills temp, but nothing else.
 2018   // Also, clean the sign bits of src_pos and dst_pos.
 2019   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2020                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
 2022                               Register dst_pos, // destination position (c_rarg3)
 2023                               Register length,
 2024                               Register temp,
 2025                               Label& L_failed) {
 2026     BLOCK_COMMENT("arraycopy_range_checks:");
 2027 
 2028     assert_different_registers(rscratch1, temp);
 2029 
 2030     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2031     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2032     __ addw(temp, length, src_pos);
 2033     __ cmpw(temp, rscratch1);
 2034     __ br(Assembler::HI, L_failed);
 2035 
 2036     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2037     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2038     __ addw(temp, length, dst_pos);
 2039     __ cmpw(temp, rscratch1);
 2040     __ br(Assembler::HI, L_failed);
 2041 
 2042     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2043     __ movw(src_pos, src_pos);
 2044     __ movw(dst_pos, dst_pos);
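    // (movw writes the 32-bit view of each register, which zeroes bits 63:32.)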
 2045 
 2046     BLOCK_COMMENT("arraycopy_range_checks done");
 2047   }
 2048 
 2049   // These stubs get called from some dumb test routine.
 2050   // I'll write them properly when they're called from
 2051   // something that's actually doing something.
 2052   static void fake_arraycopy_stub(address src, address dst, int count) {
 2053     assert(count == 0, "huh?");
 2054   }
 2055 
 2056 
 2057   //
 2058   //  Generate 'unsafe' array copy stub
 2059   //  Though just as safe as the other stubs, it takes an unscaled
 2060   //  size_t argument instead of an element count.
 2061   //
 2062   //  Input:
 2063   //    c_rarg0   - source array address
 2064   //    c_rarg1   - destination array address
 2065   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2066   //
 2067   // Examines the alignment of the operands and dispatches
 2068   // to a long, int, short, or byte copy loop.
 2069   //
 2070   address generate_unsafe_copy(address byte_copy_entry,
 2071                                address short_copy_entry,
 2072                                address int_copy_entry,
 2073                                address long_copy_entry) {
 2074     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2075 
 2076     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2077     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2078 
 2079     __ align(CodeEntryAlignment);
 2080     StubCodeMark mark(this, stub_id);
 2081     address start = __ pc();
 2082     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2083 
 2084     // bump this on entry, not on exit:
 2085     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2086 
 2087     __ orr(rscratch1, s, d);
 2088     __ orr(rscratch1, rscratch1, count);
 2089 
 2090     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2091     __ cbz(rscratch1, L_long_aligned);
 2092     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2093     __ cbz(rscratch1, L_int_aligned);
 2094     __ tbz(rscratch1, 0, L_short_aligned);
 2095     __ b(RuntimeAddress(byte_copy_entry));
 2096 
 2097     __ BIND(L_short_aligned);
 2098     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2099     __ b(RuntimeAddress(short_copy_entry));
 2100     __ BIND(L_int_aligned);
 2101     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2102     __ b(RuntimeAddress(int_copy_entry));
 2103     __ BIND(L_long_aligned);
 2104     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2105     __ b(RuntimeAddress(long_copy_entry));
 2106 
 2107     return start;
 2108   }
 2109 
 2110   //
 2111   //  Generate generic array copy stubs
 2112   //
 2113   //  Input:
 2114   //    c_rarg0    -  src oop
 2115   //    c_rarg1    -  src_pos (32-bits)
 2116   //    c_rarg2    -  dst oop
 2117   //    c_rarg3    -  dst_pos (32-bits)
 2118   //    c_rarg4    -  element count (32-bits)
 2119   //
 2120   //  Output:
 2121   //    r0 ==  0  -  success
 2122   //    r0 == -1^K - failure, where K is partial transfer count
 2123   //
 2124   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2125                                 address int_copy_entry, address oop_copy_entry,
 2126                                 address long_copy_entry, address checkcast_copy_entry) {
 2127     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2128 
 2129     Label L_failed, L_objArray;
 2130     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2131 
 2132     // Input registers
 2133     const Register src        = c_rarg0;  // source array oop
 2134     const Register src_pos    = c_rarg1;  // source position
 2135     const Register dst        = c_rarg2;  // destination array oop
 2136     const Register dst_pos    = c_rarg3;  // destination position
 2137     const Register length     = c_rarg4;
 2138 
 2139 
 2140     // Registers used as temps
 2141     const Register dst_klass  = c_rarg5;
 2142 
 2143     __ align(CodeEntryAlignment);
 2144 
 2145     StubCodeMark mark(this, stub_id);
 2146 
 2147     address start = __ pc();
 2148 
 2149     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2150 
 2151     // bump this on entry, not on exit:
 2152     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2153 
 2154     //-----------------------------------------------------------------------
 2155     // Assembler stub will be used for this call to arraycopy
 2156     // if the following conditions are met:
 2157     //
 2158     // (1) src and dst must not be null.
 2159     // (2) src_pos must not be negative.
 2160     // (3) dst_pos must not be negative.
 2161     // (4) length  must not be negative.
 2162     // (5) src klass and dst klass should be the same and not null.
 2163     // (6) src and dst should be arrays.
 2164     // (7) src_pos + length must not exceed length of src.
 2165     // (8) dst_pos + length must not exceed length of dst.
 2166     //
 2167 
 2168     //  if (src == nullptr) return -1;
 2169     __ cbz(src, L_failed);
 2170 
 2171     //  if (src_pos < 0) return -1;
 2172     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2173 
 2174     //  if (dst == nullptr) return -1;
 2175     __ cbz(dst, L_failed);
 2176 
 2177     //  if (dst_pos < 0) return -1;
 2178     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2179 
 2180     // registers used as temp
 2181     const Register scratch_length    = r16; // elements count to copy
 2182     const Register scratch_src_klass = r17; // array klass
 2183     const Register lh                = r15; // layout helper
 2184 
 2185     //  if (length < 0) return -1;
 2186     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2187     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2188 
 2189     __ load_klass(scratch_src_klass, src);
 2190 #ifdef ASSERT
 2191     //  assert(src->klass() != nullptr);
 2192     {
 2193       BLOCK_COMMENT("assert klasses not null {");
 2194       Label L1, L2;
 2195       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2196       __ bind(L1);
 2197       __ stop("broken null klass");
 2198       __ bind(L2);
 2199       __ load_klass(rscratch1, dst);
 2200       __ cbz(rscratch1, L1);     // this would be broken also
 2201       BLOCK_COMMENT("} assert klasses not null done");
 2202     }
 2203 #endif
 2204 
 2205     // Load layout helper (32-bits)
 2206     //
 2207     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2208     // 32        30    24            16              8     2                 0
 2209     //
 2210     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2211     //
 2212 
 2213     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2214 
 2215     // Handle objArrays completely differently...
 2216     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2217     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2218     __ movw(rscratch1, objArray_lh);
 2219     __ eorw(rscratch2, lh, rscratch1);
 2220     __ cbzw(rscratch2, L_objArray);
 2221 
 2222     //  if (src->klass() != dst->klass()) return -1;
 2223     __ load_klass(rscratch2, dst);
 2224     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2225     __ cbnz(rscratch2, L_failed);
 2226 
 2227     //  if (!src->is_Array()) return -1;
 2228     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2229 
 2230     // At this point, it is known to be a typeArray (array_tag 0x3).
 2231 #ifdef ASSERT
 2232     {
 2233       BLOCK_COMMENT("assert primitive array {");
 2234       Label L;
 2235       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2236       __ cmpw(lh, rscratch2);
 2237       __ br(Assembler::GE, L);
 2238       __ stop("must be a primitive array");
 2239       __ bind(L);
 2240       BLOCK_COMMENT("} assert primitive array done");
 2241     }
 2242 #endif
 2243 
 2244     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2245                            rscratch2, L_failed);
 2246 
 2247     // TypeArrayKlass
 2248     //
 2249     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2250     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2251     //
 2252 
 2253     const Register rscratch1_offset = rscratch1;    // array offset
 2254     const Register r15_elsize = lh; // element size
 2255 
 2256     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2257            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2258     __ add(src, src, rscratch1_offset);           // src array offset
 2259     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2260     BLOCK_COMMENT("choose copy loop based on element size");
 2261 
 2262     // next registers should be set before the jump to corresponding stub
 2263     const Register from     = c_rarg0;  // source array address
 2264     const Register to       = c_rarg1;  // destination array address
 2265     const Register count    = c_rarg2;  // elements count
 2266 
    // 'from', 'to', 'count' registers should be set in this order
    // since they alias 'src', 'src_pos' and 'dst' respectively.
 2269 
 2270     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2271 
 2272     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2273     // size in bytes).  We do a simple bitwise binary search.
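    // The two low bits of the layout helper are the low bits of
    // log2(element size): 00 -> byte, 01 -> short, 10 -> int, 11 -> long;
    // bit 1 is tested first, then bit 0.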
 2274   __ BIND(L_copy_bytes);
 2275     __ tbnz(r15_elsize, 1, L_copy_ints);
 2276     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2277     __ lea(from, Address(src, src_pos));// src_addr
 2278     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2279     __ movw(count, scratch_length); // length
 2280     __ b(RuntimeAddress(byte_copy_entry));
 2281 
 2282   __ BIND(L_copy_shorts);
 2283     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2284     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2285     __ movw(count, scratch_length); // length
 2286     __ b(RuntimeAddress(short_copy_entry));
 2287 
 2288   __ BIND(L_copy_ints);
 2289     __ tbnz(r15_elsize, 0, L_copy_longs);
 2290     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2291     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2292     __ movw(count, scratch_length); // length
 2293     __ b(RuntimeAddress(int_copy_entry));
 2294 
 2295   __ BIND(L_copy_longs);
 2296 #ifdef ASSERT
 2297     {
 2298       BLOCK_COMMENT("assert long copy {");
 2299       Label L;
 2300       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2301       __ cmpw(r15_elsize, LogBytesPerLong);
 2302       __ br(Assembler::EQ, L);
 2303       __ stop("must be long copy, but elsize is wrong");
 2304       __ bind(L);
 2305       BLOCK_COMMENT("} assert long copy done");
 2306     }
 2307 #endif
 2308     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2309     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2310     __ movw(count, scratch_length); // length
 2311     __ b(RuntimeAddress(long_copy_entry));
 2312 
 2313     // ObjArrayKlass
 2314   __ BIND(L_objArray);
 2315     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2316 
 2317     Label L_plain_copy, L_checkcast_copy;
 2318     //  test array classes for subtyping
 2319     __ load_klass(r15, dst);
 2320     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2321     __ br(Assembler::NE, L_checkcast_copy);
 2322 
 2323     // Identically typed arrays can be copied without element-wise checks.
 2324     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2325                            rscratch2, L_failed);
 2326 
 2327     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2328     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2329     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2330     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2331     __ movw(count, scratch_length); // length
 2332   __ BIND(L_plain_copy);
 2333     __ b(RuntimeAddress(oop_copy_entry));
 2334 
 2335   __ BIND(L_checkcast_copy);
 2336     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2337     {
 2338       // Before looking at dst.length, make sure dst is also an objArray.
 2339       __ ldrw(rscratch1, Address(r15, lh_offset));
 2340       __ movw(rscratch2, objArray_lh);
 2341       __ eorw(rscratch1, rscratch1, rscratch2);
 2342       __ cbnzw(rscratch1, L_failed);
 2343 
 2344       // It is safe to examine both src.length and dst.length.
 2345       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2346                              r15, L_failed);
 2347 
 2348       __ load_klass(dst_klass, dst); // reload
 2349 
 2350       // Marshal the base address arguments now, freeing registers.
 2351       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2352       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2353       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2354       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2355       __ movw(count, length);           // length (reloaded)
 2356       Register sco_temp = c_rarg3;      // this register is free now
 2357       assert_different_registers(from, to, count, sco_temp,
 2358                                  dst_klass, scratch_src_klass);
 2359       // assert_clean_int(count, sco_temp);
 2360 
 2361       // Generate the type check.
 2362       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2363       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2364 
 2365       // Smashes rscratch1, rscratch2
 2366       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2367                           L_plain_copy);
 2368 
 2369       // Fetch destination element klass from the ObjArrayKlass header.
 2370       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2371       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2372       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2373 
 2374       // the checkcast_copy loop needs two extra arguments:
 2375       assert(c_rarg3 == sco_temp, "#3 already in place");
 2376       // Set up arguments for checkcast_copy_entry.
 2377       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2378       __ b(RuntimeAddress(checkcast_copy_entry));
 2379     }
 2380 
 2381   __ BIND(L_failed);
 2382     __ mov(r0, -1);
 2383     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2384     __ ret(lr);
 2385 
 2386     return start;
 2387   }
 2388 
 2389   //
 2390   // Generate stub for array fill. If "aligned" is true, the
 2391   // "to" address is assumed to be heapword aligned.
 2392   //
 2393   // Arguments for generated stub:
 2394   //   to:    c_rarg0
 2395   //   value: c_rarg1
 2396   //   count: c_rarg2 treated as signed
 2397   //
 2398   address generate_fill(StubId stub_id) {
 2399     BasicType t;
 2400     bool aligned;
 2401 
 2402     switch (stub_id) {
 2403     case StubId::stubgen_jbyte_fill_id:
 2404       t = T_BYTE;
 2405       aligned = false;
 2406       break;
 2407     case StubId::stubgen_jshort_fill_id:
 2408       t = T_SHORT;
 2409       aligned = false;
 2410       break;
 2411     case StubId::stubgen_jint_fill_id:
 2412       t = T_INT;
 2413       aligned = false;
 2414       break;
 2415     case StubId::stubgen_arrayof_jbyte_fill_id:
 2416       t = T_BYTE;
 2417       aligned = true;
 2418       break;
 2419     case StubId::stubgen_arrayof_jshort_fill_id:
 2420       t = T_SHORT;
 2421       aligned = true;
 2422       break;
 2423     case StubId::stubgen_arrayof_jint_fill_id:
 2424       t = T_INT;
 2425       aligned = true;
 2426       break;
 2427     default:
 2428       ShouldNotReachHere();
 2429     };
 2430 
 2431     __ align(CodeEntryAlignment);
 2432     StubCodeMark mark(this, stub_id);
 2433     address start = __ pc();
 2434 
 2435     BLOCK_COMMENT("Entry:");
 2436 
 2437     const Register to        = c_rarg0;  // source array address
 2438     const Register value     = c_rarg1;  // value
 2439     const Register count     = c_rarg2;  // elements count
 2440 
 2441     const Register bz_base = r10;        // base for block_zero routine
 2442     const Register cnt_words = r11;      // temp register
 2443 
 2444     __ enter();
 2445 
 2446     Label L_fill_elements, L_exit1;
 2447 
 2448     int shift = -1;
 2449     switch (t) {
 2450       case T_BYTE:
 2451         shift = 0;
 2452         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2453         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2454         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2455         __ br(Assembler::LO, L_fill_elements);
 2456         break;
 2457       case T_SHORT:
 2458         shift = 1;
 2459         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2460         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2461         __ br(Assembler::LO, L_fill_elements);
 2462         break;
 2463       case T_INT:
 2464         shift = 2;
 2465         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2466         __ br(Assembler::LO, L_fill_elements);
 2467         break;
 2468       default: ShouldNotReachHere();
 2469     }
 2470 
 2471     // Align source address at 8 bytes address boundary.
 2472     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2473     if (!aligned) {
 2474       switch (t) {
 2475         case T_BYTE:
 2476           // One byte misalignment happens only for byte arrays.
 2477           __ tbz(to, 0, L_skip_align1);
 2478           __ strb(value, Address(__ post(to, 1)));
 2479           __ subw(count, count, 1);
 2480           __ bind(L_skip_align1);
 2481           // Fallthrough
 2482         case T_SHORT:
          // Two-byte misalignment happens only for byte and short (char) arrays.
 2484           __ tbz(to, 1, L_skip_align2);
 2485           __ strh(value, Address(__ post(to, 2)));
 2486           __ subw(count, count, 2 >> shift);
 2487           __ bind(L_skip_align2);
 2488           // Fallthrough
 2489         case T_INT:
          // Align to 8 bytes; we know we are 4-byte aligned to start.
 2491           __ tbz(to, 2, L_skip_align4);
 2492           __ strw(value, Address(__ post(to, 4)));
 2493           __ subw(count, count, 4 >> shift);
 2494           __ bind(L_skip_align4);
 2495           break;
 2496         default: ShouldNotReachHere();
 2497       }
 2498     }
 2499 
 2500     //
 2501     //  Fill large chunks
 2502     //
 2503     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2504     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2505     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2506     if (UseBlockZeroing) {
 2507       Label non_block_zeroing, rest;
 2508       // If the fill value is zero we can use the fast zero_words().
 2509       __ cbnz(value, non_block_zeroing);
 2510       __ mov(bz_base, to);
 2511       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2512       address tpc = __ zero_words(bz_base, cnt_words);
 2513       if (tpc == nullptr) {
 2514         fatal("CodeCache is full at generate_fill");
 2515       }
 2516       __ b(rest);
 2517       __ bind(non_block_zeroing);
 2518       __ fill_words(to, cnt_words, value);
 2519       __ bind(rest);
 2520     } else {
 2521       __ fill_words(to, cnt_words, value);
 2522     }
 2523 
 2524     // Remaining count is less than 8 bytes. Fill it by a single store.
 2525     // Note that the total length is no less than 8 bytes.
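    // For example, if 5 tail bytes remain in a byte fill, the single 8-byte
    // store below rewrites the last 8 bytes of the range, harmlessly
    // re-covering 3 bytes that the word fill already produced.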
 2526     if (t == T_BYTE || t == T_SHORT) {
 2527       Label L_exit1;
 2528       __ cbzw(count, L_exit1);
 2529       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2530       __ str(value, Address(to, -8));    // overwrite some elements
 2531       __ bind(L_exit1);
 2532       __ leave();
 2533       __ ret(lr);
 2534     }
 2535 
    // Handle fills of less than 8 bytes.
 2537     Label L_fill_2, L_fill_4, L_exit2;
 2538     __ bind(L_fill_elements);
 2539     switch (t) {
 2540       case T_BYTE:
 2541         __ tbz(count, 0, L_fill_2);
 2542         __ strb(value, Address(__ post(to, 1)));
 2543         __ bind(L_fill_2);
 2544         __ tbz(count, 1, L_fill_4);
 2545         __ strh(value, Address(__ post(to, 2)));
 2546         __ bind(L_fill_4);
 2547         __ tbz(count, 2, L_exit2);
 2548         __ strw(value, Address(to));
 2549         break;
 2550       case T_SHORT:
 2551         __ tbz(count, 0, L_fill_4);
 2552         __ strh(value, Address(__ post(to, 2)));
 2553         __ bind(L_fill_4);
 2554         __ tbz(count, 1, L_exit2);
 2555         __ strw(value, Address(to));
 2556         break;
 2557       case T_INT:
 2558         __ cbzw(count, L_exit2);
 2559         __ strw(value, Address(to));
 2560         break;
 2561       default: ShouldNotReachHere();
 2562     }
 2563     __ bind(L_exit2);
 2564     __ leave();
 2565     __ ret(lr);
 2566     return start;
 2567   }
 2568 
 2569   address generate_unsafecopy_common_error_exit() {
    address start_pc = __ pc();
    __ leave();
    __ mov(r0, 0);
    __ ret(lr);
    return start_pc;
 2575   }
 2576 
 2577   //
 2578   //  Generate 'unsafe' set memory stub
 2579   //  Though just as safe as the other stubs, it takes an unscaled
 2580   //  size_t (# bytes) argument instead of an element count.
 2581   //
 2582   //  This fill operation is atomicity preserving: as long as the
 2583   //  address supplied is sufficiently aligned, all writes of up to 64
 2584   //  bits in size are single-copy atomic.
 2585   //
 2586   //  Input:
 2587   //    c_rarg0   - destination array address
 2588   //    c_rarg1   - byte count (size_t)
 2589   //    c_rarg2   - byte value
 2590   //
 2591   address generate_unsafe_setmemory() {
 2592     __ align(CodeEntryAlignment);
 2593     StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
 2594     address start = __ pc();
 2595 
 2596     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 2597     Label tail;
 2598 
 2599     UnsafeMemoryAccessMark umam(this, true, false);
 2600 
 2601     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2602 
 2603     __ dup(v0, __ T16B, value);
 2604 
 2605     if (AvoidUnalignedAccesses) {
 2606       __ cmp(count, (u1)16);
 2607       __ br(__ LO, tail);
 2608 
 2609       __ mov(rscratch1, 16);
 2610       __ andr(rscratch2, dest, 15);
 2611       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 2612       __ strq(v0, Address(dest));
 2613       __ sub(count, count, rscratch1);
 2614       __ add(dest, dest, rscratch1);
 2615     }
 2616 
 2617     __ subs(count, count, (u1)64);
 2618     __ br(__ LO, tail);
 2619     {
 2620       Label again;
 2621       __ bind(again);
 2622       __ stpq(v0, v0, Address(dest));
 2623       __ stpq(v0, v0, Address(dest, 32));
 2624 
 2625       __ subs(count, count, 64);
 2626       __ add(dest, dest, 64);
 2627       __ br(__ HS, again);
 2628     }
 2629 
 2630     __ bind(tail);
 2631     // The count of bytes is off by 64, but we don't need to correct
 2632     // it because we're only going to use the least-significant few
 2633     // count bits from here on.
 2634     // __ add(count, count, 64);
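    // For example, if 36 bytes are genuinely left to fill when we reach the
    // tail, count holds 36 - 64 = -28 here, and -28 & 63 == 36, so the bit
    // tests below store exactly 32 + 4 bytes.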
 2635 
 2636     {
 2637       Label dont;
 2638       __ tbz(count, exact_log2(32), dont);
 2639       __ stpq(v0, v0, __ post(dest, 32));
 2640       __ bind(dont);
 2641     }
 2642     {
 2643       Label dont;
 2644       __ tbz(count, exact_log2(16), dont);
 2645       __ strq(v0, __ post(dest, 16));
 2646       __ bind(dont);
 2647     }
 2648     {
 2649       Label dont;
 2650       __ tbz(count, exact_log2(8), dont);
 2651       __ strd(v0, __ post(dest, 8));
 2652       __ bind(dont);
 2653     }
 2654 
 2655     Label finished;
 2656     __ tst(count, 7);
 2657     __ br(__ EQ, finished);
 2658 
 2659     {
 2660       Label dont;
 2661       __ tbz(count, exact_log2(4), dont);
 2662       __ strs(v0, __ post(dest, 4));
 2663       __ bind(dont);
 2664     }
 2665     {
 2666       Label dont;
 2667       __ tbz(count, exact_log2(2), dont);
 2668       __ bfi(value, value, 8, 8);
 2669       __ strh(value, __ post(dest, 2));
 2670       __ bind(dont);
 2671     }
 2672     {
 2673       Label dont;
 2674       __ tbz(count, exact_log2(1), dont);
 2675       __ strb(value, Address(dest));
 2676       __ bind(dont);
 2677     }
 2678 
 2679     __ bind(finished);
 2680     __ leave();
 2681     __ ret(lr);
 2682 
 2683     return start;
 2684   }
 2685 
 2686   address generate_data_cache_writeback() {
 2687     const Register line        = c_rarg0;  // address of line to write back
 2688 
 2689     __ align(CodeEntryAlignment);
 2690 
 2691     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 2692     StubCodeMark mark(this, stub_id);
 2693 
 2694     address start = __ pc();
 2695     __ enter();
 2696     __ cache_wb(Address(line, 0));
 2697     __ leave();
 2698     __ ret(lr);
 2699 
 2700     return start;
 2701   }
 2702 
 2703   address generate_data_cache_writeback_sync() {
 2704     const Register is_pre     = c_rarg0;  // pre or post sync
 2705 
 2706     __ align(CodeEntryAlignment);
 2707 
 2708     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 2709     StubCodeMark mark(this, stub_id);
 2710 
    // pre wbsync is a no-op
    // post wbsync is implemented as a memory barrier (the AArch64 analogue
    // of an x86 sfence)
 2713 
 2714     Label skip;
 2715     address start = __ pc();
 2716     __ enter();
 2717     __ cbnz(is_pre, skip);
 2718     __ cache_wbsync(false);
 2719     __ bind(skip);
 2720     __ leave();
 2721     __ ret(lr);
 2722 
 2723     return start;
 2724   }
 2725 
 2726   void generate_arraycopy_stubs() {
 2727     // Some copy stubs publish a normal entry and then a 2nd 'fallback'
 2728     // entry immediately following their stack push. This can be used
 2729     // as a post-push branch target for compatible stubs when they
 2730     // identify a special case that can be handled by the fallback
    // stub, e.g. a disjoint copy stub may be used as a special-case
 2732     // fallback for its compatible conjoint copy stub.
 2733     //
    // A nopush entry is always returned in the following local and
 2735     // then published by assigning to the appropriate entry field in
 2736     // class StubRoutines. The entry value is then passed to the
 2737     // generator for the compatible stub. That means the entry must be
 2738     // listed when saving to/restoring from the AOT cache, ensuring
 2739     // that the inter-stub jumps are noted at AOT-cache save and
 2740     // relocated at AOT cache load.
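    //
    // For example, the jbyte conjoint stub below is generated with
    // _jbyte_disjoint_arraycopy_nopush as its nooverlap_target, so when the
    // ranges turn out not to overlap it branches to the disjoint stub's
    // post-push entry, reusing the frame it has already pushed.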
 2741     address nopush_entry;
 2742 
 2743     // generate the common exit first so later stubs can rely on it if
 2744     // they want an UnsafeMemoryAccess exit non-local to the stub
 2745     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 2746     // register the stub as the default exit with class UnsafeMemoryAccess
 2747     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 2748 
    // generate and publish aarch64-specific bulk copy routines first
 2750     // so we can call them from other copy stubs
 2751     StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2752     StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2753 
 2754     StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2755     StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2756 
 2757     StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2758     StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2759 
 2760     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2761 
 2762     //*** jbyte
 2763     // Always need aligned and unaligned versions
 2764     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2765     // disjoint nopush entry is needed by conjoint copy
 2766     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2767     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
 2768     // conjoint nopush entry is needed by generic/unsafe copy
 2769     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
 2770     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2771     // disjoint arrayof nopush entry is needed by conjoint copy
 2772     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2773     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
 2774 
 2775     //*** jshort
 2776     // Always need aligned and unaligned versions
 2777     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
 2778     // disjoint nopush entry is needed by conjoint copy
 2779     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
 2780     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
 2781     // conjoint nopush entry is used by generic/unsafe copy
 2782     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
 2783     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
 2784     // disjoint arrayof nopush entry is needed by conjoint copy
 2785     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
 2786     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
 2787 
 2788     //*** jint
 2789     // Aligned versions
 2790     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
 2791     // disjoint arrayof nopush entry is needed by conjoint copy
 2792     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
 2793     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
 2794     // On 64-bit we need both aligned and unaligned versions of jint arraycopy.
 2795     // jint_arraycopy_nopush always points to the unaligned version
 2796     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
 2797     // disjoint nopush entry is needed by conjoint copy
 2798     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
 2799     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
 2800     // conjoint nopush entry is needed by generic/unsafe copy
 2801     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
 2802 
 2803     //*** jlong
 2804     // It is always aligned
 2805     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
 2806     // disjoint arrayof nopush entry is needed by conjoint copy
 2807     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
 2808     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
 2809     // conjoint nopush entry is needed by generic/unsafe copy
 2810     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
 2811     // disjoint normal/nopush and conjoint normal entries are not
 2812     // generated since the arrayof versions are the same
 2813     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2814     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
 2815     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2816 
 2817     //*** oops
 2818     {
 2819       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2820         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
 2821       // disjoint arrayof nopush entry is needed by conjoint copy
 2822       StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
 2823       StubRoutines::_arrayof_oop_arraycopy
 2824         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
 2825       // conjoint arrayof nopush entry is needed by generic/unsafe copy
 2826       StubRoutines::_oop_arraycopy_nopush = nopush_entry;
 2827       // Aligned versions without pre-barriers
 2828       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2829         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
 2830       // disjoint arrayof+uninit nopush entry is needed by conjoint copy
 2831       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
 2832       // note that we don't need a returned nopush entry because the
 2833       // generic/unsafe copy does not cater for uninit arrays.
 2834       StubRoutines::_arrayof_oop_arraycopy_uninit
 2835         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
 2836     }
 2837 
 2838     // for oop copies reuse arrayof entries for non-arrayof cases
 2839     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2840     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
 2841     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2842     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2843     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
 2844     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2845 
 2846     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
 2847     // checkcast nopush entry is needed by generic copy
 2848     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
 2849     // note that we don't need a returned nopush entry because the
 2850     // generic copy does not cater for uninit arrays.
 2851     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 2852 
 2853     // unsafe arraycopy may fall back on conjoint stubs
 2854     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2855                                                               StubRoutines::_jshort_arraycopy_nopush,
 2856                                                               StubRoutines::_jint_arraycopy_nopush,
 2857                                                               StubRoutines::_jlong_arraycopy_nopush);
 2858 
 2859     // generic arraycopy may fall back on conjoint stubs
 2860     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2861                                                                StubRoutines::_jshort_arraycopy_nopush,
 2862                                                                StubRoutines::_jint_arraycopy_nopush,
 2863                                                                StubRoutines::_oop_arraycopy_nopush,
 2864                                                                StubRoutines::_jlong_arraycopy_nopush,
 2865                                                                StubRoutines::_checkcast_arraycopy_nopush);
 2866 
 2867     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 2868     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 2869     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 2870     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 2871     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 2872     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 2873   }
 2874 
 2875   void generate_math_stubs() { Unimplemented(); }
 2876 
 2877   // Arguments:
 2878   //
 2879   // Inputs:
 2880   //   c_rarg0   - source byte array address
 2881   //   c_rarg1   - destination byte array address
 2882   //   c_rarg2   - sessionKe (key) in little endian int array
 2883   //
 2884   address generate_aescrypt_encryptBlock() {
 2885     __ align(CodeEntryAlignment);
 2886     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 2887     StubCodeMark mark(this, stub_id);
 2888 
 2889     const Register from        = c_rarg0;  // source array address
 2890     const Register to          = c_rarg1;  // destination array address
 2891     const Register key         = c_rarg2;  // key array address
 2892     const Register keylen      = rscratch1;
 2893 
 2894     address start = __ pc();
 2895     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2896 
 2897     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2898 
 2899     __ aesenc_loadkeys(key, keylen);
 2900     __ aesecb_encrypt(from, to, keylen);
 2901 
 2902     __ mov(r0, 0);
 2903 
 2904     __ leave();
 2905     __ ret(lr);
 2906 
 2907     return start;
 2908   }
 2909 
 2910   // Arguments:
 2911   //
 2912   // Inputs:
 2913   //   c_rarg0   - source byte array address
 2914   //   c_rarg1   - destination byte array address
 2915   //   c_rarg2   - sessionKd (key) in little endian int array
 2916   //
 2917   address generate_aescrypt_decryptBlock() {
 2918     assert(UseAES, "need AES cryptographic extension support");
 2919     __ align(CodeEntryAlignment);
 2920     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 2921     StubCodeMark mark(this, stub_id);
 2922     Label L_doLast;
 2923 
 2924     const Register from        = c_rarg0;  // source array address
 2925     const Register to          = c_rarg1;  // destination array address
 2926     const Register key         = c_rarg2;  // key array address
 2927     const Register keylen      = rscratch1;
 2928 
 2929     address start = __ pc();
 2930     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2931 
 2932     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2933 
 2934     __ aesecb_decrypt(from, to, key, keylen);
 2935 
 2936     __ mov(r0, 0);
 2937 
 2938     __ leave();
 2939     __ ret(lr);
 2940 
 2941     return start;
 2942   }
 2943 
 2944   // Arguments:
 2945   //
 2946   // Inputs:
 2947   //   c_rarg0   - source byte array address
 2948   //   c_rarg1   - destination byte array address
 2949   //   c_rarg2   - sessionKe (key) in little endian int array
 2950   //   c_rarg3   - r vector byte array address
 2951   //   c_rarg4   - input length
 2952   //
 2953   // Output:
 2954   //   r0        - input length
 2955   //
 2956   address generate_cipherBlockChaining_encryptAESCrypt() {
 2957     assert(UseAES, "need AES cryptographic extension support");
 2958     __ align(CodeEntryAlignment);
 2959     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 2960     StubCodeMark mark(this, stub_id);
 2961 
 2962     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2963 
 2964     const Register from        = c_rarg0;  // source array address
 2965     const Register to          = c_rarg1;  // destination array address
 2966     const Register key         = c_rarg2;  // key array address
 2967     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 2968                                            // and left with the results of the last encryption block
 2969     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2970     const Register keylen      = rscratch1;
 2971 
 2972     address start = __ pc();
 2973 
 2974       __ enter();
 2975 
 2976       __ movw(rscratch2, len_reg);
 2977 
 2978       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2979 
 2980       __ ld1(v0, __ T16B, rvec);
 2981 
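            // The expanded key schedule holds 4 * (rounds + 1) ints: 44 for
            // AES-128 (10 rounds), 52 for AES-192 (12 rounds) and 60 for
            // AES-256 (14 rounds), so CC (keylen < 52) selects the 128-bit
            // schedule, EQ the 192-bit one, and AES-256 falls through to
            // load the full set of round keys.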
 2982       __ cmpw(keylen, 52);
 2983       __ br(Assembler::CC, L_loadkeys_44);
 2984       __ br(Assembler::EQ, L_loadkeys_52);
 2985 
 2986       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2987       __ rev32(v17, __ T16B, v17);
 2988       __ rev32(v18, __ T16B, v18);
 2989     __ BIND(L_loadkeys_52);
 2990       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2991       __ rev32(v19, __ T16B, v19);
 2992       __ rev32(v20, __ T16B, v20);
 2993     __ BIND(L_loadkeys_44);
 2994       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2995       __ rev32(v21, __ T16B, v21);
 2996       __ rev32(v22, __ T16B, v22);
 2997       __ rev32(v23, __ T16B, v23);
 2998       __ rev32(v24, __ T16B, v24);
 2999       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3000       __ rev32(v25, __ T16B, v25);
 3001       __ rev32(v26, __ T16B, v26);
 3002       __ rev32(v27, __ T16B, v27);
 3003       __ rev32(v28, __ T16B, v28);
 3004       __ ld1(v29, v30, v31, __ T16B, key);
 3005       __ rev32(v29, __ T16B, v29);
 3006       __ rev32(v30, __ T16B, v30);
 3007       __ rev32(v31, __ T16B, v31);
 3008 
 3009     __ BIND(L_aes_loop);
 3010       __ ld1(v1, __ T16B, __ post(from, 16));
 3011       __ eor(v0, __ T16B, v0, v1);
 3012 
 3013       __ br(Assembler::CC, L_rounds_44);
 3014       __ br(Assembler::EQ, L_rounds_52);
 3015 
 3016       __ aese(v0, v17); __ aesmc(v0, v0);
 3017       __ aese(v0, v18); __ aesmc(v0, v0);
 3018     __ BIND(L_rounds_52);
 3019       __ aese(v0, v19); __ aesmc(v0, v0);
 3020       __ aese(v0, v20); __ aesmc(v0, v0);
 3021     __ BIND(L_rounds_44);
 3022       __ aese(v0, v21); __ aesmc(v0, v0);
 3023       __ aese(v0, v22); __ aesmc(v0, v0);
 3024       __ aese(v0, v23); __ aesmc(v0, v0);
 3025       __ aese(v0, v24); __ aesmc(v0, v0);
 3026       __ aese(v0, v25); __ aesmc(v0, v0);
 3027       __ aese(v0, v26); __ aesmc(v0, v0);
 3028       __ aese(v0, v27); __ aesmc(v0, v0);
 3029       __ aese(v0, v28); __ aesmc(v0, v0);
 3030       __ aese(v0, v29); __ aesmc(v0, v0);
 3031       __ aese(v0, v30);
 3032       __ eor(v0, __ T16B, v0, v31);
 3033 
 3034       __ st1(v0, __ T16B, __ post(to, 16));
 3035 
 3036       __ subw(len_reg, len_reg, 16);
 3037       __ cbnzw(len_reg, L_aes_loop);
 3038 
 3039       __ st1(v0, __ T16B, rvec);
 3040 
 3041       __ mov(r0, rscratch2);
 3042 
 3043       __ leave();
 3044       __ ret(lr);
 3045 
 3046       return start;
 3047   }
 3048 
 3049   // Arguments:
 3050   //
 3051   // Inputs:
 3052   //   c_rarg0   - source byte array address
 3053   //   c_rarg1   - destination byte array address
 3054   //   c_rarg2   - sessionKd (key) in little endian int array
 3055   //   c_rarg3   - r vector byte array address
 3056   //   c_rarg4   - input length
 3057   //
 3058   // Output:
 3059   //   r0        - input length
 3060   //
 3061   address generate_cipherBlockChaining_decryptAESCrypt() {
 3062     assert(UseAES, "need AES cryptographic extension support");
 3063     __ align(CodeEntryAlignment);
 3064     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3065     StubCodeMark mark(this, stub_id);
 3066 
 3067     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3068 
 3069     const Register from        = c_rarg0;  // source array address
 3070     const Register to          = c_rarg1;  // destination array address
 3071     const Register key         = c_rarg2;  // key array address
 3072     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 3073                                            // and left with the results of the last encryption block
 3074     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3075     const Register keylen      = rscratch1;
 3076 
 3077     address start = __ pc();
 3078 
 3079       __ enter();
 3080 
 3081       __ movw(rscratch2, len_reg);
 3082 
 3083       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3084 
 3085       __ ld1(v2, __ T16B, rvec);
 3086 
 3087       __ ld1(v31, __ T16B, __ post(key, 16));
 3088       __ rev32(v31, __ T16B, v31);
 3089 
 3090       __ cmpw(keylen, 52);
 3091       __ br(Assembler::CC, L_loadkeys_44);
 3092       __ br(Assembler::EQ, L_loadkeys_52);
 3093 
 3094       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3095       __ rev32(v17, __ T16B, v17);
 3096       __ rev32(v18, __ T16B, v18);
 3097     __ BIND(L_loadkeys_52);
 3098       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3099       __ rev32(v19, __ T16B, v19);
 3100       __ rev32(v20, __ T16B, v20);
 3101     __ BIND(L_loadkeys_44);
 3102       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3103       __ rev32(v21, __ T16B, v21);
 3104       __ rev32(v22, __ T16B, v22);
 3105       __ rev32(v23, __ T16B, v23);
 3106       __ rev32(v24, __ T16B, v24);
 3107       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3108       __ rev32(v25, __ T16B, v25);
 3109       __ rev32(v26, __ T16B, v26);
 3110       __ rev32(v27, __ T16B, v27);
 3111       __ rev32(v28, __ T16B, v28);
 3112       __ ld1(v29, v30, __ T16B, key);
 3113       __ rev32(v29, __ T16B, v29);
 3114       __ rev32(v30, __ T16B, v30);
 3115 
 3116     __ BIND(L_aes_loop);
 3117       __ ld1(v0, __ T16B, __ post(from, 16));
 3118       __ orr(v1, __ T16B, v0, v0);
 3119 
 3120       __ br(Assembler::CC, L_rounds_44);
 3121       __ br(Assembler::EQ, L_rounds_52);
 3122 
 3123       __ aesd(v0, v17); __ aesimc(v0, v0);
 3124       __ aesd(v0, v18); __ aesimc(v0, v0);
 3125     __ BIND(L_rounds_52);
 3126       __ aesd(v0, v19); __ aesimc(v0, v0);
 3127       __ aesd(v0, v20); __ aesimc(v0, v0);
 3128     __ BIND(L_rounds_44);
 3129       __ aesd(v0, v21); __ aesimc(v0, v0);
 3130       __ aesd(v0, v22); __ aesimc(v0, v0);
 3131       __ aesd(v0, v23); __ aesimc(v0, v0);
 3132       __ aesd(v0, v24); __ aesimc(v0, v0);
 3133       __ aesd(v0, v25); __ aesimc(v0, v0);
 3134       __ aesd(v0, v26); __ aesimc(v0, v0);
 3135       __ aesd(v0, v27); __ aesimc(v0, v0);
 3136       __ aesd(v0, v28); __ aesimc(v0, v0);
 3137       __ aesd(v0, v29); __ aesimc(v0, v0);
 3138       __ aesd(v0, v30);
 3139       __ eor(v0, __ T16B, v0, v31);
 3140       __ eor(v0, __ T16B, v0, v2);
 3141 
 3142       __ st1(v0, __ T16B, __ post(to, 16));
 3143       __ orr(v2, __ T16B, v1, v1);
 3144 
 3145       __ subw(len_reg, len_reg, 16);
 3146       __ cbnzw(len_reg, L_aes_loop);
 3147 
 3148       __ st1(v2, __ T16B, rvec);
 3149 
 3150       __ mov(r0, rscratch2);
 3151 
 3152       __ leave();
 3153       __ ret(lr);
 3154 
 3155     return start;
 3156   }
 3157 
 3158   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3159   // Inputs: in (128 bits) is preserved.
 3160   // The least-significant 64-bit word is in the upper dword of each vector.
 3161   // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
 3162   // Output: result
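        // Worked example of the carry propagation (values shown as
        // { MSD, LSD }, with the LSD physically in the upper dword):
        // adding inc = { 0, 1 } to in = { 0x0123, 0xffffffffffffffff }
        // wraps the LSD to 0; cm(HI) then leaves all-ones in that lane,
        // ext swaps it into the MSD position, and subtracting -1 bumps
        // the MSD, giving { 0x0124, 0 }.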
 3163   void be_add_128_64(FloatRegister result, FloatRegister in,
 3164                      FloatRegister inc, FloatRegister tmp) {
 3165     assert_different_registers(result, tmp, inc);
 3166 
 3167     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3168                                            // input
 3169     __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
 3170     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3171                                            // MSD == 0 (must be!) to LSD
 3172     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3173   }
 3174 
 3175   // CTR AES crypt.
 3176   // Arguments:
 3177   //
 3178   // Inputs:
 3179   //   c_rarg0   - source byte array address
 3180   //   c_rarg1   - destination byte array address
 3181   //   c_rarg2   - sessionKe (key) in little endian int array
 3182   //   c_rarg3   - counter vector byte array address
 3183   //   c_rarg4   - input length
 3184   //   c_rarg5   - saved encryptedCounter start
 3185   //   c_rarg6   - saved used length
 3186   //
 3187   // Output:
 3188   //   r0       - input length
 3189   //
 3190   address generate_counterMode_AESCrypt() {
 3191     const Register in = c_rarg0;
 3192     const Register out = c_rarg1;
 3193     const Register key = c_rarg2;
 3194     const Register counter = c_rarg3;
 3195     const Register saved_len = c_rarg4, len = r10;
 3196     const Register saved_encrypted_ctr = c_rarg5;
 3197     const Register used_ptr = c_rarg6, used = r12;
 3198 
 3199     const Register offset = r7;
 3200     const Register keylen = r11;
 3201 
 3202     const unsigned char block_size = 16;
 3203     const int bulk_width = 4;
 3204     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3205     // performance with larger data sizes, but it also means that the
 3206     // fast path isn't used until you have at least 8 blocks, and up
 3207     // to 127 bytes of data will be executed on the slow path. For
 3208     // to 127 bytes of data will be processed on the slow path. For
 3209     // blocks seems like a sensible compromise.
 3210 
 3211     // Algorithm:
 3212     //
 3213     //    if (len == 0) {
 3214     //        goto DONE;
 3215     //    }
 3216     //    int result = len;
 3217     //    do {
 3218     //        if (used >= blockSize) {
 3219     //            if (len >= bulk_width * blockSize) {
 3220     //                CTR_large_block();
 3221     //                if (len == 0)
 3222     //                    goto DONE;
 3223     //            }
 3224     //            for (;;) {
 3225     //                16ByteVector v0 = counter;
 3226     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3227     //                used = 0;
 3228     //                if (len < blockSize)
 3229     //                    break;    /* goto NEXT */
 3230     //                16ByteVector v1 = load16Bytes(in, offset);
 3231     //                v1 = v1 ^ encryptedCounter;
 3232     //                store16Bytes(v1, out, offset);
 3233     //                used = blockSize;
 3234     //                offset += blockSize;
 3235     //                len -= blockSize;
 3236     //                if (len == 0)
 3237     //                    goto DONE;
 3238     //            }
 3239     //        }
 3240     //      NEXT:
 3241     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3242     //        len--;
 3243     //    } while (len != 0);
 3244     //  DONE:
 3245     //    return result;
 3246     //
 3247     // CTR_large_block()
 3248     //    Wide bulk encryption of whole blocks.
 3249 
 3250     __ align(CodeEntryAlignment);
 3251     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3252     StubCodeMark mark(this, stub_id);
 3253     const address start = __ pc();
 3254     __ enter();
 3255 
 3256     Label DONE, CTR_large_block, large_block_return;
 3257     __ ldrw(used, Address(used_ptr));
 3258     __ cbzw(saved_len, DONE);
 3259 
 3260     __ mov(len, saved_len);
 3261     __ mov(offset, 0);
 3262 
 3263     // Compute #rounds for AES based on the length of the key array
 3264     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3265 
 3266     __ aesenc_loadkeys(key, keylen);
 3267 
 3268     {
 3269       Label L_CTR_loop, NEXT;
 3270 
 3271       __ bind(L_CTR_loop);
 3272 
 3273       __ cmp(used, block_size);
 3274       __ br(__ LO, NEXT);
 3275 
 3276       // Maybe we have a lot of data
 3277       __ subsw(rscratch1, len, bulk_width * block_size);
 3278       __ br(__ HS, CTR_large_block);
 3279       __ BIND(large_block_return);
 3280       __ cbzw(len, DONE);
 3281 
 3282       // Setup the counter
 3283       __ movi(v4, __ T4S, 0);
 3284       __ movi(v5, __ T4S, 1);
 3285       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3286 
 3287       // 128-bit big-endian increment
 3288       __ ld1(v0, __ T16B, counter);
 3289       __ rev64(v16, __ T16B, v0);
 3290       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3291       __ rev64(v16, __ T16B, v16);
 3292       __ st1(v16, __ T16B, counter);
 3293       // Previous counter value is in v0
 3294       // v4 contains { 0, 1 }
 3295 
 3296       {
 3297         // We have fewer than bulk_width blocks of data left. Encrypt
 3298         // them one by one until there is less than a full block
 3299         // remaining, being careful to save both the encrypted counter
 3300         // and the counter.
 3301 
 3302         Label inner_loop;
 3303         __ bind(inner_loop);
 3304         // Counter to encrypt is in v0
 3305         __ aesecb_encrypt(noreg, noreg, keylen);
 3306         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3307 
 3308         // Do we have a remaining full block?
 3309 
 3310         __ mov(used, 0);
 3311         __ cmp(len, block_size);
 3312         __ br(__ LO, NEXT);
 3313 
 3314         // Yes, we have a full block
 3315         __ ldrq(v1, Address(in, offset));
 3316         __ eor(v1, __ T16B, v1, v0);
 3317         __ strq(v1, Address(out, offset));
 3318         __ mov(used, block_size);
 3319         __ add(offset, offset, block_size);
 3320 
 3321         __ subw(len, len, block_size);
 3322         __ cbzw(len, DONE);
 3323 
 3324         // Increment the counter, store it back
 3325         __ orr(v0, __ T16B, v16, v16);
 3326         __ rev64(v16, __ T16B, v16);
 3327         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3328         __ rev64(v16, __ T16B, v16);
 3329         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3330 
 3331         __ b(inner_loop);
 3332       }
 3333 
 3334       __ BIND(NEXT);
 3335 
 3336       // Encrypt a single byte, and loop.
 3337       // We expect this to be a rare event.
 3338       __ ldrb(rscratch1, Address(in, offset));
 3339       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3340       __ eor(rscratch1, rscratch1, rscratch2);
 3341       __ strb(rscratch1, Address(out, offset));
 3342       __ add(offset, offset, 1);
 3343       __ add(used, used, 1);
 3344       __ subw(len, len, 1);
 3345       __ cbnzw(len, L_CTR_loop);
 3346     }
 3347 
 3348     __ bind(DONE);
 3349     __ strw(used, Address(used_ptr));
 3350     __ mov(r0, saved_len);
 3351 
 3352     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3353     __ ret(lr);
 3354 
 3355     // Bulk encryption
 3356 
 3357     __ BIND(CTR_large_block);
 3358     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3359 
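          // The low 64 bits of v8..v15 are callee-saved under the AAPCS64 and
          // the bulk loop below clobbers them, so they are spilled here
          // (v12..v15 only when bulk_width == 8) and restored before rejoining
          // the slow path.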
 3360     if (bulk_width == 8) {
 3361       __ sub(sp, sp, 4 * 16);
 3362       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3363     }
 3364     __ sub(sp, sp, 4 * 16);
 3365     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3366     RegSet saved_regs = (RegSet::of(in, out, offset)
 3367                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3368     __ push(saved_regs, sp);
 3369     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3370     __ add(in, in, offset);
 3371     __ add(out, out, offset);
 3372 
 3373     // Keys should already be loaded into the correct registers
 3374 
 3375     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3376     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3377 
 3378     // AES/CTR loop
 3379     {
 3380       Label L_CTR_loop;
 3381       __ BIND(L_CTR_loop);
 3382 
 3383       // Setup the counters
 3384       __ movi(v8, __ T4S, 0);
 3385       __ movi(v9, __ T4S, 1);
 3386       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3387 
 3388       for (int i = 0; i < bulk_width; i++) {
 3389         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3390         __ rev64(v0_ofs, __ T16B, v16);
 3391         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3392       }
 3393 
 3394       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3395 
 3396       // Encrypt the counters
 3397       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3398 
 3399       if (bulk_width == 8) {
 3400         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3401       }
 3402 
 3403       // XOR the encrypted counters with the inputs
 3404       for (int i = 0; i < bulk_width; i++) {
 3405         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3406         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3407         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3408       }
 3409 
 3410       // Write the encrypted data
 3411       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3412       if (bulk_width == 8) {
 3413         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3414       }
 3415 
 3416       __ subw(len, len, 16 * bulk_width);
 3417       __ cbnzw(len, L_CTR_loop);
 3418     }
 3419 
 3420     // Save the counter back where it goes
 3421     __ rev64(v16, __ T16B, v16);
 3422     __ st1(v16, __ T16B, counter);
 3423 
 3424     __ pop(saved_regs, sp);
 3425 
 3426     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3427     if (bulk_width == 8) {
 3428       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3429     }
 3430 
 3431     __ andr(rscratch1, len, -16 * bulk_width);
 3432     __ sub(len, len, rscratch1);
 3433     __ add(offset, offset, rscratch1);
 3434     __ mov(used, 16);
 3435     __ strw(used, Address(used_ptr));
 3436     __ b(large_block_return);
 3437 
 3438     return start;
 3439   }
 3440 
 3441   // Vector AES Galois Counter Mode implementation. Parameters:
 3442   //
 3443   // in = c_rarg0
 3444   // len = c_rarg1
 3445   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3446   // out = c_rarg3
 3447   // key = c_rarg4
 3448   // state = c_rarg5 - GHASH.state
 3449   // subkeyHtbl = c_rarg6 - powers of H
 3450   // counter = c_rarg7 - 16 bytes of CTR
 3451   // return - number of processed bytes
 3452   address generate_galoisCounterMode_AESCrypt() {
 3453     Label ghash_polynomial; // local data generated after code
 3454 
 3455     __ align(CodeEntryAlignment);
 3456     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3457     StubCodeMark mark(this, stub_id);
 3458     address start = __ pc();
 3459     __ enter();
 3460 
 3461     const Register in = c_rarg0;
 3462     const Register len = c_rarg1;
 3463     const Register ct = c_rarg2;
 3464     const Register out = c_rarg3;
 3465     // and updated with the incremented counter in the end
 3466 
 3467     const Register key = c_rarg4;
 3468     const Register state = c_rarg5;
 3469 
 3470     const Register subkeyHtbl = c_rarg6;
 3471 
 3472     const Register counter = c_rarg7;
 3473 
 3474     const Register keylen = r10;
 3475     // Save state before entering routine
 3476     __ sub(sp, sp, 4 * 16);
 3477     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3478     __ sub(sp, sp, 4 * 16);
 3479     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3480 
 3481     // __ andr(len, len, -512);
 3482     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3483     __ str(len, __ pre(sp, -2 * wordSize));
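          // Only whole groups of 8 blocks (128 bytes) are handled here; the
          // truncated length is saved on the stack and returned in r0 as the
          // number of bytes actually processed, leaving any tail to the caller.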
 3484 
 3485     Label DONE;
 3486     __ cbz(len, DONE);
 3487 
 3488     // Compute #rounds for AES based on the length of the key array
 3489     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3490 
 3491     __ aesenc_loadkeys(key, keylen);
 3492     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3493     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3494 
 3495     // AES/CTR loop
 3496     {
 3497       Label L_CTR_loop;
 3498       __ BIND(L_CTR_loop);
 3499 
 3500       // Setup the counters
 3501       __ movi(v8, __ T4S, 0);
 3502       __ movi(v9, __ T4S, 1);
 3503       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3504 
 3505       assert(v0->encoding() < v8->encoding(), "counter registers v0..v7 must precede v8");
 3506       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3507         FloatRegister f = as_FloatRegister(i);
 3508         __ rev32(f, __ T16B, v16);
 3509         __ addv(v16, __ T4S, v16, v8);
 3510       }
 3511 
 3512       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3513 
 3514       // Encrypt the counters
 3515       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3516 
 3517       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3518 
 3519       // XOR the encrypted counters with the inputs
 3520       for (int i = 0; i < 8; i++) {
 3521         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3522         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3523         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3524       }
 3525       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3526       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3527 
 3528       __ subw(len, len, 16 * 8);
 3529       __ cbnzw(len, L_CTR_loop);
 3530     }
 3531 
 3532     __ rev32(v16, __ T16B, v16);
 3533     __ st1(v16, __ T16B, counter);
 3534 
 3535     __ ldr(len, Address(sp));
 3536     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3537 
 3538     // GHASH/CTR loop
 3539     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3540                                 len, /*unrolls*/4);
 3541 
 3542 #ifdef ASSERT
 3543     { Label L;
 3544       __ cmp(len, (unsigned char)0);
 3545       __ br(Assembler::EQ, L);
 3546       __ stop("stubGenerator: abort");
 3547       __ bind(L);
 3548     }
 3549 #endif
 3550 
 3551     __ bind(DONE);
 3552     // Return the number of bytes processed
 3553     __ ldr(r0, __ post(sp, 2 * wordSize));
 3554 
 3555     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3556     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3557 
 3558     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3559     __ ret(lr);
 3560 
 3561     // bind label and generate polynomial data
 3562     __ align(wordSize * 2);
 3563     __ bind(ghash_polynomial);
 3564     __ emit_int64(0x87);  // The low-order bits of the GHASH field
 3565                           // polynomial z^128 + z^7 + z^2 + z + 1
 3566                           // (i.e. 0x87), repeated in the low and high
 3567                           // halves of a 128-bit vector
 3568     __ emit_int64(0x87);
 3569 
 3570     return start;
 3571   }
 3572 
 3573   class Cached64Bytes {
 3574   private:
 3575     MacroAssembler *_masm;
 3576     Register _regs[8];
 3577 
 3578   public:
 3579     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 3580       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3581       auto it = rs.begin();
 3582       for (auto &r: _regs) {
 3583         r = *it;
 3584         ++it;
 3585       }
 3586     }
 3587 
 3588     void gen_loads(Register base) {
 3589       for (int i = 0; i < 8; i += 2) {
 3590         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3591       }
 3592     }
 3593 
 3594     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
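          // For example, extract_u32(dest, 5) yields bits 32..63 of _regs[2],
          // i.e. the sixth little-endian 32-bit word of the cached block.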
 3595     void extract_u32(Register dest, int i) {
 3596       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3597     }
 3598   };
 3599 
 3600   // Utility routines for md5.
 3601   // Clobbers r10 and r11.
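        // Each helper computes one MD5 round of the form
        //   r1 = r2 + rotl32(r1 + f(r2, r3, r4) + X[k] + t, s)
        // with the pass-specific boolean function f. md5_FF uses
        // F(b, c, d) = (b & c) | (~b & d), evaluated here as ((c ^ d) & b) ^ d.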
 3602   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3603               int k, int s, int t) {
 3604     Register rscratch3 = r10;
 3605     Register rscratch4 = r11;
 3606 
 3607     __ eorw(rscratch3, r3, r4);
 3608     __ movw(rscratch2, t);
 3609     __ andw(rscratch3, rscratch3, r2);
 3610     __ addw(rscratch4, r1, rscratch2);
 3611     reg_cache.extract_u32(rscratch1, k);
 3612     __ eorw(rscratch3, rscratch3, r4);
 3613     __ addw(rscratch4, rscratch4, rscratch1);
 3614     __ addw(rscratch3, rscratch3, rscratch4);
 3615     __ rorw(rscratch2, rscratch3, 32 - s);
 3616     __ addw(r1, rscratch2, r2);
 3617   }
 3618 
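        // md5_GG uses G(b, c, d) = (b & d) | (c & ~d); the two terms are
        // bitwise disjoint (one masked by d, the other by ~d), so the OR is
        // folded into the running additions below.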
 3619   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3620               int k, int s, int t) {
 3621     Register rscratch3 = r10;
 3622     Register rscratch4 = r11;
 3623 
 3624     reg_cache.extract_u32(rscratch1, k);
 3625     __ movw(rscratch2, t);
 3626     __ addw(rscratch4, r1, rscratch2);
 3627     __ addw(rscratch4, rscratch4, rscratch1);
 3628     __ bicw(rscratch2, r3, r4);
 3629     __ andw(rscratch3, r2, r4);
 3630     __ addw(rscratch2, rscratch2, rscratch4);
 3631     __ addw(rscratch2, rscratch2, rscratch3);
 3632     __ rorw(rscratch2, rscratch2, 32 - s);
 3633     __ addw(r1, rscratch2, r2);
 3634   }
 3635 
 3636   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3637               int k, int s, int t) {
 3638     Register rscratch3 = r10;
 3639     Register rscratch4 = r11;
 3640 
 3641     __ eorw(rscratch3, r3, r4);
 3642     __ movw(rscratch2, t);
 3643     __ addw(rscratch4, r1, rscratch2);
 3644     reg_cache.extract_u32(rscratch1, k);
 3645     __ eorw(rscratch3, rscratch3, r2);
 3646     __ addw(rscratch4, rscratch4, rscratch1);
 3647     __ addw(rscratch3, rscratch3, rscratch4);
 3648     __ rorw(rscratch2, rscratch3, 32 - s);
 3649     __ addw(r1, rscratch2, r2);
 3650   }
 3651 
 3652   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3653               int k, int s, int t) {
 3654     Register rscratch3 = r10;
 3655     Register rscratch4 = r11;
 3656 
 3657     __ movw(rscratch3, t);
 3658     __ ornw(rscratch2, r2, r4);
 3659     __ addw(rscratch4, r1, rscratch3);
 3660     reg_cache.extract_u32(rscratch1, k);
 3661     __ eorw(rscratch3, rscratch2, r3);
 3662     __ addw(rscratch4, rscratch4, rscratch1);
 3663     __ addw(rscratch3, rscratch3, rscratch4);
 3664     __ rorw(rscratch2, rscratch3, 32 - s);
 3665     __ addw(r1, rscratch2, r2);
 3666   }
 3667 
 3668   // Arguments:
 3669   //
 3670   // Inputs:
 3671   //   c_rarg0   - byte[]  source+offset
 3672   //   c_rarg1   - int[]   SHA.state
 3673   //   c_rarg2   - int     offset
 3674   //   c_rarg3   - int     limit
 3675   //
 3676   address generate_md5_implCompress(StubId stub_id) {
 3677     bool multi_block;
 3678     switch (stub_id) {
 3679     case StubId::stubgen_md5_implCompress_id:
 3680       multi_block = false;
 3681       break;
 3682     case StubId::stubgen_md5_implCompressMB_id:
 3683       multi_block = true;
 3684       break;
 3685     default:
 3686       ShouldNotReachHere();
 3687     }
 3688     __ align(CodeEntryAlignment);
 3689 
 3690     StubCodeMark mark(this, stub_id);
 3691     address start = __ pc();
 3692 
 3693     Register buf       = c_rarg0;
 3694     Register state     = c_rarg1;
 3695     Register ofs       = c_rarg2;
 3696     Register limit     = c_rarg3;
 3697     Register a         = r4;
 3698     Register b         = r5;
 3699     Register c         = r6;
 3700     Register d         = r7;
 3701     Register rscratch3 = r10;
 3702     Register rscratch4 = r11;
 3703 
 3704     Register state_regs[2] = { r12, r13 };
 3705     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3706     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3707 
 3708     __ push(saved_regs, sp);
 3709 
 3710     __ ldp(state_regs[0], state_regs[1], Address(state));
 3711     __ ubfx(a, state_regs[0],  0, 32);
 3712     __ ubfx(b, state_regs[0], 32, 32);
 3713     __ ubfx(c, state_regs[1],  0, 32);
 3714     __ ubfx(d, state_regs[1], 32, 32);
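          // a, b, c and d stay unpacked for the rounds; state_regs[] keeps the
          // original packed state (a | b << 32, c | d << 32) so it can be added
          // back in and re-packed with orr/LSL 32 after each 64-byte block.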
 3715 
 3716     Label md5_loop;
 3717     __ BIND(md5_loop);
 3718 
 3719     reg_cache.gen_loads(buf);
 3720 
 3721     // Round 1
 3722     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3723     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3724     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3725     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3726     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3727     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3728     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3729     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3730     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3731     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3732     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3733     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3734     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3735     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3736     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3737     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3738 
 3739     // Round 2
 3740     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3741     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3742     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3743     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3744     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3745     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3746     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3747     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3748     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3749     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3750     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3751     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3752     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3753     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3754     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3755     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3756 
 3757     // Round 3
 3758     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3759     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3760     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3761     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3762     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3763     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3764     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3765     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3766     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3767     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3768     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3769     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3770     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3771     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3772     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3773     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3774 
 3775     // Round 4
 3776     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3777     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3778     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3779     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3780     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3781     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3782     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3783     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3784     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3785     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3786     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3787     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3788     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3789     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3790     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3791     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3792 
 3793     __ addw(a, state_regs[0], a);
 3794     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3795     __ addw(b, rscratch2, b);
 3796     __ addw(c, state_regs[1], c);
 3797     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3798     __ addw(d, rscratch4, d);
 3799 
 3800     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3801     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3802 
 3803     if (multi_block) {
 3804       __ add(buf, buf, 64);
 3805       __ add(ofs, ofs, 64);
 3806       __ cmp(ofs, limit);
 3807       __ br(Assembler::LE, md5_loop);
 3808       __ mov(c_rarg0, ofs); // return ofs
 3809     }
 3810 
 3811     // write hash values back in the correct order
 3812     __ stp(state_regs[0], state_regs[1], Address(state));
 3813 
 3814     __ pop(saved_regs, sp);
 3815 
 3816     __ ret(lr);
 3817 
 3818     return start;
 3819   }
 3820 
 3821   // Arguments:
 3822   //
 3823   // Inputs:
 3824   //   c_rarg0   - byte[]  source+offset
 3825   //   c_rarg1   - int[]   SHA.state
 3826   //   c_rarg2   - int     offset
 3827   //   c_rarg3   - int     limit
 3828   //
 3829   address generate_sha1_implCompress(StubId stub_id) {
 3830     bool multi_block;
 3831     switch (stub_id) {
 3832     case StubId::stubgen_sha1_implCompress_id:
 3833       multi_block = false;
 3834       break;
 3835     case StubId::stubgen_sha1_implCompressMB_id:
 3836       multi_block = true;
 3837       break;
 3838     default:
 3839       ShouldNotReachHere();
 3840     }
 3841 
 3842     __ align(CodeEntryAlignment);
 3843 
 3844     StubCodeMark mark(this, stub_id);
 3845     address start = __ pc();
 3846 
 3847     Register buf   = c_rarg0;
 3848     Register state = c_rarg1;
 3849     Register ofs   = c_rarg2;
 3850     Register limit = c_rarg3;
 3851 
 3852     Label keys;
 3853     Label sha1_loop;
 3854 
 3855     // load the keys into v0..v3
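          // ld4r replicates each of the four 32-bit round constants at 'keys'
          // across all lanes, so v0..v3 hold K0..K3 (used for rounds 0-19,
          // 20-39, 40-59 and 60-79 respectively)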
 3856     __ adr(rscratch1, keys);
 3857     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3858     // load the 5-word state into v6, v7
 3859     __ ldrq(v6, Address(state, 0));
 3860     __ ldrs(v7, Address(state, 16));
 3861 
 3862 
 3863     __ BIND(sha1_loop);
 3864     // load 64 bytes of data into v16..v19
 3865     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3866     __ rev32(v16, __ T16B, v16);
 3867     __ rev32(v17, __ T16B, v17);
 3868     __ rev32(v18, __ T16B, v18);
 3869     __ rev32(v19, __ T16B, v19);
 3870 
 3871     // do the sha1
 3872     __ addv(v4, __ T4S, v16, v0);
 3873     __ orr(v20, __ T16B, v6, v6);
 3874 
 3875     FloatRegister d0 = v16;
 3876     FloatRegister d1 = v17;
 3877     FloatRegister d2 = v18;
 3878     FloatRegister d3 = v19;
 3879 
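          // The 80 SHA-1 rounds run as 20 groups of four. Each iteration also
          // pre-computes W + K for the *next* group (hence the key boundaries
          // at 4/9/14 rather than 5/10/15), extends the message schedule with
          // sha1su0/sha1su1 while round < 16, and selects the round function:
          // sha1c (Ch) for rounds 0-19, sha1p (parity) for 20-39 and 60-79,
          // and sha1m (Maj) for 40-59.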
 3880     for (int round = 0; round < 20; round++) {
 3881       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3882       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3883       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3884       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3885       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3886 
 3887       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3888       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3889       __ sha1h(tmp2, __ T4S, v20);
 3890       if (round < 5)
 3891         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3892       else if (round < 10 || round >= 15)
 3893         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3894       else
 3895         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3896       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3897 
 3898       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3899     }
 3900 
 3901     __ addv(v7, __ T2S, v7, v21);
 3902     __ addv(v6, __ T4S, v6, v20);
 3903 
 3904     if (multi_block) {
 3905       __ add(ofs, ofs, 64);
 3906       __ cmp(ofs, limit);
 3907       __ br(Assembler::LE, sha1_loop);
 3908       __ mov(c_rarg0, ofs); // return ofs
 3909     }
 3910 
 3911     __ strq(v6, Address(state, 0));
 3912     __ strs(v7, Address(state, 16));
 3913 
 3914     __ ret(lr);
 3915 
 3916     __ bind(keys);
 3917     __ emit_int32(0x5a827999);
 3918     __ emit_int32(0x6ed9eba1);
 3919     __ emit_int32(0x8f1bbcdc);
 3920     __ emit_int32(0xca62c1d6);
 3921 
 3922     return start;
 3923   }
 3924 
 3925 
 3926   // Arguments:
 3927   //
 3928   // Inputs:
 3929   //   c_rarg0   - byte[]  source+offset
 3930   //   c_rarg1   - int[]   SHA.state
 3931   //   c_rarg2   - int     offset
 3932   //   c_rarg3   - int     limit
 3933   //
 3934   address generate_sha256_implCompress(StubId stub_id) {
 3935     bool multi_block;
 3936     switch (stub_id) {
 3937     case StubId::stubgen_sha256_implCompress_id:
 3938       multi_block = false;
 3939       break;
 3940     case StubId::stubgen_sha256_implCompressMB_id:
 3941       multi_block = true;
 3942       break;
 3943     default:
 3944       ShouldNotReachHere();
 3945     }
 3946 
 3947     static const uint32_t round_consts[64] = {
 3948       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3949       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3950       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3951       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3952       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3953       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3954       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3955       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3956       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3957       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3958       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3959       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3960       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3961       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3962       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3963       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3964     };
 3965 
 3966     __ align(CodeEntryAlignment);
 3967 
 3968     StubCodeMark mark(this, stub_id);
 3969     address start = __ pc();
 3970 
 3971     Register buf   = c_rarg0;
 3972     Register state = c_rarg1;
 3973     Register ofs   = c_rarg2;
 3974     Register limit = c_rarg3;
 3975 
 3976     Label sha256_loop;
 3977 
 3978     __ stpd(v8, v9, __ pre(sp, -32));
 3979     __ stpd(v10, v11, Address(sp, 16));
 3980 
 3981     // dga == v0
 3982     // dgb == v1
 3983     // dg0 == v2
 3984     // dg1 == v3
 3985     // dg2 == v4
 3986     // t0 == v6
 3987     // t1 == v7
 3988 
 3989     // load the 64 round constants into v16..v31 (four per vector)
 3990     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3991     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3992     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3993     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3994     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3995 
 3996     // load the 8-word (256-bit) state
 3997     __ ldpq(v0, v1, state);
 3998 
 3999     __ BIND(sha256_loop);
 4000     // load 64 bytes of data into v8..v11
 4001     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 4002     __ rev32(v8, __ T16B, v8);
 4003     __ rev32(v9, __ T16B, v9);
 4004     __ rev32(v10, __ T16B, v10);
 4005     __ rev32(v11, __ T16B, v11);
 4006 
 4007     __ addv(v6, __ T4S, v8, v16);
 4008     __ orr(v2, __ T16B, v0, v0);
 4009     __ orr(v3, __ T16B, v1, v1);
 4010 
 4011     FloatRegister d0 = v8;
 4012     FloatRegister d1 = v9;
 4013     FloatRegister d2 = v10;
 4014     FloatRegister d3 = v11;
 4015 
 4016 
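          // The 64 SHA-256 rounds run as 16 groups of four. W + K for the
          // first group was formed above (v6); each iteration prepares the
          // next group's W + K from the constants in v17..v31 and, while
          // round < 12, extends the message schedule with sha256su0/sha256su1.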
 4017     for (int round = 0; round < 16; round++) {
 4018       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 4019       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 4020       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 4021       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 4022 
 4023       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 4024        __ orr(v4, __ T16B, v2, v2);
 4025       if (round < 15)
 4026         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 4027       __ sha256h(v2, __ T4S, v3, tmp2);
 4028       __ sha256h2(v3, __ T4S, v4, tmp2);
 4029       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 4030 
 4031       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4032     }
 4033 
 4034     __ addv(v0, __ T4S, v0, v2);
 4035     __ addv(v1, __ T4S, v1, v3);
 4036 
 4037     if (multi_block) {
 4038       __ add(ofs, ofs, 64);
 4039       __ cmp(ofs, limit);
 4040       __ br(Assembler::LE, sha256_loop);
 4041       __ mov(c_rarg0, ofs); // return ofs
 4042     }
 4043 
 4044     __ ldpd(v10, v11, Address(sp, 16));
 4045     __ ldpd(v8, v9, __ post(sp, 32));
 4046 
 4047     __ stpq(v0, v1, state);
 4048 
 4049     __ ret(lr);
 4050 
 4051     return start;
 4052   }
 4053 
 4054   // Double rounds for sha512.
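        // Each call handles two of the 80 rounds, so the compression loop
        // issues 40 of them. vrc0 supplies this step's pair of round constants
        // while the next pair is prefetched into vrc1 (dr < 36; the first four
        // pairs are loaded before the loop), and sha512su0/sha512su1 extend the
        // 16-word message schedule while dr < 32.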
 4055   void sha512_dround(int dr,
 4056                      FloatRegister vi0, FloatRegister vi1,
 4057                      FloatRegister vi2, FloatRegister vi3,
 4058                      FloatRegister vi4, FloatRegister vrc0,
 4059                      FloatRegister vrc1, FloatRegister vin0,
 4060                      FloatRegister vin1, FloatRegister vin2,
 4061                      FloatRegister vin3, FloatRegister vin4) {
 4062       if (dr < 36) {
 4063         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4064       }
 4065       __ addv(v5, __ T2D, vrc0, vin0);
 4066       __ ext(v6, __ T16B, vi2, vi3, 8);
 4067       __ ext(v5, __ T16B, v5, v5, 8);
 4068       __ ext(v7, __ T16B, vi1, vi2, 8);
 4069       __ addv(vi3, __ T2D, vi3, v5);
 4070       if (dr < 32) {
 4071         __ ext(v5, __ T16B, vin3, vin4, 8);
 4072         __ sha512su0(vin0, __ T2D, vin1);
 4073       }
 4074       __ sha512h(vi3, __ T2D, v6, v7);
 4075       if (dr < 32) {
 4076         __ sha512su1(vin0, __ T2D, vin2, v5);
 4077       }
 4078       __ addv(vi4, __ T2D, vi1, vi3);
 4079       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4080   }
 4081 
 4082   // Arguments:
 4083   //
 4084   // Inputs:
 4085   //   c_rarg0   - byte[]  source+offset
 4086   //   c_rarg1   - int[]   SHA.state
 4087   //   c_rarg2   - int     offset
 4088   //   c_rarg3   - int     limit
 4089   //
 4090   address generate_sha512_implCompress(StubId stub_id) {
 4091     bool multi_block;
 4092     switch (stub_id) {
 4093     case StubId::stubgen_sha512_implCompress_id:
 4094       multi_block = false;
 4095       break;
 4096     case StubId::stubgen_sha512_implCompressMB_id:
 4097       multi_block = true;
 4098       break;
 4099     default:
 4100       ShouldNotReachHere();
 4101     }
 4102 
 4103     static const uint64_t round_consts[80] = {
 4104       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 4105       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 4106       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 4107       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 4108       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 4109       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 4110       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 4111       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 4112       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 4113       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 4114       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 4115       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 4116       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 4117       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 4118       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 4119       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 4120       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 4121       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 4122       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 4123       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 4124       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 4125       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 4126       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 4127       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 4128       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 4129       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 4130       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 4131     };
 4132 
 4133     __ align(CodeEntryAlignment);
 4134 
 4135     StubCodeMark mark(this, stub_id);
 4136     address start = __ pc();
 4137 
 4138     Register buf   = c_rarg0;
 4139     Register state = c_rarg1;
 4140     Register ofs   = c_rarg2;
 4141     Register limit = c_rarg3;
 4142 
 4143     __ stpd(v8, v9, __ pre(sp, -64));
 4144     __ stpd(v10, v11, Address(sp, 16));
 4145     __ stpd(v12, v13, Address(sp, 32));
 4146     __ stpd(v14, v15, Address(sp, 48));
 4147 
 4148     Label sha512_loop;
 4149 
 4150     // load state
 4151     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4152 
 4153     // load first 4 round constants
 4154     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4155     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4156 
 4157     __ BIND(sha512_loop);
 4158     // load 128B of data into v12..v19
 4159     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4160     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4161     __ rev64(v12, __ T16B, v12);
 4162     __ rev64(v13, __ T16B, v13);
 4163     __ rev64(v14, __ T16B, v14);
 4164     __ rev64(v15, __ T16B, v15);
 4165     __ rev64(v16, __ T16B, v16);
 4166     __ rev64(v17, __ T16B, v17);
 4167     __ rev64(v18, __ T16B, v18);
 4168     __ rev64(v19, __ T16B, v19);
 4169 
 4170     __ mov(rscratch2, rscratch1);
 4171 
 4172     __ mov(v0, __ T16B, v8);
 4173     __ mov(v1, __ T16B, v9);
 4174     __ mov(v2, __ T16B, v10);
 4175     __ mov(v3, __ T16B, v11);
 4176 
 4177     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4178     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4179     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4180     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4181     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4182     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4183     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4184     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4185     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4186     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4187     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4188     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4189     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4190     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4191     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4192     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4193     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4194     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4195     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4196     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4197     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4198     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4199     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4200     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4201     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4202     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4203     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4204     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4205     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4206     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4207     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4208     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4209     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4210     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4211     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4212     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4213     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4214     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4215     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4216     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4217 
 4218     __ addv(v8, __ T2D, v8, v0);
 4219     __ addv(v9, __ T2D, v9, v1);
 4220     __ addv(v10, __ T2D, v10, v2);
 4221     __ addv(v11, __ T2D, v11, v3);
 4222 
 4223     if (multi_block) {
 4224       __ add(ofs, ofs, 128);
 4225       __ cmp(ofs, limit);
 4226       __ br(Assembler::LE, sha512_loop);
 4227       __ mov(c_rarg0, ofs); // return ofs
 4228     }
 4229 
 4230     __ st1(v8, v9, v10, v11, __ T2D, state);
 4231 
 4232     __ ldpd(v14, v15, Address(sp, 48));
 4233     __ ldpd(v12, v13, Address(sp, 32));
 4234     __ ldpd(v10, v11, Address(sp, 16));
 4235     __ ldpd(v8, v9, __ post(sp, 64));
 4236 
 4237     __ ret(lr);
 4238 
 4239     return start;
 4240   }
 4241 
 4242   // Execute one round of keccak of two computations in parallel.
 4243   // One of the states should be loaded into the lower halves of
 4244   // the vector registers v0-v24, the other should be loaded into
 4245   // the upper halves of those registers. The ld1r instruction loads
 4246   // the round constant into both halves of register v31.
 4247   // Intermediate results c0...c5 and d0...d5 are computed
 4248   // in registers v25...v30.
 4249   // All vector instructions that are used operate on both register
 4250   // halves in parallel.
 4251   // If only a single computation is needed, one can load just the lower halves.
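        //
        // In outline, one round computes, for the 5x5 lane matrix a[x,y]:
        //   theta:  c[x] = a[x,0] ^ ... ^ a[x,4];
        //           d[x] = c[x-1] ^ rol(c[x+1], 1);  a[x,y] ^= d[x]
        //           (the eor3 / rax1 / xar sequences below)
        //   rho/pi: each lane is rotated and moved to its new position as
        //           part of the same xar steps
        //   chi:    a[x,y] ^= ~a[x+1,y] & a[x+2,y]   (the bcax sequences)
        //   iota:   a[0,0] ^= round constant         (the final eor with v31)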
 4252   void keccak_round(Register rscratch1) {
 4253   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
 4254   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
 4255   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4256   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4257   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4258   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4259   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4260   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4261   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4262   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4263 
 4264   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4265   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4266   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4267   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4268   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4269 
 4270   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4271   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
 4272   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4273   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4274   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4275   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4276   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4277   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4278   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4279   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4280   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4281   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4282   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4283   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4284   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4285   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4286   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4287   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4288   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4289   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4290   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4291   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4292   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4293   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4294   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4295 
 4296   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22)
 4297   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4298   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4299   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4300   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4301 
 4302   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4303 
 4304   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4305   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4306   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4307   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4308   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4309 
 4310   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4311   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4312   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4313   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4314   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4315 
 4316   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4317   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4318   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4319   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4320   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4321 
 4322   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4323   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4324   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4325   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4326   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4327 
 4328   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4329   }
 4330 
 4331   // Arguments:
 4332   //
 4333   // Inputs:
 4334   //   c_rarg0   - byte[]  source+offset
 4335   //   c_rarg1   - byte[]  SHA.state
 4336   //   c_rarg2   - int     block_size
 4337   //   c_rarg3   - int     offset
 4338   //   c_rarg4   - int     limit
 4339   //
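        //   block_size is the sponge rate in bytes and selects the variant:
        //   72 (SHA3-512), 104 (SHA3-384), 136 (SHA3-256/SHAKE256),
        //   144 (SHA3-224) or 168 (SHAKE128); the bit tests below dispatch
        //   on these values when absorbing the input block.
        //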
 4340   address generate_sha3_implCompress(StubId stub_id) {
 4341     bool multi_block;
 4342     switch (stub_id) {
 4343     case StubId::stubgen_sha3_implCompress_id:
 4344       multi_block = false;
 4345       break;
 4346     case StubId::stubgen_sha3_implCompressMB_id:
 4347       multi_block = true;
 4348       break;
 4349     default:
 4350       ShouldNotReachHere();
 4351     }
 4352 
 4353     static const uint64_t round_consts[24] = {
 4354       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4355       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4356       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4357       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4358       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4359       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4360       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4361       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4362     };
 4363 
 4364     __ align(CodeEntryAlignment);
 4365 
 4366     StubCodeMark mark(this, stub_id);
 4367     address start = __ pc();
 4368 
 4369     Register buf           = c_rarg0;
 4370     Register state         = c_rarg1;
 4371     Register block_size    = c_rarg2;
 4372     Register ofs           = c_rarg3;
 4373     Register limit         = c_rarg4;
 4374 
 4375     Label sha3_loop, rounds24_loop;
 4376     Label sha3_512_or_sha3_384, shake128;
 4377 
 4378     __ stpd(v8, v9, __ pre(sp, -64));
 4379     __ stpd(v10, v11, Address(sp, 16));
 4380     __ stpd(v12, v13, Address(sp, 32));
 4381     __ stpd(v14, v15, Address(sp, 48));
 4382 
 4383     // load state
 4384     __ add(rscratch1, state, 32);
 4385     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4386     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4387     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4388     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4389     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4390     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4391     __ ld1(v24, __ T1D, rscratch1);
 4392 
 4393     __ BIND(sha3_loop);
 4394 
 4395     // 24 keccak rounds
 4396     __ movw(rscratch2, 24);
 4397 
 4398     // load round_constants base
 4399     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4400 
 4401     // load input
 4402     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4403     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4404     __ eor(v0, __ T8B, v0, v25);
 4405     __ eor(v1, __ T8B, v1, v26);
 4406     __ eor(v2, __ T8B, v2, v27);
 4407     __ eor(v3, __ T8B, v3, v28);
 4408     __ eor(v4, __ T8B, v4, v29);
 4409     __ eor(v5, __ T8B, v5, v30);
 4410     __ eor(v6, __ T8B, v6, v31);
 4411 
 4412     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4413     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4414 
 4415     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4416     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4417     __ eor(v7, __ T8B, v7, v25);
 4418     __ eor(v8, __ T8B, v8, v26);
 4419     __ eor(v9, __ T8B, v9, v27);
 4420     __ eor(v10, __ T8B, v10, v28);
 4421     __ eor(v11, __ T8B, v11, v29);
 4422     __ eor(v12, __ T8B, v12, v30);
 4423     __ eor(v13, __ T8B, v13, v31);
 4424 
 4425     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4426     __ eor(v14, __ T8B, v14, v25);
 4427     __ eor(v15, __ T8B, v15, v26);
 4428     __ eor(v16, __ T8B, v16, v27);
 4429 
 4430     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4431     __ andw(c_rarg5, block_size, 48);
 4432     __ cbzw(c_rarg5, rounds24_loop);
 4433 
 4434     __ tbnz(block_size, 5, shake128);
 4435     // block_size == 144, bit5 == 0, SHA3-224
 4436     __ ldrd(v28, __ post(buf, 8));
 4437     __ eor(v17, __ T8B, v17, v28);
 4438     __ b(rounds24_loop);
 4439 
 4440     __ BIND(shake128);
 4441     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4442     __ eor(v17, __ T8B, v17, v28);
 4443     __ eor(v18, __ T8B, v18, v29);
 4444     __ eor(v19, __ T8B, v19, v30);
 4445     __ eor(v20, __ T8B, v20, v31);
 4446     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4447 
 4448     __ BIND(sha3_512_or_sha3_384);
 4449     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4450     __ eor(v7, __ T8B, v7, v25);
 4451     __ eor(v8, __ T8B, v8, v26);
 4452     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4453 
 4454     // SHA3-384
 4455     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4456     __ eor(v9,  __ T8B, v9,  v27);
 4457     __ eor(v10, __ T8B, v10, v28);
 4458     __ eor(v11, __ T8B, v11, v29);
 4459     __ eor(v12, __ T8B, v12, v30);
 4460 
 4461     __ BIND(rounds24_loop);
 4462     __ subw(rscratch2, rscratch2, 1);
 4463 
 4464     keccak_round(rscratch1);
 4465 
 4466     __ cbnzw(rscratch2, rounds24_loop);
 4467 
 4468     if (multi_block) {
 4469       __ add(ofs, ofs, block_size);
 4470       __ cmp(ofs, limit);
 4471       __ br(Assembler::LE, sha3_loop);
 4472       __ mov(c_rarg0, ofs); // return ofs
 4473     }
 4474 
 4475     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4476     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4477     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4478     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4479     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4480     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4481     __ st1(v24, __ T1D, state);
 4482 
 4483     // restore callee-saved registers
 4484     __ ldpd(v14, v15, Address(sp, 48));
 4485     __ ldpd(v12, v13, Address(sp, 32));
 4486     __ ldpd(v10, v11, Address(sp, 16));
 4487     __ ldpd(v8, v9, __ post(sp, 64));
 4488 
 4489     __ ret(lr);
 4490 
 4491     return start;
 4492   }
 4493 
 4494   // Inputs:
 4495   //   c_rarg0   - long[]  state0
 4496   //   c_rarg1   - long[]  state1
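        //
        // The two 25-lane states are processed in parallel: state0 occupies
        // lane 0 (the lower halves) and state1 lane 1 (the upper halves) of
        // v0..v24, so each keccak_round() call below advances both
        // computations at once, as described above keccak_round().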
 4497   address generate_double_keccak() {
 4498     static const uint64_t round_consts[24] = {
 4499       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4500       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4501       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4502       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4503       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4504       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4505       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4506       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4507     };
 4508 
 4509     // Implements the double_keccak() method of the
 4510     // sun.security.provider.SHA3Parallel class
 4511     __ align(CodeEntryAlignment);
 4512     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4513     address start = __ pc();
 4514     __ enter();
 4515 
 4516     Register state0        = c_rarg0;
 4517     Register state1        = c_rarg1;
 4518 
 4519     Label rounds24_loop;
 4520 
 4521     // save callee-saved registers
 4522     __ stpd(v8, v9, __ pre(sp, -64));
 4523     __ stpd(v10, v11, Address(sp, 16));
 4524     __ stpd(v12, v13, Address(sp, 32));
 4525     __ stpd(v14, v15, Address(sp, 48));
 4526 
 4527     // load states
 4528     __ add(rscratch1, state0, 32);
 4529     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4530     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4531     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4532     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4533     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4534     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4535     __ ld1(v24, __ D, 0, rscratch1);
 4536     __ add(rscratch1, state1, 32);
 4537     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4538     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4539     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4540     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4541     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4542     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4543     __ ld1(v24, __ D, 1, rscratch1);
 4544 
 4545     // 24 keccak rounds
 4546     __ movw(rscratch2, 24);
 4547 
 4548     // load round_constants base
 4549     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4550 
 4551     __ BIND(rounds24_loop);
 4552     __ subw(rscratch2, rscratch2, 1);
 4553     keccak_round(rscratch1);
 4554     __ cbnzw(rscratch2, rounds24_loop);
 4555 
 4556     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4557     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4558     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4559     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4560     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4561     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4562     __ st1(v24, __ D, 0, state0);
 4563     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4564     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4565     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4566     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4567     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4568     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4569     __ st1(v24, __ D, 1, state1);
 4570 
 4571     // restore callee-saved vector registers
 4572     __ ldpd(v14, v15, Address(sp, 48));
 4573     __ ldpd(v12, v13, Address(sp, 32));
 4574     __ ldpd(v10, v11, Address(sp, 16));
 4575     __ ldpd(v8, v9, __ post(sp, 64));
 4576 
 4577     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4578     __ mov(r0, zr); // return 0
 4579     __ ret(lr);
 4580 
 4581     return start;
 4582   }
 4583 
 4584   // ChaCha20 block function.  This version parallelizes the 32-bit
 4585   // state elements on each of 16 vectors, producing 4 blocks of
 4586   // keystream at a time.
 4587   //
 4588   // state (int[16]) = c_rarg0
 4589   // keystream (byte[256]) = c_rarg1
 4590   // return - number of bytes of produced keystream (always 256)
 4591   //
 4592   // This implementation takes each 32-bit integer from the state
 4593   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4594   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4595   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4596   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4597   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4598   // operations represented by one QUARTERROUND function, we instead stack all
 4599   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4600   // and then do the same for the second set of 4 quarter rounds.  This removes
 4601   // some latency that would otherwise be incurred by waiting for an add to
 4602   // complete before performing an xor (which depends on the result of the
 4603   // add), etc. An adjustment happens between the first and second groups of 4
 4604   // quarter rounds, but this is done only in the inputs to the macro functions
 4605   // that generate the assembly instructions - these adjustments themselves are
 4606   // not part of the resulting assembly.
 4607   // The 4 registers v0-v3 are used during the quarter round operations as
 4608   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4609   // registers become the vectors involved in adding the start state back onto
 4610   // the post-QR working state.  After the adds are complete, each of the 16
 4611   // vectors write their first lane back to the keystream buffer, followed
 4612   // by the second lane from all vectors and so on.
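        //
        // For reference, a single ChaCha20 quarter round on state words
        // a, b, c and d is (RFC 7539, section 2.1):
        //   a += b; d ^= a; d <<<= 16;
        //   c += d; b ^= c; b <<<= 12;
        //   a += b; d ^= a; d <<<= 8;
        //   c += d; b ^= c; b <<<= 7;
        // Each cc20_qr_* call below applies one of these steps to four
        // (a, b, c, d) quadruples at once.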
 4613   address generate_chacha20Block_blockpar() {
 4614     Label L_twoRounds, L_cc20_const;
 4615     __ align(CodeEntryAlignment);
 4616     StubId stub_id = StubId::stubgen_chacha20Block_id;
 4617     StubCodeMark mark(this, stub_id);
 4618     address start = __ pc();
 4619     __ enter();
 4620 
 4621     int i, j;
 4622     const Register state = c_rarg0;
 4623     const Register keystream = c_rarg1;
 4624     const Register loopCtr = r10;
 4625     const Register tmpAddr = r11;
 4626     const FloatRegister ctrAddOverlay = v28;
 4627     const FloatRegister lrot8Tbl = v29;
 4628 
 4629     // Organize SIMD registers in an array that facilitates
 4630     // putting repetitive opcodes into loop structures.  It is
 4631     // important that each grouping of 4 registers is monotonically
 4632     // increasing to support the requirements of multi-register
 4633     // instructions (e.g. ld4r, st4, etc.)
 4634     const FloatRegister workSt[16] = {
 4635          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4636         v20, v21, v22, v23, v24, v25, v26, v27
 4637     };
 4638 
 4639     // Pull in constant data.  The first 16 bytes are the add overlay
 4640     // which is applied to the vector holding the counter (state[12]).
 4641     // The second 16 bytes is the index register for the 8-bit left
 4642     // rotation tbl instruction.
 4643     __ adr(tmpAddr, L_cc20_const);
 4644     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4645 
 4646     // Load from memory and interlace across 16 SIMD registers,
 4647     // with each word from memory broadcast to all lanes of
 4648     // each successive SIMD register.
 4649     //      Addr(0) -> All lanes in workSt[i]
 4650     //      Addr(4) -> All lanes workSt[i + 1], etc.
 4651     __ mov(tmpAddr, state);
 4652     for (i = 0; i < 16; i += 4) {
 4653       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4654           __ post(tmpAddr, 16));
 4655     }
 4656     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4657 
 4658     // Before entering the loop, create 5 4-register arrays.  These
 4659     // will hold the 4 registers that represent the a/b/c/d fields
 4660     // in the quarter round operation.  For instance the "b" field
 4661     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4662     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4663     // since it is part of a diagonal organization.  The aSet and scratch
 4664     // register sets are defined at declaration time because they do not change
 4665     // organization at any point during the 20-round processing.
 4666     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4667     FloatRegister bSet[4];
 4668     FloatRegister cSet[4];
 4669     FloatRegister dSet[4];
 4670     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4671 
 4672     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4673     __ mov(loopCtr, 10);
 4674     __ BIND(L_twoRounds);
 4675 
 4676     // Set to columnar organization and do the following 4 quarter-rounds:
 4677     // QUARTERROUND(0, 4, 8, 12)
 4678     // QUARTERROUND(1, 5, 9, 13)
 4679     // QUARTERROUND(2, 6, 10, 14)
 4680     // QUARTERROUND(3, 7, 11, 15)
 4681     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4682     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4683     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4684 
 4685     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4686     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4687     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4688 
 4689     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4690     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4691     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4692 
 4693     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4694     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4695     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4696 
 4697     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4698     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4699     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4700 
 4701     // Set to diagonal organization and do the next 4 quarter-rounds:
 4702     // QUARTERROUND(0, 5, 10, 15)
 4703     // QUARTERROUND(1, 6, 11, 12)
 4704     // QUARTERROUND(2, 7, 8, 13)
 4705     // QUARTERROUND(3, 4, 9, 14)
 4706     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4707     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4708     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4709 
 4710     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4711     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4712     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4713 
 4714     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4715     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4716     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4717 
 4718     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4719     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4720     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4721 
 4722     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4723     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4724     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4725 
 4726     // Decrement and iterate
 4727     __ sub(loopCtr, loopCtr, 1);
 4728     __ cbnz(loopCtr, L_twoRounds);
 4729 
 4730     __ mov(tmpAddr, state);
 4731 
 4732     // Add the starting state back to the post-loop keystream
 4733     // state.  We read/interlace the state array from memory into
 4734     // 4 registers similar to what we did in the beginning.  Then
 4735     // add the counter overlay onto workSt[12] at the end.
 4736     for (i = 0; i < 16; i += 4) {
 4737       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4738       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4739       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4740       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4741       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4742     }
 4743     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4744 
 4745     // Write working state into the keystream buffer.  This is accomplished
 4746     // by taking the lane "i" from each of the four vectors and writing
 4747     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4748     // repeating with the next 4 vectors until all 16 vectors have been used.
 4749     // Then move to the next lane and repeat the process until all lanes have
 4750     // been written.
 4751     for (i = 0; i < 4; i++) {
 4752       for (j = 0; j < 16; j += 4) {
 4753         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4754             __ post(keystream, 16));
 4755       }
 4756     }
 4757 
 4758     __ mov(r0, 256);             // Return length of output keystream
 4759     __ leave();
 4760     __ ret(lr);
 4761 
 4762     // bind label and generate local constant data used by this stub
 4763     // The constant data is broken into two 128-bit segments to be loaded
 4764     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4765     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 4766     // The second 128-bits is a table constant used for 8-bit left rotations.
 4767     __ BIND(L_cc20_const);
 4768     __ emit_int64(0x0000000100000000UL);
 4769     __ emit_int64(0x0000000300000002UL);
 4770     __ emit_int64(0x0605040702010003UL);
 4771     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4772 
 4773     return start;
 4774   }
 4775 
 4776   // Helpers to schedule parallel operation bundles across vector
 4777   // register sequences of size 2, 4 or 8.
 4778 
 4779   // Implement various primitive computations across vector sequences
 4780 
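        //
        // For example, with VSeq<4> va(0), vb(4), vc(8), the call
        //   vs_addv(va, __ T8H, vb, vc)
        // expands to addv(v0, T8H, v4, v8) ... addv(v3, T8H, v7, v11),
        // i.e. one instruction per sequence element, giving four
        // independent computations that can proceed in parallel.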
 4781   template<int N>
 4782   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4783                const VSeq<N>& v1, const VSeq<N>& v2) {
 4784     // output must not be constant
 4785     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4786     // output cannot overwrite pending inputs
 4787     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4788     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4789     for (int i = 0; i < N; i++) {
 4790       __ addv(v[i], T, v1[i], v2[i]);
 4791     }
 4792   }
 4793 
 4794   template<int N>
 4795   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4796                const VSeq<N>& v1, const VSeq<N>& v2) {
 4797     // output must not be constant
 4798     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4799     // output cannot overwrite pending inputs
 4800     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4801     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4802     for (int i = 0; i < N; i++) {
 4803       __ subv(v[i], T, v1[i], v2[i]);
 4804     }
 4805   }
 4806 
 4807   template<int N>
 4808   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4809                const VSeq<N>& v1, const VSeq<N>& v2) {
 4810     // output must not be constant
 4811     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4812     // output cannot overwrite pending inputs
 4813     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4814     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4815     for (int i = 0; i < N; i++) {
 4816       __ mulv(v[i], T, v1[i], v2[i]);
 4817     }
 4818   }
 4819 
 4820   template<int N>
 4821   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4822     // output must not be constant
 4823     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4824     // output cannot overwrite pending inputs
 4825     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4826     for (int i = 0; i < N; i++) {
 4827       __ negr(v[i], T, v1[i]);
 4828     }
 4829   }
 4830 
 4831   template<int N>
 4832   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4833                const VSeq<N>& v1, int shift) {
 4834     // output must not be constant
 4835     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4836     // output cannot overwrite pending inputs
 4837     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4838     for (int i = 0; i < N; i++) {
 4839       __ sshr(v[i], T, v1[i], shift);
 4840     }
 4841   }
 4842 
 4843   template<int N>
 4844   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4845     // output must not be constant
 4846     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4847     // output cannot overwrite pending inputs
 4848     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4849     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4850     for (int i = 0; i < N; i++) {
 4851       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4852     }
 4853   }
 4854 
 4855   template<int N>
 4856   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4857     // output must not be constant
 4858     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4859     // output cannot overwrite pending inputs
 4860     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4861     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4862     for (int i = 0; i < N; i++) {
 4863       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4864     }
 4865   }
 4866 
 4867   template<int N>
 4868   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4869     // output must not be constant
 4870     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4871     // output cannot overwrite pending inputs
 4872     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4873     for (int i = 0; i < N; i++) {
 4874       __ notr(v[i], __ T16B, v1[i]);
 4875     }
 4876   }
 4877 
 4878   template<int N>
 4879   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4880     // output must not be constant
 4881     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4882     // output cannot overwrite pending inputs
 4883     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4884     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4885     for (int i = 0; i < N; i++) {
 4886       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4887     }
 4888   }
 4889 
 4890   template<int N>
 4891   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4892     // output must not be constant
 4893     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4894     // output cannot overwrite pending inputs
 4895     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4896     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4897     for (int i = 0; i < N; i++) {
 4898       __ mlsv(v[i], T, v1[i], v2[i]);
 4899     }
 4900   }
 4901 
 4902   // load N/2 successive pairs of quadword values from memory in order
 4903   // into N successive vector registers of the sequence via the
 4904   // address supplied in base.
 4905   template<int N>
 4906   void vs_ldpq(const VSeq<N>& v, Register base) {
 4907     for (int i = 0; i < N; i += 2) {
 4908       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4909     }
 4910   }
 4911 
 4912   // load N/2 successive pairs of quadword values from memory in order
 4913   // into N vector registers of the sequence via the address supplied
 4914   // in base using post-increment addressing
 4915   template<int N>
 4916   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4917     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4918     for (int i = 0; i < N; i += 2) {
 4919       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4920     }
 4921   }
 4922 
 4923   // store N successive vector registers of the sequence into N/2
 4924   // successive pairs of quadword memory locations via the address
 4925   // supplied in base using post-increment addressing
 4926   template<int N>
 4927   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4928     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4929     for (int i = 0; i < N; i += 2) {
 4930       __ stpq(v[i], v[i+1], __ post(base, 32));
 4931     }
 4932   }
 4933 
 4934   // load N/2 pairs of quadword values from memory de-interleaved into
 4935   // N vector registers 2 at a time via the address supplied in base
 4936   // using post-increment addressing.
 4937   template<int N>
 4938   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4939     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4940     for (int i = 0; i < N; i += 2) {
 4941       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4942     }
 4943   }
 4944 
 4945   // store N vector registers interleaved into N/2 pairs of quadword
 4946   // memory locations via the address supplied in base using
 4947   // post-increment addressing.
 4948   template<int N>
 4949   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4950     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 4951     for (int i = 0; i < N; i += 2) {
 4952       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4953     }
 4954   }
 4955 
 4956   // load N quadword values from memory de-interleaved into N vector
 4957   // registers 3 elements at a time via the address supplied in base.
 4958   template<int N>
 4959   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4960     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4961     for (int i = 0; i < N; i += 3) {
 4962       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4963     }
 4964   }
 4965 
 4966   // load N quadword values from memory de-interleaved into N vector
 4967   // registers 3 elements at a time via the address supplied in base
 4968   // using post-increment addressing.
 4969   template<int N>
 4970   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4971     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4972     for (int i = 0; i < N; i += 3) {
 4973       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4974     }
 4975   }
 4976 
 4977   // load N/2 pairs of quadword values from memory into N vector
 4978   // registers via the address supplied in base with each pair indexed
 4979   // using the start offset plus the corresponding entry in the
 4980   // offsets array
 4981   template<int N>
 4982   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4983     for (int i = 0; i < N/2; i++) {
 4984       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4985     }
 4986   }
 4987 
 4988   // store N vector registers into N/2 pairs of quadword memory
 4989   // locations via the address supplied in base with each pair indexed
 4990   // using the start offset plus the corresponding entry in the
 4991   // offsets array
 4992   template<int N>
 4993   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 4994     for (int i = 0; i < N/2; i++) {
 4995       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4996     }
 4997   }
 4998 
 4999   // load N single quadword values from memory into N vector registers
 5000   // via the address supplied in base with each value indexed using
 5001   // the start offset plus the corresponding entry in the offsets
 5002   // array
 5003   template<int N>
 5004   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5005                       int start, int (&offsets)[N]) {
 5006     for (int i = 0; i < N; i++) {
 5007       __ ldr(v[i], T, Address(base, start + offsets[i]));
 5008     }
 5009   }
 5010 
 5011   // store N vector registers into N single quadword memory locations
 5012   // via the address supplied in base with each value indexed using
 5013   // the start offset plus the corresponding entry in the offsets
 5014   // array
 5015   template<int N>
 5016   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5017                       int start, int (&offsets)[N]) {
 5018     for (int i = 0; i < N; i++) {
 5019       __ str(v[i], T, Address(base, start + offsets[i]));
 5020     }
 5021   }
 5022 
 5023   // load N/2 pairs of quadword values from memory de-interleaved into
 5024   // N vector registers 2 at a time via the address supplied in base
 5025   // with each pair indexed using the start offset plus the
 5026   // corresponding entry in the offsets array
 5027   template<int N>
 5028   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5029                       Register tmp, int start, int (&offsets)[N/2]) {
 5030     for (int i = 0; i < N/2; i++) {
 5031       __ add(tmp, base, start + offsets[i]);
 5032       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5033     }
 5034   }
 5035 
 5036   // store N vector registers 2 at a time interleaved into N/2 pairs
 5037   // of quadword memory locations via the address supplied in base
 5038   // with each pair indexed using the start offset plus the
 5039   // corresponding entry in the offsets array
 5040   template<int N>
 5041   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5042                       Register tmp, int start, int (&offsets)[N/2]) {
 5043     for (int i = 0; i < N/2; i++) {
 5044       __ add(tmp, base, start + offsets[i]);
 5045       __ st2(v[2*i], v[2*i+1], T, tmp);
 5046     }
 5047   }
 5048 
 5049   // Helper routines for various flavours of Montgomery multiply
 5050 
 5051   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 5052   // multiplications in parallel
 5053   //
 5054 
 5055   // See the montMul() method of the sun.security.provider.ML_DSA
 5056   // class.
 5057   //
 5058   // Computes 4x4S results or 4x8H results
 5059   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5060   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5061   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5062   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5063   // Outputs: va - 4x4S or 4x8H vector register sequences
 5064   // vb, vc, vtmp and vq must all be disjoint
 5065   // va must be disjoint from all other inputs/temps or must equal vc
 5066   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5067   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
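        //
        // In outline, each lane computes (with word size w = 16 or 32,
        // R = 2^w and q_inv = q^-1 mod R):
        //   m = lo(b * c) * q_inv  (mod R)
        //   a = hi(b * c) - hi(m * q)  ==  (b * c - m * q) / R
        // The low halves cancel because m * q == b * c (mod R), so the
        // division by R is exact; sqdmulh supplies the doubled signed high
        // halves and shsubv performs the subtraction and the final halving.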
 5068   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5069                    Assembler::SIMD_Arrangement T,
 5070                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5071     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5072     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5073     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5074     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5075 
 5076     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5077     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5078 
 5079     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5080 
 5081     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5082     assert(vs_disjoint(va, vb), "va and vb overlap");
 5083     assert(vs_disjoint(va, vq), "va and vq overlap");
 5084     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5085     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5086 
 5087     // schedule 4 streams of instructions across the vector sequences
 5088     for (int i = 0; i < 4; i++) {
 5089       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5090       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5091     }
 5092 
 5093     for (int i = 0; i < 4; i++) {
 5094       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5095     }
 5096 
 5097     for (int i = 0; i < 4; i++) {
 5098       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5099     }
 5100 
 5101     for (int i = 0; i < 4; i++) {
 5102       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5103     }
 5104   }
 5105 
 5106   // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
 5107   // multiplications in parallel
 5108   //
 5109 
 5110   // See the montMul() method of the sun.security.provider.ML_DSA
 5111   // class.
 5112   //
 5113   // Computes 2x4S results or 2x8H results
 5114   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5115   // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
 5116   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5117   // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
 5118   // Outputs: va - 2x4S or 2x8H vector register sequences
 5119   // vb, vc, vtmp and vq must all be disjoint
 5120   // va must be disjoint from all other inputs/temps or must equal vc
 5121   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5122   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5123   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5124                    Assembler::SIMD_Arrangement T,
 5125                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5126     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5127     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5128     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5129     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5130 
 5131     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5132     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5133 
 5134     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5135 
 5136     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5137     assert(vs_disjoint(va, vb), "va and vb overlap");
 5138     assert(vs_disjoint(va, vq), "va and vq overlap");
 5139     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5140     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5141 
 5142     // schedule 2 streams of instructions across the vector sequences
 5143     for (int i = 0; i < 2; i++) {
 5144       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5145       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5146     }
 5147 
 5148     for (int i = 0; i < 2; i++) {
 5149       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5150     }
 5151 
 5152     for (int i = 0; i < 2; i++) {
 5153       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5154     }
 5155 
 5156     for (int i = 0; i < 2; i++) {
 5157       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5158     }
 5159   }
 5160 
 5161   // Perform 16 16-bit Montgomery multiplications in parallel.
 5162   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5163                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5164     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5165     // It will assert that the register use is valid
 5166     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5167   }
 5168 
 5169   // Perform 32 16-bit Montgomery multiplications in parallel.
 5170   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5171                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5172     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5173     // It will assert that the register use is valid
 5174     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5175   }
 5176 
 5177   // Perform 64 16-bit Montgomery multiplications in parallel.
 5178   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5179                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5180     // Schedule two successive 4x8H multiplies via the montmul helper
 5181     // on the front and back halves of va, vb and vc. The helper will
 5182     // assert that the register use has no overlap conflicts on each
 5183     // individual call but we also need to ensure that the necessary
 5184     // disjoint/equality constraints are met across both calls.
 5185 
 5186     // vb, vc, vtmp and vq must be disjoint. va must either be
 5187     // disjoint from all other registers or equal vc
 5188 
 5189     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5190     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5191     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5192 
 5193     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5194     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5195 
 5196     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5197 
 5198     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5199     assert(vs_disjoint(va, vb), "va and vb overlap");
 5200     assert(vs_disjoint(va, vq), "va and vq overlap");
 5201     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5202 
 5203     // we multiply the front and back halves of each sequence 4 at a
 5204     // time because
 5205     //
 5206     // 1) we are currently only able to get 4-way instruction
 5207     // parallelism at best
 5208     //
 5209     // 2) we need registers for the constants in vq and temporary
 5210     // scratch registers to hold intermediate results so vtmp can only
 5211     // be a VSeq<4> which means we only have 4 scratch slots
 5212 
 5213     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5214     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5215   }
 5216 
 5217   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5218                                const VSeq<4>& vc,
 5219                                const VSeq<4>& vtmp,
 5220                                const VSeq<2>& vq) {
 5221     // compute a = montmul(a1, c)
 5222     kyber_montmul32(vc, va1, vc, vtmp, vq);
 5223     // output a1 = a0 - a
 5224     vs_subv(va1, __ T8H, va0, vc);
 5225     //    and a0 = a0 + a
 5226     vs_addv(va0, __ T8H, va0, vc);
 5227   }
 5228 
 5229   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5230                                const VSeq<4>& vb,
 5231                                const VSeq<4>& vtmp1,
 5232                                const VSeq<4>& vtmp2,
 5233                                const VSeq<2>& vq) {
 5234     // compute c = a0 - a1
 5235     vs_subv(vtmp1, __ T8H, va0, va1);
 5236     // output a0 = a0 + a1
 5237     vs_addv(va0, __ T8H, va0, va1);
 5238     // output a1 = b montmul c
 5239     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5240   }
 5241 
 5242   void load64shorts(const VSeq<8>& v, Register shorts) {
 5243     vs_ldpq_post(v, shorts);
 5244   }
 5245 
 5246   void load32shorts(const VSeq<4>& v, Register shorts) {
 5247     vs_ldpq_post(v, shorts);
 5248   }
 5249 
 5250   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5251     vs_stpq_post(v, tmpAddr);
 5252   }
 5253 
 5254   // Kyber NTT function.
 5255   // Implements
 5256   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5257   //
 5258   // coeffs (short[256]) = c_rarg0
 5259   // ntt_zetas (short[256]) = c_rarg1
 5260   address generate_kyberNtt() {
 5261 
 5262     __ align(CodeEntryAlignment);
 5263     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5264     StubCodeMark mark(this, stub_id);
 5265     address start = __ pc();
 5266     __ enter();
 5267 
 5268     const Register coeffs = c_rarg0;
 5269     const Register zetas = c_rarg1;
 5270 
 5271     const Register kyberConsts = r10;
 5272     const Register tmpAddr = r11;
 5273 
 5274     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5275     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5276     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5277 
 5278     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5279     // load the montmul constants
 5280     vs_ldpq(vq, kyberConsts);
 5281 
 5282     // Each level corresponds to an iteration of the outermost loop of the
 5283     // Java method seilerNTT(int[] coeffs). There are some differences
 5284     // from what is done in the seilerNTT() method, though:
 5285     // 1. The computation uses 16-bit signed values; we do not convert them
 5286     // to ints here.
 5287     // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
 5288     // this array for each level, which makes it easier to fill up the vector
 5289     // registers.
 5290     // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
 5291     // multiplications (this is because that way there should not be any
 5292     // overflow during the inverse NTT computation); here we use R = 2^16 so
 5293     // that we can use the 16-bit arithmetic in the vector unit.
 5294     //
 5295     // On each level, we fill up the vector registers in such a way that the
 5296     // array elements that need to be multiplied by the zetas go into one
 5297     // set of vector registers while the corresponding ones that don't need to
 5298     // be multiplied, go into another set.
 5299     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5300     // registers interleaving the steps of 4 identical computations,
 5301     // each done on 8 16-bit values per register.
 5302 
 5303     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5304     // to the zetas occur in discrete blocks whose size is some multiple
 5305     // of 32.
 5306 
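          // In outline, each level below applies the usual Cooley-Tukey
          // NTT butterfly to 64 coefficient pairs at a time:
          //   t               = montmul(zeta, coeffs[j + len]);
          //   coeffs[j + len] = coeffs[j] - t;
          //   coeffs[j]       = coeffs[j] + t;
          // realized here by the kyber_montmul64 / vs_subv / vs_addv
          // sequences operating on whole blocks of coefficients.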
 5307     // level 0
 5308     __ add(tmpAddr, coeffs, 256);
 5309     load64shorts(vs1, tmpAddr);
 5310     load64shorts(vs2, zetas);
 5311     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5312     __ add(tmpAddr, coeffs, 0);
 5313     load64shorts(vs1, tmpAddr);
 5314     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5315     vs_addv(vs1, __ T8H, vs1, vs2);
 5316     __ add(tmpAddr, coeffs, 0);
 5317     vs_stpq_post(vs1, tmpAddr);
 5318     __ add(tmpAddr, coeffs, 256);
 5319     vs_stpq_post(vs3, tmpAddr);
 5320     // restore montmul constants
 5321     vs_ldpq(vq, kyberConsts);
 5322     load64shorts(vs1, tmpAddr);
 5323     load64shorts(vs2, zetas);
 5324     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5325     __ add(tmpAddr, coeffs, 128);
 5326     load64shorts(vs1, tmpAddr);
 5327     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5328     vs_addv(vs1, __ T8H, vs1, vs2);
 5329     __ add(tmpAddr, coeffs, 128);
 5330     store64shorts(vs1, tmpAddr);
 5331     __ add(tmpAddr, coeffs, 384);
 5332     store64shorts(vs3, tmpAddr);
 5333 
 5334     // level 1
 5335     // restore montmul constants
 5336     vs_ldpq(vq, kyberConsts);
 5337     __ add(tmpAddr, coeffs, 128);
 5338     load64shorts(vs1, tmpAddr);
 5339     load64shorts(vs2, zetas);
 5340     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5341     __ add(tmpAddr, coeffs, 0);
 5342     load64shorts(vs1, tmpAddr);
 5343     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5344     vs_addv(vs1, __ T8H, vs1, vs2);
 5345     __ add(tmpAddr, coeffs, 0);
 5346     store64shorts(vs1, tmpAddr);
 5347     store64shorts(vs3, tmpAddr);
 5348     vs_ldpq(vq, kyberConsts);
 5349     __ add(tmpAddr, coeffs, 384);
 5350     load64shorts(vs1, tmpAddr);
 5351     load64shorts(vs2, zetas);
 5352     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5353     __ add(tmpAddr, coeffs, 256);
 5354     load64shorts(vs1, tmpAddr);
 5355     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5356     vs_addv(vs1, __ T8H, vs1, vs2);
 5357     __ add(tmpAddr, coeffs, 256);
 5358     store64shorts(vs1, tmpAddr);
 5359     store64shorts(vs3, tmpAddr);
 5360 
 5361     // level 2
 5362     vs_ldpq(vq, kyberConsts);
 5363     int offsets1[4] = { 0, 32, 128, 160 };
 5364     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5365     load64shorts(vs2, zetas);
 5366     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5367     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5369     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5370     vs_addv(vs1, __ T8H, vs1, vs2);
 5371     __ add(tmpAddr, coeffs, 0);
 5372     vs_stpq_post(vs_front(vs1), tmpAddr);
 5373     vs_stpq_post(vs_front(vs3), tmpAddr);
 5374     vs_stpq_post(vs_back(vs1), tmpAddr);
 5375     vs_stpq_post(vs_back(vs3), tmpAddr);
 5376     vs_ldpq(vq, kyberConsts);
 5377     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5378     load64shorts(vs2, zetas);
 5379     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5380     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5382     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5383     vs_addv(vs1, __ T8H, vs1, vs2);
 5384     __ add(tmpAddr, coeffs, 256);
 5385     vs_stpq_post(vs_front(vs1), tmpAddr);
 5386     vs_stpq_post(vs_front(vs3), tmpAddr);
 5387     vs_stpq_post(vs_back(vs1), tmpAddr);
 5388     vs_stpq_post(vs_back(vs3), tmpAddr);
 5389 
 5390     // level 3
 5391     vs_ldpq(vq, kyberConsts);
 5392     int offsets2[4] = { 0, 64, 128, 192 };
 5393     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5394     load64shorts(vs2, zetas);
 5395     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5396     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5397     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5398     vs_addv(vs1, __ T8H, vs1, vs2);
 5399     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5400     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5401 
 5402     vs_ldpq(vq, kyberConsts);
 5403     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5404     load64shorts(vs2, zetas);
 5405     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5406     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5407     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5408     vs_addv(vs1, __ T8H, vs1, vs2);
 5409     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5410     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5411 
 5412     // level 4
 5413     // At level 4 coefficients occur in 8 discrete blocks of size 16
  5414     // so they are loaded using an ldr at 8 distinct offsets.
 5415 
 5416     vs_ldpq(vq, kyberConsts);
 5417     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5418     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5419     load64shorts(vs2, zetas);
 5420     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5421     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5422     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5423     vs_addv(vs1, __ T8H, vs1, vs2);
 5424     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5425     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5426 
 5427     vs_ldpq(vq, kyberConsts);
 5428     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5429     load64shorts(vs2, zetas);
 5430     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5431     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5432     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5433     vs_addv(vs1, __ T8H, vs1, vs2);
 5434     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5435     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5436 
 5437     // level 5
  5438     // At level 5 related coefficients occur in discrete blocks of size 8, so
  5439     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
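           // (An ld2 with arrangement 2D de-interleaves alternate 8-byte,
           // i.e. 4-coefficient, blocks across a register pair; this is what
           // separates the coefficients that get montmul'd by a zeta from
           // their add/sub partners, and the matching st2 re-interleaves the
           // results on the way back to memory.)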
 5440 
 5441     vs_ldpq(vq, kyberConsts);
 5442     int offsets4[4] = { 0, 32, 64, 96 };
 5443     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5444     load32shorts(vs_front(vs2), zetas);
 5445     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5446     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5447     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5448     load32shorts(vs_front(vs2), zetas);
 5449     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5450     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5451     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5452     load32shorts(vs_front(vs2), zetas);
 5453     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5454     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5455 
 5456     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5457     load32shorts(vs_front(vs2), zetas);
 5458     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5459     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5460 
 5461     // level 6
  5462     // At level 6 related coefficients occur in discrete blocks of size 4, so
  5463     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5464 
 5465     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5466     load32shorts(vs_front(vs2), zetas);
 5467     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5468     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5469     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5471     load32shorts(vs_front(vs2), zetas);
 5472     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5473     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5474 
 5475     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5476     load32shorts(vs_front(vs2), zetas);
 5477     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5478     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5479 
 5480     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5481     load32shorts(vs_front(vs2), zetas);
 5482     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5483     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5484 
 5485     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5486     __ mov(r0, zr); // return 0
 5487     __ ret(lr);
 5488 
 5489     return start;
 5490   }
 5491 
 5492   // Kyber Inverse NTT function
 5493   // Implements
 5494   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5495   //
 5496   // coeffs (short[256]) = c_rarg0
 5497   // ntt_zetas (short[256]) = c_rarg1
 5498   address generate_kyberInverseNtt() {
 5499 
 5500     __ align(CodeEntryAlignment);
 5501     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 5502     StubCodeMark mark(this, stub_id);
 5503     address start = __ pc();
 5504     __ enter();
 5505 
 5506     const Register coeffs = c_rarg0;
 5507     const Register zetas = c_rarg1;
 5508 
 5509     const Register kyberConsts = r10;
 5510     const Register tmpAddr = r11;
 5511     const Register tmpAddr2 = c_rarg2;
 5512 
 5513     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5514     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5515     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5516 
 5517     __ lea(kyberConsts,
 5518              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5519 
 5520     // level 0
  5521     // At level 0 related coefficients occur in discrete blocks of size 4, so
  5522     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5523 
 5524     vs_ldpq(vq, kyberConsts);
 5525     int offsets4[4] = { 0, 32, 64, 96 };
 5526     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5527     load32shorts(vs_front(vs2), zetas);
 5528     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5529                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5530     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5531     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5532     load32shorts(vs_front(vs2), zetas);
 5533     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5534                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5535     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5536     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5537     load32shorts(vs_front(vs2), zetas);
 5538     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5539                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5540     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5541     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5542     load32shorts(vs_front(vs2), zetas);
 5543     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5544                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5545     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5546 
 5547     // level 1
  5548     // At level 1 related coefficients occur in discrete blocks of size 8, so
  5549     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5550 
 5551     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5552     load32shorts(vs_front(vs2), zetas);
 5553     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5554                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5555     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5556     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5557     load32shorts(vs_front(vs2), zetas);
 5558     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5559                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5560     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5561 
 5562     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5563     load32shorts(vs_front(vs2), zetas);
 5564     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5565                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5566     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5567     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5568     load32shorts(vs_front(vs2), zetas);
 5569     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5570                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5571     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5572 
 5573     // level 2
 5574     // At level 2 coefficients occur in 8 discrete blocks of size 16
  5575     // so they are loaded using an ldr at 8 distinct offsets.
 5576 
 5577     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5578     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5579     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5580     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5581     vs_subv(vs1, __ T8H, vs1, vs2);
 5582     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5583     load64shorts(vs2, zetas);
 5584     vs_ldpq(vq, kyberConsts);
 5585     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5586     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5587 
 5588     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5589     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5590     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5591     vs_subv(vs1, __ T8H, vs1, vs2);
 5592     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5593     load64shorts(vs2, zetas);
 5594     vs_ldpq(vq, kyberConsts);
 5595     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5596     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5597 
 5598     // Barrett reduction at indexes where overflow may happen
 5599 
 5600     // load q and the multiplier for the Barrett reduction
 5601     __ add(tmpAddr, kyberConsts, 16);
 5602     vs_ldpq(vq, tmpAddr);
 5603 
 5604     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5605     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5606     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
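           // The sqdmulh/sshr/mls sequence below implements a Barrett
           // reduction on each 16-bit lane x (same scheme as in
           // generate_kyberBarrettReduce):
           //   t = (x * kyberBarrettMultiplier) >> 26   -- sqdmulh + sshr #11
           //   x = x - t * q                            -- mls
           // leaving x congruent to the input mod q but small in magnitude.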
 5607     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5608     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5609     vs_sshr(vs2, __ T8H, vs2, 11);
 5610     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5611     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5612     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5613     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5614     vs_sshr(vs2, __ T8H, vs2, 11);
 5615     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5616     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5617 
 5618     // level 3
 5619     // From level 3 upwards coefficients occur in discrete blocks whose size is
 5620     // some multiple of 32 so can be loaded using ldpq and suitable indexes.
 5621 
 5622     int offsets2[4] = { 0, 64, 128, 192 };
 5623     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5624     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5625     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5626     vs_subv(vs1, __ T8H, vs1, vs2);
 5627     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5628     load64shorts(vs2, zetas);
 5629     vs_ldpq(vq, kyberConsts);
 5630     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5631     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5632 
 5633     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5634     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5635     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5636     vs_subv(vs1, __ T8H, vs1, vs2);
 5637     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5638     load64shorts(vs2, zetas);
 5639     vs_ldpq(vq, kyberConsts);
 5640     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5641     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5642 
 5643     // level 4
 5644 
 5645     int offsets1[4] = { 0, 32, 128, 160 };
 5646     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5647     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5648     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5649     vs_subv(vs1, __ T8H, vs1, vs2);
 5650     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5651     load64shorts(vs2, zetas);
 5652     vs_ldpq(vq, kyberConsts);
 5653     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5654     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5655 
 5656     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5657     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5658     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5659     vs_subv(vs1, __ T8H, vs1, vs2);
 5660     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5661     load64shorts(vs2, zetas);
 5662     vs_ldpq(vq, kyberConsts);
 5663     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5664     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5665 
 5666     // level 5
 5667 
 5668     __ add(tmpAddr, coeffs, 0);
 5669     load64shorts(vs1, tmpAddr);
 5670     __ add(tmpAddr, coeffs, 128);
 5671     load64shorts(vs2, tmpAddr);
 5672     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5673     vs_subv(vs1, __ T8H, vs1, vs2);
 5674     __ add(tmpAddr, coeffs, 0);
 5675     store64shorts(vs3, tmpAddr);
 5676     load64shorts(vs2, zetas);
 5677     vs_ldpq(vq, kyberConsts);
 5678     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5679     __ add(tmpAddr, coeffs, 128);
 5680     store64shorts(vs2, tmpAddr);
 5681 
 5682     load64shorts(vs1, tmpAddr);
 5683     __ add(tmpAddr, coeffs, 384);
 5684     load64shorts(vs2, tmpAddr);
 5685     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5686     vs_subv(vs1, __ T8H, vs1, vs2);
 5687     __ add(tmpAddr, coeffs, 256);
 5688     store64shorts(vs3, tmpAddr);
 5689     load64shorts(vs2, zetas);
 5690     vs_ldpq(vq, kyberConsts);
 5691     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5692     __ add(tmpAddr, coeffs, 384);
 5693     store64shorts(vs2, tmpAddr);
 5694 
 5695     // Barrett reduction at indexes where overflow may happen
 5696 
 5697     // load q and the multiplier for the Barrett reduction
 5698     __ add(tmpAddr, kyberConsts, 16);
 5699     vs_ldpq(vq, tmpAddr);
 5700 
 5701     int offsets0[2] = { 0, 256 };
 5702     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5703     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5704     vs_sshr(vs2, __ T8H, vs2, 11);
 5705     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5706     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5707 
 5708     // level 6
 5709 
 5710     __ add(tmpAddr, coeffs, 0);
 5711     load64shorts(vs1, tmpAddr);
 5712     __ add(tmpAddr, coeffs, 256);
 5713     load64shorts(vs2, tmpAddr);
 5714     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5715     vs_subv(vs1, __ T8H, vs1, vs2);
 5716     __ add(tmpAddr, coeffs, 0);
 5717     store64shorts(vs3, tmpAddr);
 5718     load64shorts(vs2, zetas);
 5719     vs_ldpq(vq, kyberConsts);
 5720     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5721     __ add(tmpAddr, coeffs, 256);
 5722     store64shorts(vs2, tmpAddr);
 5723 
 5724     __ add(tmpAddr, coeffs, 128);
 5725     load64shorts(vs1, tmpAddr);
 5726     __ add(tmpAddr, coeffs, 384);
 5727     load64shorts(vs2, tmpAddr);
 5728     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5729     vs_subv(vs1, __ T8H, vs1, vs2);
 5730     __ add(tmpAddr, coeffs, 128);
 5731     store64shorts(vs3, tmpAddr);
 5732     load64shorts(vs2, zetas);
 5733     vs_ldpq(vq, kyberConsts);
 5734     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5735     __ add(tmpAddr, coeffs, 384);
 5736     store64shorts(vs2, tmpAddr);
 5737 
 5738     // multiply by 2^-n
 5739 
 5740     // load toMont(2^-n mod q)
 5741     __ add(tmpAddr, kyberConsts, 48);
 5742     __ ldr(v29, __ Q, tmpAddr);
 5743 
 5744     vs_ldpq(vq, kyberConsts);
 5745     __ add(tmpAddr, coeffs, 0);
 5746     load64shorts(vs1, tmpAddr);
 5747     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5748     __ add(tmpAddr, coeffs, 0);
 5749     store64shorts(vs2, tmpAddr);
 5750 
  5751     // now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
 5752     load64shorts(vs1, tmpAddr);
 5753     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5754     __ add(tmpAddr, coeffs, 128);
 5755     store64shorts(vs2, tmpAddr);
 5756 
 5757     // now tmpAddr contains coeffs + 256
 5758     load64shorts(vs1, tmpAddr);
 5759     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5760     __ add(tmpAddr, coeffs, 256);
 5761     store64shorts(vs2, tmpAddr);
 5762 
 5763     // now tmpAddr contains coeffs + 384
 5764     load64shorts(vs1, tmpAddr);
 5765     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5766     __ add(tmpAddr, coeffs, 384);
 5767     store64shorts(vs2, tmpAddr);
 5768 
 5769     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5770     __ mov(r0, zr); // return 0
 5771     __ ret(lr);
 5772 
 5773     return start;
 5774   }
 5775 
 5776   // Kyber multiply polynomials in the NTT domain.
 5777   // Implements
 5778   // static int implKyberNttMult(
 5779   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5780   //
 5781   // result (short[256]) = c_rarg0
 5782   // ntta (short[256]) = c_rarg1
 5783   // nttb (short[256]) = c_rarg2
 5784   // zetas (short[128]) = c_rarg3
 5785   address generate_kyberNttMult() {
 5786 
 5787     __ align(CodeEntryAlignment);
 5788     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 5789     StubCodeMark mark(this, stub_id);
 5790     address start = __ pc();
 5791     __ enter();
 5792 
 5793     const Register result = c_rarg0;
 5794     const Register ntta = c_rarg1;
 5795     const Register nttb = c_rarg2;
 5796     const Register zetas = c_rarg3;
 5797 
 5798     const Register kyberConsts = r10;
 5799     const Register limit = r11;
 5800 
 5801     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 5802     VSeq<4> vs3(16), vs4(20);
 5803     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5804     VSeq<2> vz(28);          // pair of zetas
 5805     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5806 
 5807     __ lea(kyberConsts,
 5808              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5809 
 5810     Label kyberNttMult_loop;
 5811 
 5812     __ add(limit, result, 512);
 5813 
 5814     // load q and qinv
 5815     vs_ldpq(vq, kyberConsts);
 5816 
 5817     // load R^2 mod q (to convert back from Montgomery representation)
 5818     __ add(kyberConsts, kyberConsts, 64);
 5819     __ ldr(v27, __ Q, kyberConsts);
 5820 
 5821     __ BIND(kyberNttMult_loop);
 5822 
 5823     // load 16 zetas
 5824     vs_ldpq_post(vz, zetas);
 5825 
 5826     // load 2 sets of 32 coefficients from the two input arrays
  5827     // interleaved as shorts, i.e. pairs of shorts adjacent in memory
 5828     // are striped across pairs of vector registers
 5829     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5830     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5831     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5832     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
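           // The steps below implement the Kyber base multiplication in
           // Z_q[X]/(X^2 - zeta) on 16 coefficient pairs at a time, i.e.
           //   r0 = a0*b0 + a1*b1*zeta
           //   r1 = a0*b1 + a1*b0
           // with each product computed as a montmul and a final montmul by
           // montRSquareModQ (loaded above) to convert the results back from
           // Montgomery representation.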
 5833 
 5834     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5835     // i.e. montmul the first and second halves of vs1 in order and
 5836     // then with one sequence reversed storing the two results in vs3
 5837     //
 5838     // vs3[0] <- montmul(a0, b0)
 5839     // vs3[1] <- montmul(a1, b1)
 5840     // vs3[2] <- montmul(a0, b1)
 5841     // vs3[3] <- montmul(a1, b0)
 5842     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5843     kyber_montmul16(vs_back(vs3),
 5844                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5845 
 5846     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5847     // i.e. montmul the first and second halves of vs4 in order and
 5848     // then with one sequence reversed storing the two results in vs1
 5849     //
 5850     // vs1[0] <- montmul(a2, b2)
 5851     // vs1[1] <- montmul(a3, b3)
 5852     // vs1[2] <- montmul(a2, b3)
 5853     // vs1[3] <- montmul(a3, b2)
 5854     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5855     kyber_montmul16(vs_back(vs1),
 5856                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5857 
  5858     // montmul the second result of each cross-product, i.e. (a1*b1, a3*b3), by a zeta.
 5859     // We can schedule two montmuls at a time if we use a suitable vector
 5860     // sequence <vs3[1], vs1[1]>.
 5861     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5862     VSeq<2> vs5(vs3[1], delta);
 5863 
 5864     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5865     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5866     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5867 
 5868     // add results in pairs storing in vs3
 5869     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5870     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5871     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5872 
 5873     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5874     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5875     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 5876 
 5877     // vs1 <- montmul(vs3, montRSquareModQ)
 5878     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5879 
 5880     // store back the two pairs of result vectors de-interleaved as 8H elements
  5881     // i.e. each pair of shorts striped across a register pair is stored
  5882     // adjacent in memory
 5883     vs_st2_post(vs1, __ T8H, result);
 5884 
 5885     __ cmp(result, limit);
 5886     __ br(Assembler::NE, kyberNttMult_loop);
 5887 
 5888     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5889     __ mov(r0, zr); // return 0
 5890     __ ret(lr);
 5891 
 5892     return start;
 5893   }
 5894 
 5895   // Kyber add 2 polynomials.
 5896   // Implements
 5897   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5898   //
 5899   // result (short[256]) = c_rarg0
 5900   // a (short[256]) = c_rarg1
 5901   // b (short[256]) = c_rarg2
 5902   address generate_kyberAddPoly_2() {
 5903 
 5904     __ align(CodeEntryAlignment);
 5905     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 5906     StubCodeMark mark(this, stub_id);
 5907     address start = __ pc();
 5908     __ enter();
 5909 
 5910     const Register result = c_rarg0;
 5911     const Register a = c_rarg1;
 5912     const Register b = c_rarg2;
 5913 
 5914     const Register kyberConsts = r11;
 5915 
 5916     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5917     // So, we can load, add and store the data in 3 groups of 11,
 5918     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5919     // registers. A further constraint is that the mapping needs
 5920     // to skip callee saves. So, we allocate the register
 5921     // sequences using two 8 sequences, two 2 sequences and two
 5922     // single registers.
 5923     VSeq<8> vs1_1(0);
 5924     VSeq<2> vs1_2(16);
 5925     FloatRegister vs1_3 = v28;
 5926     VSeq<8> vs2_1(18);
 5927     VSeq<2> vs2_2(26);
 5928     FloatRegister vs2_3 = v29;
 5929 
 5930     // two constant vector sequences
 5931     VSeq<8> vc_1(31, 0);
 5932     VSeq<2> vc_2(31, 0);
 5933 
 5934     FloatRegister vc_3 = v31;
 5935     __ lea(kyberConsts,
 5936              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5937 
 5938     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
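           // Each iteration of the loop below computes, lane by lane,
           //   result[i] = a[i] + b[i] + q
           // (q is broadcast via vc_1/vc_2/vc_3) for the next 80 or 88 shorts.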
 5939     for (int i = 0; i < 3; i++) {
 5940       // load 80 or 88 values from a into vs1_1/2/3
 5941       vs_ldpq_post(vs1_1, a);
 5942       vs_ldpq_post(vs1_2, a);
 5943       if (i < 2) {
 5944         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5945       }
 5946       // load 80 or 88 values from b into vs2_1/2/3
 5947       vs_ldpq_post(vs2_1, b);
 5948       vs_ldpq_post(vs2_2, b);
 5949       if (i < 2) {
 5950         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5951       }
 5952       // sum 80 or 88 values across vs1 and vs2 into vs1
 5953       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5954       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5955       if (i < 2) {
 5956         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5957       }
 5958       // add constant to all 80 or 88 results
 5959       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5960       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5961       if (i < 2) {
 5962         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5963       }
 5964       // store 80 or 88 values
 5965       vs_stpq_post(vs1_1, result);
 5966       vs_stpq_post(vs1_2, result);
 5967       if (i < 2) {
 5968         __ str(vs1_3, __ Q, __ post(result, 16));
 5969       }
 5970     }
 5971 
 5972     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5973     __ mov(r0, zr); // return 0
 5974     __ ret(lr);
 5975 
 5976     return start;
 5977   }
 5978 
 5979   // Kyber add 3 polynomials.
 5980   // Implements
 5981   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 5982   //
 5983   // result (short[256]) = c_rarg0
 5984   // a (short[256]) = c_rarg1
 5985   // b (short[256]) = c_rarg2
 5986   // c (short[256]) = c_rarg3
 5987   address generate_kyberAddPoly_3() {
 5988 
 5989     __ align(CodeEntryAlignment);
 5990     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 5991     StubCodeMark mark(this, stub_id);
 5992     address start = __ pc();
 5993     __ enter();
 5994 
 5995     const Register result = c_rarg0;
 5996     const Register a = c_rarg1;
 5997     const Register b = c_rarg2;
 5998     const Register c = c_rarg3;
 5999 
 6000     const Register kyberConsts = r11;
 6001 
 6002     // As above we sum 256 sets of values in total i.e. 32 x 8H
 6003     // quadwords.  So, we can load, add and store the data in 3
 6004     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6005     // of 10 or 11 registers. A further constraint is that the
 6006     // mapping needs to skip callee saves. So, we allocate the
 6007     // register sequences using two 8 sequences, two 2 sequences
 6008     // and two single registers.
 6009     VSeq<8> vs1_1(0);
 6010     VSeq<2> vs1_2(16);
 6011     FloatRegister vs1_3 = v28;
 6012     VSeq<8> vs2_1(18);
 6013     VSeq<2> vs2_2(26);
 6014     FloatRegister vs2_3 = v29;
 6015 
 6016     // two constant vector sequences
 6017     VSeq<8> vc_1(31, 0);
 6018     VSeq<2> vc_2(31, 0);
 6019 
 6020     FloatRegister vc_3 = v31;
 6021 
 6022     __ lea(kyberConsts,
 6023              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6024 
 6025     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
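           // As above, each iteration of the loop below computes
           //   result[i] = a[i] + b[i] + c[i] + q
           // for the next 80 or 88 shorts.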
 6026     for (int i = 0; i < 3; i++) {
 6027       // load 80 or 88 values from a into vs1_1/2/3
 6028       vs_ldpq_post(vs1_1, a);
 6029       vs_ldpq_post(vs1_2, a);
 6030       if (i < 2) {
 6031         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6032       }
 6033       // load 80 or 88 values from b into vs2_1/2/3
 6034       vs_ldpq_post(vs2_1, b);
 6035       vs_ldpq_post(vs2_2, b);
 6036       if (i < 2) {
 6037         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6038       }
 6039       // sum 80 or 88 values across vs1 and vs2 into vs1
 6040       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6041       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6042       if (i < 2) {
 6043         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6044       }
 6045       // load 80 or 88 values from c into vs2_1/2/3
 6046       vs_ldpq_post(vs2_1, c);
 6047       vs_ldpq_post(vs2_2, c);
 6048       if (i < 2) {
 6049         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6050       }
 6051       // sum 80 or 88 values across vs1 and vs2 into vs1
 6052       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6053       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6054       if (i < 2) {
 6055         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6056       }
 6057       // add constant to all 80 or 88 results
 6058       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6059       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6060       if (i < 2) {
 6061         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6062       }
 6063       // store 80 or 88 values
 6064       vs_stpq_post(vs1_1, result);
 6065       vs_stpq_post(vs1_2, result);
 6066       if (i < 2) {
 6067         __ str(vs1_3, __ Q, __ post(result, 16));
 6068       }
 6069     }
 6070 
 6071     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6072     __ mov(r0, zr); // return 0
 6073     __ ret(lr);
 6074 
 6075     return start;
 6076   }
 6077 
 6078   // Kyber parse XOF output to polynomial coefficient candidates
 6079   // or decodePoly(12, ...).
 6080   // Implements
 6081   // static int implKyber12To16(
 6082   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6083   //
 6084   // (parsedLength or (parsedLength - 48) must be divisible by 64.)
 6085   //
 6086   // condensed (byte[]) = c_rarg0
 6087   // condensedIndex = c_rarg1
 6088   // parsed (short[112 or 256]) = c_rarg2
 6089   // parsedLength (112 or 256) = c_rarg3
 6090   address generate_kyber12To16() {
 6091     Label L_F00, L_loop, L_end;
 6092 
 6093     __ align(CodeEntryAlignment);
 6094     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6095     StubCodeMark mark(this, stub_id);
 6096     address start = __ pc();
 6097     __ enter();
 6098 
 6099     const Register condensed = c_rarg0;
 6100     const Register condensedOffs = c_rarg1;
 6101     const Register parsed = c_rarg2;
 6102     const Register parsedLength = c_rarg3;
 6103 
 6104     const Register tmpAddr = r11;
 6105 
  6106     // Data is input 96 bytes at a time, i.e. in groups of 6 x 16B
  6107     // quadwords, so we need a 6 vector sequence for the inputs.
 6108     // Parsing produces 64 shorts, employing two 8 vector
 6109     // sequences to store and combine the intermediate data.
 6110     VSeq<6> vin(24);
 6111     VSeq<8> va(0), vb(16);
 6112 
 6113     __ adr(tmpAddr, L_F00);
 6114     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6115     __ add(condensed, condensed, condensedOffs);
 6116 
 6117     __ BIND(L_loop);
 6118     // load 96 (6 x 16B) byte values
 6119     vs_ld3_post(vin, __ T16B, condensed);
 6120 
 6121     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6122     // holds 48 (16x3) contiguous bytes from memory striped
 6123     // horizontally across each of the 16 byte lanes. Equivalently,
 6124     // that is 16 pairs of 12-bit integers. Likewise the back half
 6125     // holds the next 48 bytes in the same arrangement.
 6126 
 6127     // Each vector in the front half can also be viewed as a vertical
 6128     // strip across the 16 pairs of 12 bit integers. Each byte in
 6129     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6130     // byte in vin[1] stores the high 4 bits of the first int and the
 6131     // low 4 bits of the second int. Each byte in vin[2] stores the
 6132     // high 8 bits of the second int. Likewise the vectors in second
 6133     // half.
 6134 
 6135     // Converting the data to 16-bit shorts requires first of all
 6136     // expanding each of the 6 x 16B vectors into 6 corresponding
 6137     // pairs of 8H vectors. Mask, shift and add operations on the
 6138     // resulting vector pairs can be used to combine 4 and 8 bit
 6139     // parts of related 8H vector elements.
 6140     //
 6141     // The middle vectors (vin[2] and vin[5]) are actually expanded
 6142     // twice, one copy manipulated to provide the lower 4 bits
 6143     // belonging to the first short in a pair and another copy
 6144     // manipulated to provide the higher 4 bits belonging to the
  6145     // second short in a pair. This is why the vector sequences va
 6146     // and vb used to hold the expanded 8H elements are of length 8.
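           // In scalar terms, for each input byte triple (b0, b1, b2) the two
           // output shorts are
           //   s0 = b0 | ((b1 & 0xf) << 8)
           //   s1 = (b1 >> 4) | (b2 << 4)
           // and the vector code below produces 64 such pairs per iteration.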
 6147 
 6148     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6149     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6150     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6151     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6152     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6153     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6154     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6155     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6156 
 6157     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6158     // and vb[4:5]
 6159     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6160     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6161     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6162     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6163     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6164     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6165 
 6166     // shift lo byte of copy 1 of the middle stripe into the high byte
 6167     __ shl(va[2], __ T8H, va[2], 8);
 6168     __ shl(va[3], __ T8H, va[3], 8);
 6169     __ shl(vb[2], __ T8H, vb[2], 8);
 6170     __ shl(vb[3], __ T8H, vb[3], 8);
 6171 
 6172     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6173     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6174     // are in bit positions [4..11].
 6175     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6176     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6177     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6178     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6179 
 6180     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6181     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6182     // copy2
 6183     __ andr(va[2], __ T16B, va[2], v31);
 6184     __ andr(va[3], __ T16B, va[3], v31);
 6185     __ ushr(va[4], __ T8H, va[4], 4);
 6186     __ ushr(va[5], __ T8H, va[5], 4);
 6187     __ andr(vb[2], __ T16B, vb[2], v31);
 6188     __ andr(vb[3], __ T16B, vb[3], v31);
 6189     __ ushr(vb[4], __ T8H, vb[4], 4);
 6190     __ ushr(vb[5], __ T8H, vb[5], 4);
 6191 
 6192     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6193     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6194     // n.b. the ordering ensures: i) inputs are consumed before they
 6195     // are overwritten ii) the order of 16-bit results across successive
 6196     // pairs of vectors in va and then vb reflects the order of the
 6197     // corresponding 12-bit inputs
 6198     __ addv(va[0], __ T8H, va[0], va[2]);
 6199     __ addv(va[2], __ T8H, va[1], va[3]);
 6200     __ addv(va[1], __ T8H, va[4], va[6]);
 6201     __ addv(va[3], __ T8H, va[5], va[7]);
 6202     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6203     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6204     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6205     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6206 
 6207     // store 64 results interleaved as shorts
 6208     vs_st2_post(vs_front(va), __ T8H, parsed);
 6209     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6210 
 6211     __ sub(parsedLength, parsedLength, 64);
 6212     __ cmp(parsedLength, (u1)64);
 6213     __ br(Assembler::GE, L_loop);
 6214     __ cbz(parsedLength, L_end);
 6215 
  6216     // If anything is left it should be a final 72 bytes of input,
  6217     // i.e. a final 48 12-bit values. So we handle this by loading
  6218     // 48 bytes into all 16B lanes of front(vin) and only 24
  6219     // bytes into the lower 8B lanes of back(vin).
 6220     vs_ld3_post(vs_front(vin), __ T16B, condensed);
 6221     vs_ld3(vs_back(vin), __ T8B, condensed);
 6222 
 6223     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6224     // n.b. target elements 2 and 3 of va duplicate elements 4 and
 6225     // 5 and target element 2 of vb duplicates element 4.
 6226     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6227     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6228     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6229     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6230     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6231     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6232 
 6233     // This time expand just the lower 8 lanes
 6234     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6235     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6236     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6237 
 6238     // shift lo byte of copy 1 of the middle stripe into the high byte
 6239     __ shl(va[2], __ T8H, va[2], 8);
 6240     __ shl(va[3], __ T8H, va[3], 8);
 6241     __ shl(vb[2], __ T8H, vb[2], 8);
 6242 
 6243     // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
 6244     // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
 6245     // int are in bit positions [4..11].
 6246     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6247     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6248     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6249 
 6250     // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
 6251     // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
 6252     // copy2
 6253     __ andr(va[2], __ T16B, va[2], v31);
 6254     __ andr(va[3], __ T16B, va[3], v31);
 6255     __ ushr(va[4], __ T8H, va[4], 4);
 6256     __ ushr(va[5], __ T8H, va[5], 4);
 6257     __ andr(vb[2], __ T16B, vb[2], v31);
 6258     __ ushr(vb[4], __ T8H, vb[4], 4);
  6259 
  6262     // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
  6263     // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
 6265     // n.b. ordering ensures: i) inputs are consumed before they are
  6266     // overwritten ii) order of 16-bit results across successive
 6267     // pairs of vectors in va and then lower half of vb reflects order
 6268     // of corresponding 12-bit inputs
 6269     __ addv(va[0], __ T8H, va[0], va[2]);
 6270     __ addv(va[2], __ T8H, va[1], va[3]);
 6271     __ addv(va[1], __ T8H, va[4], va[6]);
 6272     __ addv(va[3], __ T8H, va[5], va[7]);
 6273     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6274     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6275 
 6276     // store 48 results interleaved as shorts
 6277     vs_st2_post(vs_front(va), __ T8H, parsed);
 6278     vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
 6279 
 6280     __ BIND(L_end);
 6281 
 6282     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6283     __ mov(r0, zr); // return 0
 6284     __ ret(lr);
 6285 
 6286     // bind label and generate constant data used by this stub
 6287     __ BIND(L_F00);
 6288     __ emit_int64(0x0f000f000f000f00);
 6289     __ emit_int64(0x0f000f000f000f00);
 6290 
 6291     return start;
 6292   }
 6293 
 6294   // Kyber Barrett reduce function.
 6295   // Implements
 6296   // static int implKyberBarrettReduce(short[] coeffs) {}
 6297   //
 6298   // coeffs (short[256]) = c_rarg0
 6299   address generate_kyberBarrettReduce() {
 6300 
 6301     __ align(CodeEntryAlignment);
 6302     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6303     StubCodeMark mark(this, stub_id);
 6304     address start = __ pc();
 6305     __ enter();
 6306 
 6307     const Register coeffs = c_rarg0;
 6308 
 6309     const Register kyberConsts = r10;
 6310     const Register result = r11;
 6311 
 6312     // As above we process 256 sets of values in total i.e. 32 x
  6313     // 8H quadwords. So, we can load, process and store the data in 3
 6314     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6315     // of 10 or 11 registers. A further constraint is that the
 6316     // mapping needs to skip callee saves. So, we allocate the
 6317     // register sequences using two 8 sequences, two 2 sequences
 6318     // and two single registers.
 6319     VSeq<8> vs1_1(0);
 6320     VSeq<2> vs1_2(16);
 6321     FloatRegister vs1_3 = v28;
 6322     VSeq<8> vs2_1(18);
 6323     VSeq<2> vs2_2(26);
 6324     FloatRegister vs2_3 = v29;
 6325 
 6326     // we also need a pair of corresponding constant sequences
 6327 
 6328     VSeq<8> vc1_1(30, 0);
 6329     VSeq<2> vc1_2(30, 0);
 6330     FloatRegister vc1_3 = v30; // for kyber_q
 6331 
 6332     VSeq<8> vc2_1(31, 0);
 6333     VSeq<2> vc2_2(31, 0);
 6334     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6335 
 6336     __ add(result, coeffs, 0);
 6337     __ lea(kyberConsts,
 6338              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6339 
 6340     // load q and the multiplier for the Barrett reduction
 6341     __ add(kyberConsts, kyberConsts, 16);
 6342     __ ldpq(vc1_3, vc2_3, kyberConsts);
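           // vc1_* now broadcast kyber_q and vc2_* the Barrett multiplier,
           // which we take to approximate 2^26 / q so that
           // (x * multiplier) >> 26 is close to x / q (see the step comments
           // in the loop below).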
 6343 
 6344     for (int i = 0; i < 3; i++) {
 6345       // load 80 or 88 coefficients
 6346       vs_ldpq_post(vs1_1, coeffs);
 6347       vs_ldpq_post(vs1_2, coeffs);
 6348       if (i < 2) {
 6349         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6350       }
 6351 
 6352       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6353       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6354       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6355       if (i < 2) {
 6356         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6357       }
 6358 
 6359       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6360       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6361       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6362       if (i < 2) {
 6363         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6364       }
 6365 
 6366       // vs1 <- vs1 - vs2 * kyber_q
 6367       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6368       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6369       if (i < 2) {
 6370         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6371       }
 6372 
 6373       vs_stpq_post(vs1_1, result);
 6374       vs_stpq_post(vs1_2, result);
 6375       if (i < 2) {
 6376         __ str(vs1_3, __ Q, __ post(result, 16));
 6377       }
 6378     }
 6379 
 6380     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6381     __ mov(r0, zr); // return 0
 6382     __ ret(lr);
 6383 
 6384     return start;
 6385   }
 6386 
 6387 
 6388   // Dilithium-specific montmul helper routines that generate parallel
 6389   // code for, respectively, a single 4x4s vector sequence montmul or
 6390   // two such multiplies in a row.
 6391 
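         //
         // The lane arrangement used by these helpers is 4S, i.e. each
         // montmul operates on 32-bit values; presumably R = 2^32 with the
         // Dilithium modulus q = 8380417. The actual constants (qInv, q) are
         // loaded by the callers from the _dilithiumConsts table.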
 6392   // Perform 16 32-bit Montgomery multiplications in parallel
 6393   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6394                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6395     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6396     // It will assert that the register use is valid
 6397     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6398   }
 6399 
 6400   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6401   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6402                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6403     // Schedule two successive 4x4S multiplies via the montmul helper
 6404     // on the front and back halves of va, vb and vc. The helper will
 6405     // assert that the register use has no overlap conflicts on each
 6406     // individual call but we also need to ensure that the necessary
 6407     // disjoint/equality constraints are met across both calls.
 6408 
 6409     // vb, vc, vtmp and vq must be disjoint. va must either be
 6410     // disjoint from all other registers or equal vc
 6411 
 6412     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6413     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6414     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6415 
 6416     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6417     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6418 
 6419     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6420 
 6421     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6422     assert(vs_disjoint(va, vb), "va and vb overlap");
 6423     assert(vs_disjoint(va, vq), "va and vq overlap");
 6424     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6425 
 6426     // We multiply the front and back halves of each sequence 4 at a
 6427     // time because
 6428     //
 6429     // 1) we are currently only able to get 4-way instruction
 6430     // parallelism at best
 6431     //
 6432     // 2) we need registers for the constants in vq and temporary
 6433     // scratch registers to hold intermediate results so vtmp can only
 6434     // be a VSeq<4> which means we only have 4 scratch slots.
 6435 
 6436     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6437     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6438   }
 6439 
 6440   // Perform combined montmul then add/sub on 4x4S vectors.
 6441   void dilithium_montmul16_sub_add(
 6442           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6443           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6444     // compute a = montmul(a1, c)
 6445     dilithium_montmul16(vc, va1, vc, vtmp, vq);
  6446     // output a1 = a0 - a
 6447     vs_subv(va1, __ T4S, va0, vc);
 6448     //    and a0 = a0 + a
 6449     vs_addv(va0, __ T4S, va0, vc);
 6450   }
 6451 
  6452   // Perform combined add/sub then montmul on 4x4S vectors.
 6453   void dilithium_sub_add_montmul16(
 6454           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6455           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6456     // compute c = a0 - a1
 6457     vs_subv(vtmp1, __ T4S, va0, va1);
 6458     // output a0 = a0 + a1
 6459     vs_addv(va0, __ T4S, va0, va1);
 6460     // output a1 = b montmul c
 6461     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6462   }
 6463 
 6464   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6465   // in the Java implementation come in sequences of at least 8, so we
 6466   // can use ldpq to collect the corresponding data into pairs of vector
 6467   // registers.
 6468   // We collect the coefficients corresponding to the 'j+l' indexes into
 6469   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6470   // then we do the (Montgomery) multiplications by the zetas in parallel
 6471   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6472   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6473   // v0-v7 and finally save the results back to the coeffs array.
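         // In Java-like terms (a sketch) each step performs the Cooley-Tukey
         // butterfly
         //   a             = montMul(zeta, coeffs[j + l]);
         //   coeffs[j + l] = coeffs[j] - a;
         //   coeffs[j]     = coeffs[j] + a;
         // on 32 lanes at a time.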
 6474   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6475     const Register coeffs, const Register zetas) {
 6476     int c1 = 0;
 6477     int c2 = 512;
 6478     int startIncr;
 6479     // don't use callee save registers v8 - v15
 6480     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6481     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6482     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6483     int offsets[4] = { 0, 32, 64, 96 };
 6484 
 6485     for (int level = 0; level < 5; level++) {
 6486       int c1Start = c1;
 6487       int c2Start = c2;
 6488       if (level == 3) {
 6489         offsets[1] = 32;
 6490         offsets[2] = 128;
 6491         offsets[3] = 160;
 6492       } else if (level == 4) {
 6493         offsets[1] = 64;
 6494         offsets[2] = 128;
 6495         offsets[3] = 192;
 6496       }
 6497 
 6498       // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
 6499       // time at 4 different offsets and multiply them in order by the
 6500       // next set of input values. So we employ indexed load and store
 6501       // pair instructions with arrangement 4S.
 6502       for (int i = 0; i < 4; i++) {
 6503         // reload q and qinv
 6504         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6505         // load 8x4S coefficients via second start pos == c2
 6506         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6507         // load next 8x4S inputs == b
 6508         vs_ldpq_post(vs2, zetas);
 6509         // compute a == c2 * b mod MONT_Q
 6510         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6511         // load 8x4s coefficients via first start pos == c1
 6512         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6513         // compute a1 =  c1 + a
 6514         vs_addv(vs3, __ T4S, vs1, vs2);
 6515         // compute a2 =  c1 - a
 6516         vs_subv(vs1, __ T4S, vs1, vs2);
 6517         // output a1 and a2
 6518         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6519         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6520 
 6521         int k = 4 * level + i;
 6522 
 6523         if (k > 7) {
 6524           startIncr = 256;
 6525         } else if (k == 5) {
 6526           startIncr = 384;
 6527         } else {
 6528           startIncr = 128;
 6529         }
 6530 
 6531         c1Start += startIncr;
 6532         c2Start += startIncr;
 6533       }
 6534 
 6535       c2 /= 2;
 6536     }
 6537   }
 6538 
 6539   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6540   // Implements the method
 6541   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
 6542   // of the Java class sun.security.provider
 6543   //
 6544   // coeffs (int[256]) = c_rarg0
 6545   // zetas (int[256]) = c_rarg1
 6546   address generate_dilithiumAlmostNtt() {
 6547 
 6548     __ align(CodeEntryAlignment);
 6549     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 6550     StubCodeMark mark(this, stub_id);
 6551     address start = __ pc();
 6552     __ enter();
 6553 
 6554     const Register coeffs = c_rarg0;
 6555     const Register zetas = c_rarg1;
 6556 
 6557     const Register tmpAddr = r9;
 6558     const Register dilithiumConsts = r10;
 6559     const Register result = r11;
 6560     // don't use callee save registers v8 - v15
 6561     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6562     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6563     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6564     int offsets[4] = { 0, 32, 64, 96};
 6565     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6566     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6567     __ add(result, coeffs, 0);
 6568     __ lea(dilithiumConsts,
 6569              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6570 
 6571     // Each level represents one iteration of the outer for loop of the Java version.
 6572 
 6573     // level 0-4
 6574     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6575 
 6576     // level 5
 6577 
 6578     // At level 5 the coefficients we need to combine with the zetas
 6579     // are grouped in memory in blocks of size 4. So, for both sets of
 6580     // coefficients we load 4 adjacent values at 8 different offsets
 6581     // using an indexed ldr with register variant Q and multiply them
 6582     // in sequence order by the next set of inputs. Likewise we store
  6583     // the results using an indexed str with register variant Q.
 6584     for (int i = 0; i < 1024; i += 256) {
 6585       // reload constants q, qinv each iteration as they get clobbered later
 6586       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6587       // load 32 (8x4S) coefficients via first offsets = c1
 6588       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6589       // load next 32 (8x4S) inputs = b
 6590       vs_ldpq_post(vs2, zetas);
  6591       // a = b montmul c1
 6592       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6593       // load 32 (8x4S) coefficients via second offsets = c2
 6594       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6595       // add/sub with result of multiply
  6596       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
  6597       vs_subv(vs1, __ T4S, vs1, vs2);     // a0 = c2 - a
 6598       // write back new coefficients using same offsets
 6599       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6600       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6601     }
 6602 
 6603     // level 6
 6604     // At level 6 the coefficients we need to combine with the zetas
 6605     // are grouped in memory in pairs, the first two being montmul
 6606     // inputs and the second add/sub inputs. We can still implement
 6607     // the montmul+sub+add using 4-way parallelism but only if we
 6608     // combine the coefficients with the zetas 16 at a time. We load 8
 6609     // adjacent values at 4 different offsets using an ld2 load with
 6610     // arrangement 2D. That interleaves the lower and upper halves of
 6611     // each pair of quadwords into successive vector registers. We
 6612     // then need to montmul the 4 even elements of the coefficients
 6613     // register sequence by the zetas in order and then add/sub the 4
 6614     // odd elements of the coefficients register sequence. We use an
 6615     // equivalent st2 operation to store the results back into memory
 6616     // de-interleaved.
 6617     for (int i = 0; i < 1024; i += 128) {
 6618       // reload constants q, qinv each iteration as they get clobbered later
 6619       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6620       // load interleaved 16 (4x2D) coefficients via offsets
 6621       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6622       // load next 16 (4x4S) inputs
 6623       vs_ldpq_post(vs_front(vs2), zetas);
 6624       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6625       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6626                                   vs_front(vs2), vtmp, vq);
 6627       // store interleaved 16 (4x2D) coefficients via offsets
 6628       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6629     }
 6630 
 6631     // level 7
 6632     // At level 7 the coefficients we need to combine with the zetas
 6633     // occur singly, with montmul inputs alternating with add/sub
 6634     // inputs. Once again we can use 4-way parallelism to combine 16
 6635     // zetas at a time. However, we have to load 8 adjacent values at
 6636     // 4 different offsets using an ld2 load with arrangement 4S. That
 6637     // interleaves the odd words of each pair into one
 6638     // coefficients vector register and the even words of the pair
 6639     // into the next register. We then need to montmul the 4 even
 6640     // elements of the coefficients register sequence by the zetas in
 6641     // order and then add/sub the 4 odd elements of the coefficients
 6642     // register sequence. We use an equivalent st2 operation to store
 6643     // the results back into memory de-interleaved.
 6644 
 6645     for (int i = 0; i < 1024; i += 128) {
 6646       // reload constants q, qinv each iteration as they get clobbered later
 6647       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6648       // load interleaved 16 (4x4S) coefficients via offsets
 6649       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6650       // load next 16 (4x4S) inputs
 6651       vs_ldpq_post(vs_front(vs2), zetas);
 6652       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6653       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6654                                   vs_front(vs2), vtmp, vq);
 6655       // store interleaved 16 (4x4S) coefficients via offsets
 6656       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6657     }
 6658     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6659     __ mov(r0, zr); // return 0
 6660     __ ret(lr);
 6661 
 6662     return start;
 6663   }
 6664 
 6665   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6666   // in the Java implementation come in sequences of at least 8, so we
 6667   // can use ldpq to collect the corresponding data into pairs of vector
 6668   // registers
 6669   // We collect the coefficients that correspond to the 'j's into vs1
 6670   // and the coefficients that correspond to the 'j+l's into vs2, then
 6671   // do the additions into vs3 and the subtractions into vs1, then
 6672   // save the result of the additions, load the zetas into vs2,
 6673   // do the (Montgomery) multiplications by zeta in parallel into vs2,
 6674   // and finally save the results back to the coeffs array.
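        //
        // As a rough scalar sketch (zeta, j and l are illustrative names,
        // not the exact Java loop variables), each iteration performs a
        // Gentleman-Sande butterfly:
        //
        //   int a = coeffs[j];                     // loaded via c1Start
        //   int b = coeffs[j + l];                 // loaded via c2Start
        //   coeffs[j]     = a + b;                 // stored via c1Start
        //   coeffs[j + l] = montMul(zeta, a - b);  // stored via c2Start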
 6675   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6676     const Register coeffs, const Register zetas) {
 6677     int c1 = 0;
 6678     int c2 = 32;
 6679     int startIncr;
 6680     int offsets[4];
 6681     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6682     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6683     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6684 
 6685     offsets[0] = 0;
 6686 
 6687     for (int level = 3; level < 8; level++) {
 6688       int c1Start = c1;
 6689       int c2Start = c2;
 6690       if (level == 3) {
 6691         offsets[1] = 64;
 6692         offsets[2] = 128;
 6693         offsets[3] = 192;
 6694       } else if (level == 4) {
 6695         offsets[1] = 32;
 6696         offsets[2] = 128;
 6697         offsets[3] = 160;
 6698       } else {
 6699         offsets[1] = 32;
 6700         offsets[2] = 64;
 6701         offsets[3] = 96;
 6702       }
 6703 
 6704       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6705       // time at 4 different offsets and multiply them in order by the
 6706       // next set of input values. So we employ indexed load and store
 6707       // pair instructions with arrangement 4S.
 6708       for (int i = 0; i < 4; i++) {
 6709         // load v1 32 (8x4S) coefficients relative to first start index
 6710         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6711         // load v2 32 (8x4S) coefficients relative to second start index
 6712         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 6713         // a0 = v1 + v2 -- n.b. clobbers vq
 6714         vs_addv(vs3, __ T4S, vs1, vs2);
 6715         // a1 = v1 - v2
 6716         vs_subv(vs1, __ T4S, vs1, vs2);
 6717         // save a0 relative to first start index
 6718         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6719         // load constants q, qinv each iteration as they get clobbered above
 6720         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6721         // load b next 32 (8x4S) inputs
 6722         vs_ldpq_post(vs2, zetas);
 6723         // a = a1 montmul b
 6724         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6725         // save a relative to second start index
 6726         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6727 
 6728         int k = 4 * level + i;
 6729 
 6730         if (k < 24) {
 6731           startIncr = 256;
 6732         } else if (k == 25) {
 6733           startIncr = 384;
 6734         } else {
 6735           startIncr = 128;
 6736         }
 6737 
 6738         c1Start += startIncr;
 6739         c2Start += startIncr;
 6740       }
 6741 
 6742       c2 *= 2;
 6743     }
 6744   }
 6745 
 6746   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6747   // Implements the method
 6748   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6749   // the sun.security.provider.ML_DSA class.
 6750   //
 6751   // coeffs (int[256]) = c_rarg0
 6752   // zetas (int[256]) = c_rarg1
 6753   address generate_dilithiumAlmostInverseNtt() {
 6754 
 6755     __ align(CodeEntryAlignment);
 6756     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 6757     StubCodeMark mark(this, stub_id);
 6758     address start = __ pc();
 6759     __ enter();
 6760 
 6761     const Register coeffs = c_rarg0;
 6762     const Register zetas = c_rarg1;
 6763 
 6764     const Register tmpAddr = r9;
 6765     const Register dilithiumConsts = r10;
 6766     const Register result = r11;
 6767     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6768     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 6769     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6770     int offsets[4] = { 0, 32, 64, 96 };
 6771     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6772     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6773 
 6774     __ add(result, coeffs, 0);
 6775     __ lea(dilithiumConsts,
 6776              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6777 
 6778     // Each level represents one iteration of the outer for loop of the Java version
 6779 
 6780     // level 0
 6781     // At level 0 we need to interleave adjacent quartets of
 6782     // coefficients before we multiply and add/sub by the next 16
 6783     // zetas just as we did for level 7 in the multiply code. So we
 6784     // load and store the values using an ld2/st2 with arrangement 4S.
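          //
          // A rough scalar model of the combined sub/add/montmul step
          // below (illustrative names only):
          //
          //   int a0 = coeffs[j];
          //   int a1 = coeffs[j + 1];
          //   coeffs[j]     = a0 + a1;
          //   coeffs[j + 1] = montMul(a0 - a1, zeta);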
 6785     for (int i = 0; i < 1024; i += 128) {
 6786       // load constants q, qinv
 6787       // n.b. this can be moved out of the loop as they do not get
 6788       // clobbered by first two loops
 6789       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6790       // a0/a1 load interleaved 32 (8x4S) coefficients
 6791       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6792       // b load next 32 (8x4S) inputs
 6793       vs_ldpq_post(vs_front(vs2), zetas);
 6794       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6795       // n.b. second half of vs2 provides temporary register storage
 6796       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6797                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6798       // a0/a1 store interleaved 32 (8x4S) coefficients
 6799       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6800     }
 6801 
 6802     // level 1
 6803     // At level 1 we need to interleave pairs of adjacent pairs of
 6804     // coefficients before we multiply by the next 16 zetas just as we
 6805     // did for level 6 in the multiply code. So we load and store the
 6806     // values using an ld2/st2 with arrangement 2D.
 6807     for (int i = 0; i < 1024; i += 128) {
 6808       // a0/a1 load interleaved 32 (8x2D) coefficients
 6809       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6810       // b load next 16 (4x4S) inputs
 6811       vs_ldpq_post(vs_front(vs2), zetas);
 6812       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6813       // n.b. second half of vs2 provides temporary register storage
 6814       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6815                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6816       // a0/a1 store interleaved 32 (8x2D) coefficients
 6817       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6818     }
 6819 
 6820     // level 2
 6821     // At level 2 coefficients come in blocks of 4. So, we load 4
 6822     // adjacent coefficients at 8 distinct offsets for both the first
 6823     // and second coefficient sequences, using an ldr with register
 6824     // variant Q then combine them with next set of 32 zetas. Likewise
 6825     // we store the results using an str with register variant Q.
 6826     for (int i = 0; i < 1024; i += 256) {
 6827       // c0 load 32 (8x4S) coefficients via first offsets
 6828       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6829       // c1 load 32 (8x4S) coefficients via second offsets
 6830       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6831       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6832       vs_addv(vs3, __ T4S, vs1, vs2);
 6833       // c = c0 - c1
 6834       vs_subv(vs1, __ T4S, vs1, vs2);
 6835       // store a0 32 (8x4S) coefficients via first offsets
 6836       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6837       // b load 32 (8x4S) next inputs
 6838       vs_ldpq_post(vs2, zetas);
 6839       // reload constants q, qinv -- they were clobbered earlier
 6840       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6841       // compute a1 = b montmul c
 6842       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6843       // store a1 32 (8x4S) coefficients via second offsets
 6844       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6845     }
 6846 
 6847     // level 3-7
 6848     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6849 
 6850     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6851     __ mov(r0, zr); // return 0
 6852     __ ret(lr);
 6853 
 6854     return start;
 6855   }
 6856 
 6857   // Dilithium multiply polynomials in the NTT domain.
 6858   // Straightforward implementation of the method
 6859   // static int implDilithiumNttMult(
 6860   //              int[] result, int[] ntta, int[] nttb) {} of
 6861   // the sun.security.provider.ML_DSA class.
 6862   //
 6863   // result (int[256]) = c_rarg0
 6864   // poly1 (int[256]) = c_rarg1
 6865   // poly2 (int[256]) = c_rarg2
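        //
        // For reference (a hedged sketch that ignores Montgomery range and
        // sign details): with montMul(a, b) == a * b * R^-1 mod q and
        // rSquare == R^2 mod q, the two chained multiplications below give
        //
        //   result[i] = montMul(rSquare, montMul(poly1[i], poly2[i]))
        //             = poly1[i] * poly2[i] mod q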
 6866   address generate_dilithiumNttMult() {
 6867 
 6868     __ align(CodeEntryAlignment);
 6869     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 6870     StubCodeMark mark(this, stub_id);
 6871     address start = __ pc();
 6872     __ enter();
 6873 
 6874     Label L_loop;
 6875 
 6876     const Register result = c_rarg0;
 6877     const Register poly1 = c_rarg1;
 6878     const Register poly2 = c_rarg2;
 6879 
 6880     const Register dilithiumConsts = r10;
 6881     const Register len = r11;
 6882 
 6883     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6884     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6885     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6886     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6887 
 6888     __ lea(dilithiumConsts,
 6889              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6890 
 6891     // load constants q, qinv
 6892     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6893     // load constant rSquare into v29
 6894     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 6895 
 6896     __ mov(len, zr);
 6897     __ add(len, len, 1024);
 6898 
 6899     __ BIND(L_loop);
 6900 
 6901     // b load 32 (8x4S) next inputs from poly1
 6902     vs_ldpq_post(vs1, poly1);
 6903     // c load 32 (8x4S) next inputs from poly2
 6904     vs_ldpq_post(vs2, poly2);
 6905     // compute a = b montmul c
 6906     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6907     // compute a = rsquare montmul a
 6908     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6909     // save a 32 (8x4S) results
 6910     vs_stpq_post(vs2, result);
 6911 
 6912     __ sub(len, len, 128);
 6913     __ cmp(len, (u1)128);
 6914     __ br(Assembler::GE, L_loop);
 6915 
 6916     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6917     __ mov(r0, zr); // return 0
 6918     __ ret(lr);
 6919 
 6920     return start;
 6921   }
 6922 
 6923   // Dilithium Montgomery multiply an array by a constant.
 6924   // A straightforward implementation of the method
 6925   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
 6926   // of the sun.security.provider.ML_DSA class
 6927   //
 6928   // coeffs (int[256]) = c_rarg0
 6929   // constant (int) = c_rarg1
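        //
        // A rough scalar model of the loop below (illustrative only):
        //
        //   for (int i = 0; i < 256; i++) {
        //     coeffs[i] = montMul(constant, coeffs[i]);
        //   }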
 6930   address generate_dilithiumMontMulByConstant() {
 6931 
 6932     __ align(CodeEntryAlignment);
 6933     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 6934     StubCodeMark mark(this, stub_id);
 6935     address start = __ pc();
 6936     __ enter();
 6937 
 6938     Label L_loop;
 6939 
 6940     const Register coeffs = c_rarg0;
 6941     const Register constant = c_rarg1;
 6942 
 6943     const Register dilithiumConsts = r10;
 6944     const Register result = r11;
 6945     const Register len = r12;
 6946 
 6947     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6948     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6949     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6950     VSeq<8> vconst(29, 0);             // for montmul by constant
 6951 
 6952     // results track inputs
 6953     __ add(result, coeffs, 0);
 6954     __ lea(dilithiumConsts,
 6955              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6956 
 6957     // load constants q, qinv -- they do not get clobbered by first two loops
 6958     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6959     // copy caller supplied constant across vconst
 6960     __ dup(vconst[0], __ T4S, constant);
 6961     __ mov(len, zr);
 6962     __ add(len, len, 1024);
 6963 
 6964     __ BIND(L_loop);
 6965 
 6966     // load next 32 inputs
 6967     vs_ldpq_post(vs2, coeffs);
 6968     // mont mul by constant
 6969     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6970     // write next 32 results
 6971     vs_stpq_post(vs2, result);
 6972 
 6973     __ sub(len, len, 128);
 6974     __ cmp(len, (u1)128);
 6975     __ br(Assembler::GE, L_loop);
 6976 
 6977     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6978     __ mov(r0, zr); // return 0
 6979     __ ret(lr);
 6980 
 6981     return start;
 6982   }
 6983 
 6984   // Dilithium decompose poly.
 6985   // Implements the method
 6986   // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {}
 6987   // of the sun.security.provider.ML_DSA class
 6988   //
 6989   // input (int[256]) = c_rarg0
 6990   // lowPart (int[256]) = c_rarg1
 6991   // highPart (int[256]) = c_rarg2
 6992   // twoGamma2  (int) = c_rarg3
 6993   // multiplier (int) = c_rarg4
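        //
        // The per-element computation follows the scalar sequence below,
        // gathered here from the comments in the loop body as a reading
        // aid (q denotes the ML-DSA modulus, qadd the rounding addend):
        //
        //   int rplus = input[m];
        //   rplus = rplus - ((rplus + qadd) >> 23) * q;
        //   rplus = rplus + ((rplus >> 31) & q);
        //   int quotient = (rplus * multiplier) >> 22;
        //   int r0 = rplus - quotient * twoGamma2;
        //   int mask = (twoGamma2 - r0) >> 22;
        //   r0 -= (mask & twoGamma2);
        //   quotient += (mask & 1);
        //   mask = (twoGamma2 / 2 - r0) >> 31;
        //   r0 -= (mask & twoGamma2);
        //   quotient += (mask & 1);
        //   int r1 = rplus - r0 - (q - 1);
        //   r1 = (r1 | (-r1)) >> 31;
        //   r0 += ~r1;
        //   r1 = r1 & quotient;
        //   lowPart[m] = r0;
        //   highPart[m] = r1;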
 6994   address generate_dilithiumDecomposePoly() {
 6995 
 6996     __ align(CodeEntryAlignment);
 6997     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 6998     StubCodeMark mark(this, stub_id);
 6999     address start = __ pc();
 7000     Label L_loop;
 7001 
 7002     const Register input = c_rarg0;
 7003     const Register lowPart = c_rarg1;
 7004     const Register highPart = c_rarg2;
 7005     const Register twoGamma2 = c_rarg3;
 7006     const Register multiplier = c_rarg4;
 7007 
 7008     const Register len = r9;
 7009     const Register dilithiumConsts = r10;
 7010     const Register tmp = r11;
 7011 
 7012     // 6 independent sets of 4x4s values
 7013     VSeq<4> vs1(0), vs2(4), vs3(8);
 7014     VSeq<4> vs4(12), vs5(16), vtmp(20);
 7015 
 7016     // 7 constants for cross-multiplying
 7017     VSeq<4> one(25, 0);
 7018     VSeq<4> qminus1(26, 0);
 7019     VSeq<4> g2(27, 0);
 7020     VSeq<4> twog2(28, 0);
 7021     VSeq<4> mult(29, 0);
 7022     VSeq<4> q(30, 0);
 7023     VSeq<4> qadd(31, 0);
 7024 
 7025     __ enter();
 7026 
 7027     __ lea(dilithiumConsts,
 7028              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7029 
 7030     // save callee-saved registers
 7031     __ stpd(v8, v9, __ pre(sp, -64));
 7032     __ stpd(v10, v11, Address(sp, 16));
 7033     __ stpd(v12, v13, Address(sp, 32));
 7034     __ stpd(v14, v15, Address(sp, 48));
 7035 
 7036     // populate constant registers
 7037     __ mov(tmp, zr);
 7038     __ add(tmp, tmp, 1);
 7039     __ dup(one[0], __ T4S, tmp); // 1
 7040     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 7041     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 7042     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 7043     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 7044     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 7045     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 7046 
 7047     __ mov(len, zr);
 7048     __ add(len, len, 1024);
 7049 
 7050     __ BIND(L_loop);
 7051 
 7052     // load next 4x4S inputs interleaved: rplus --> vs1
 7053     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7054 
 7055     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7056     vs_addv(vtmp, __ T4S, vs1, qadd);
 7057     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7058     vs_mulv(vtmp, __ T4S, vtmp, q);
 7059     vs_subv(vs1, __ T4S, vs1, vtmp);
 7060 
 7061     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7062     vs_sshr(vtmp, __ T4S, vs1, 31);
 7063     vs_andr(vtmp, vtmp, q);
 7064     vs_addv(vs1, __ T4S, vs1, vtmp);
 7065 
 7066     // quotient --> vs2
 7067     // int quotient = (rplus * multiplier) >> 22;
 7068     vs_mulv(vtmp, __ T4S, vs1, mult);
 7069     vs_sshr(vs2, __ T4S, vtmp, 22);
 7070 
 7071     // r0 --> vs3
 7072     // int r0 = rplus - quotient * twoGamma2;
 7073     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7074     vs_subv(vs3, __ T4S, vs1, vtmp);
 7075 
 7076     // mask --> vs4
 7077     // int mask = (twoGamma2 - r0) >> 22;
 7078     vs_subv(vtmp, __ T4S, twog2, vs3);
 7079     vs_sshr(vs4, __ T4S, vtmp, 22);
 7080 
 7081     // r0 -= (mask & twoGamma2);
 7082     vs_andr(vtmp, vs4, twog2);
 7083     vs_subv(vs3, __ T4S, vs3, vtmp);
 7084 
 7085     //  quotient += (mask & 1);
 7086     vs_andr(vtmp, vs4, one);
 7087     vs_addv(vs2, __ T4S, vs2, vtmp);
 7088 
 7089     // mask = (twoGamma2 / 2 - r0) >> 31;
 7090     vs_subv(vtmp, __ T4S, g2, vs3);
 7091     vs_sshr(vs4, __ T4S, vtmp, 31);
 7092 
 7093     // r0 -= (mask & twoGamma2);
 7094     vs_andr(vtmp, vs4, twog2);
 7095     vs_subv(vs3, __ T4S, vs3, vtmp);
 7096 
 7097     // quotient += (mask & 1);
 7098     vs_andr(vtmp, vs4, one);
 7099     vs_addv(vs2, __ T4S, vs2, vtmp);
 7100 
 7101     // r1 --> vs5
 7102     // int r1 = rplus - r0 - (dilithium_q - 1);
 7103     vs_subv(vtmp, __ T4S, vs1, vs3);
 7104     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7105 
 7106     // r1 --> vs1 (overwriting rplus)
 7107     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7108     vs_negr(vtmp, __ T4S, vs5);
 7109     vs_orr(vtmp, vs5, vtmp);
 7110     vs_sshr(vs1, __ T4S, vtmp, 31);
 7111 
 7112     // r0 += ~r1;
 7113     vs_notr(vtmp, vs1);
 7114     vs_addv(vs3, __ T4S, vs3, vtmp);
 7115 
 7116     // r1 = r1 & quotient;
 7117     vs_andr(vs1, vs2, vs1);
 7118 
 7119     // store results interleaved
 7120     // lowPart[m] = r0;
 7121     // highPart[m] = r1;
 7122     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7123     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7124 
 7125     __ sub(len, len, 64);
 7126     __ cmp(len, (u1)64);
 7127     __ br(Assembler::GE, L_loop);
 7128 
 7129     // restore callee-saved vector registers
 7130     __ ldpd(v14, v15, Address(sp, 48));
 7131     __ ldpd(v12, v13, Address(sp, 32));
 7132     __ ldpd(v10, v11, Address(sp, 16));
 7133     __ ldpd(v8, v9, __ post(sp, 64));
 7134 
 7135     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7136     __ mov(r0, zr); // return 0
 7137     __ ret(lr);
 7138 
 7139     return start;
 7140   }
 7141 
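        // Keccak chi step on one 5-lane row held in general purpose
        // registers. A rough scalar model, where every right-hand side
        // uses the original (pre-update) lane values -- which is why the
        // updates of a0, a1 and a4 are deferred until last:
        //
        //   a0 ^= ~a1 & a2;   a1 ^= ~a2 & a3;   a2 ^= ~a3 & a4;
        //   a3 ^= ~a4 & a0;   a4 ^= ~a0 & a1;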
 7142   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7143              Register tmp0, Register tmp1, Register tmp2) {
 7144     __ bic(tmp0, a2, a1); // for a0
 7145     __ bic(tmp1, a3, a2); // for a1
 7146     __ bic(tmp2, a4, a3); // for a2
 7147     __ eor(a2, a2, tmp2);
 7148     __ bic(tmp2, a0, a4); // for a3
 7149     __ eor(a3, a3, tmp2);
 7150     __ bic(tmp2, a1, a0); // for a4
 7151     __ eor(a0, a0, tmp0);
 7152     __ eor(a1, a1, tmp1);
 7153     __ eor(a4, a4, tmp2);
 7154   }
 7155 
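        // One round of Keccak-f[1600] on a state kept entirely in general
        // purpose registers a0..a24: theta (the eor3/rax1 column mixing),
        // rho and pi (the rol sequence), chi (the bcax5 calls) and iota
        // (xor of the next round constant, loaded post-indexed from rc).
        // As a rough sketch of theta with a[] indexed as a[x + 5*y]:
        //
        //   c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
        //   d[x] = c[(x + 4) % 5] ^ rotl64(c[(x + 1) % 5], 1);
        //   a[x + 5 * y] ^= d[x];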
 7156   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7157                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7158                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7159                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7160                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7161                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7162                         Register tmp0, Register tmp1, Register tmp2) {
 7163     __ eor3(tmp1, a4, a9, a14);
 7164     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7165     __ eor3(tmp2, a1, a6, a11);
 7166     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7167     __ rax1(tmp2, tmp0, tmp1); // d0
 7168     {
 7169 
 7170       Register tmp3, tmp4;
 7171       if (can_use_fp && can_use_r18) {
 7172         tmp3 = rfp;
 7173         tmp4 = r18_tls;
 7174       } else {
 7175         tmp3 = a4;
 7176         tmp4 = a9;
 7177         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7178       }
 7179 
 7180       __ eor3(tmp3, a0, a5, a10);
 7181       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7182       __ eor(a0, a0, tmp2);
 7183       __ eor(a5, a5, tmp2);
 7184       __ eor(a10, a10, tmp2);
 7185       __ eor(a15, a15, tmp2);
 7186       __ eor(a20, a20, tmp2); // d0(tmp2)
 7187       __ eor3(tmp3, a2, a7, a12);
 7188       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7189       __ rax1(tmp3, tmp4, tmp2); // d1
 7190       __ eor(a1, a1, tmp3);
 7191       __ eor(a6, a6, tmp3);
 7192       __ eor(a11, a11, tmp3);
 7193       __ eor(a16, a16, tmp3);
 7194       __ eor(a21, a21, tmp3); // d1(tmp3)
 7195       __ rax1(tmp3, tmp2, tmp0); // d3
 7196       __ eor3(tmp2, a3, a8, a13);
 7197       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7198       __ eor(a3, a3, tmp3);
 7199       __ eor(a8, a8, tmp3);
 7200       __ eor(a13, a13, tmp3);
 7201       __ eor(a18, a18, tmp3);
 7202       __ eor(a23, a23, tmp3);
 7203       __ rax1(tmp2, tmp1, tmp0); // d2
 7204       __ eor(a2, a2, tmp2);
 7205       __ eor(a7, a7, tmp2);
 7206       __ eor(a12, a12, tmp2);
 7207       __ rax1(tmp0, tmp0, tmp4); // d4
 7208       if (!can_use_fp || !can_use_r18) {
 7209         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7210       }
 7211       __ eor(a17, a17, tmp2);
 7212       __ eor(a22, a22, tmp2);
 7213       __ eor(a4, a4, tmp0);
 7214       __ eor(a9, a9, tmp0);
 7215       __ eor(a14, a14, tmp0);
 7216       __ eor(a19, a19, tmp0);
 7217       __ eor(a24, a24, tmp0);
 7218     }
 7219 
 7220     __ rol(tmp0, a10, 3);
 7221     __ rol(a10, a1, 1);
 7222     __ rol(a1, a6, 44);
 7223     __ rol(a6, a9, 20);
 7224     __ rol(a9, a22, 61);
 7225     __ rol(a22, a14, 39);
 7226     __ rol(a14, a20, 18);
 7227     __ rol(a20, a2, 62);
 7228     __ rol(a2, a12, 43);
 7229     __ rol(a12, a13, 25);
 7230     __ rol(a13, a19, 8);
 7231     __ rol(a19, a23, 56);
 7232     __ rol(a23, a15, 41);
 7233     __ rol(a15, a4, 27);
 7234     __ rol(a4, a24, 14);
 7235     __ rol(a24, a21, 2);
 7236     __ rol(a21, a8, 55);
 7237     __ rol(a8, a16, 45);
 7238     __ rol(a16, a5, 36);
 7239     __ rol(a5, a3, 28);
 7240     __ rol(a3, a18, 21);
 7241     __ rol(a18, a17, 15);
 7242     __ rol(a17, a11, 10);
 7243     __ rol(a11, a7, 6);
 7244     __ mov(a7, tmp0);
 7245 
 7246     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7247     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7248     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7249     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7250     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7251 
 7252     __ ldr(tmp1, __ post(rc, 8));
 7253     __ eor(a0, a0, tmp1);
 7254 
 7255   }
 7256 
 7257   // Arguments:
 7258   //
 7259   // Inputs:
 7260   //   c_rarg0   - byte[]  source+offset
 7261   //   c_rarg1   - byte[]  SHA.state
 7262   //   c_rarg2   - int     block_size
 7263   //   c_rarg3   - int     offset
 7264   //   c_rarg4   - int     limit
 7265   //
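        // block_size is the sponge rate in bytes and selects the variant
        // absorbed below: 72 (SHA3-512), 104 (SHA3-384), 136 (SHA3-256 or
        // SHAKE256), 144 (SHA3-224) or 168 (SHAKE128).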
 7266   address generate_sha3_implCompress_gpr(StubId stub_id) {
 7267     bool multi_block;
 7268     switch (stub_id) {
 7269     case StubId::stubgen_sha3_implCompress_id:
 7270       multi_block = false;
 7271       break;
 7272     case StubId::stubgen_sha3_implCompressMB_id:
 7273       multi_block = true;
 7274       break;
 7275     default:
 7276       ShouldNotReachHere();
 7277     }
 7278 
 7279     static const uint64_t round_consts[24] = {
 7280       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 7281       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 7282       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 7283       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 7284       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 7285       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 7286       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 7287       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 7288     };
 7289 
 7290     __ align(CodeEntryAlignment);
 7291     StubCodeMark mark(this, stub_id);
 7292     address start = __ pc();
 7293 
 7294     Register buf           = c_rarg0;
 7295     Register state         = c_rarg1;
 7296     Register block_size    = c_rarg2;
 7297     Register ofs           = c_rarg3;
 7298     Register limit         = c_rarg4;
 7299 
 7300     // use r3..r17, r19..r28 to keep a0..a24.
 7301     // a0..a24 are respective locals from SHA3.java
 7302     Register a0 = r25,
 7303              a1 = r26,
 7304              a2 = r27,
 7305              a3 = r3,
 7306              a4 = r4,
 7307              a5 = r5,
 7308              a6 = r6,
 7309              a7 = r7,
 7310              a8 = rscratch1, // r8
 7311              a9 = rscratch2, // r9
 7312              a10 = r10,
 7313              a11 = r11,
 7314              a12 = r12,
 7315              a13 = r13,
 7316              a14 = r14,
 7317              a15 = r15,
 7318              a16 = r16,
 7319              a17 = r17,
 7320              a18 = r28,
 7321              a19 = r19,
 7322              a20 = r20,
 7323              a21 = r21,
 7324              a22 = r22,
 7325              a23 = r23,
 7326              a24 = r24;
 7327 
 7328     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7329 
 7330     Label sha3_loop, rounds24_preloop, loop_body;
 7331     Label sha3_512_or_sha3_384, shake128;
 7332 
 7333     bool can_use_r18 = false;
 7334 #ifndef R18_RESERVED
 7335     can_use_r18 = true;
 7336 #endif
 7337     bool can_use_fp = !PreserveFramePointer;
 7338 
 7339     __ enter();
 7340 
 7341     // save almost all of the as-yet unsaved gpr registers on the stack
 7342     __ str(block_size, __ pre(sp, -128));
 7343     if (multi_block) {
 7344       __ stpw(ofs, limit, Address(sp, 8));
 7345     }
 7346     // 8 bytes at sp+16 will be used to keep buf
 7347     __ stp(r19, r20, Address(sp, 32));
 7348     __ stp(r21, r22, Address(sp, 48));
 7349     __ stp(r23, r24, Address(sp, 64));
 7350     __ stp(r25, r26, Address(sp, 80));
 7351     __ stp(r27, r28, Address(sp, 96));
 7352     if (can_use_r18 && can_use_fp) {
 7353       __ stp(r18_tls, state, Address(sp, 112));
 7354     } else {
 7355       __ str(state, Address(sp, 112));
 7356     }
 7357 
 7358     // begin sha3 calculations: loading a0..a24 from the state array
 7359     __ ldp(a0, a1, state);
 7360     __ ldp(a2, a3, Address(state, 16));
 7361     __ ldp(a4, a5, Address(state, 32));
 7362     __ ldp(a6, a7, Address(state, 48));
 7363     __ ldp(a8, a9, Address(state, 64));
 7364     __ ldp(a10, a11, Address(state, 80));
 7365     __ ldp(a12, a13, Address(state, 96));
 7366     __ ldp(a14, a15, Address(state, 112));
 7367     __ ldp(a16, a17, Address(state, 128));
 7368     __ ldp(a18, a19, Address(state, 144));
 7369     __ ldp(a20, a21, Address(state, 160));
 7370     __ ldp(a22, a23, Address(state, 176));
 7371     __ ldr(a24, Address(state, 192));
 7372 
 7373     __ BIND(sha3_loop);
 7374 
 7375     // load input
 7376     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7377     __ eor(a0, a0, tmp3);
 7378     __ eor(a1, a1, tmp2);
 7379     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7380     __ eor(a2, a2, tmp3);
 7381     __ eor(a3, a3, tmp2);
 7382     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7383     __ eor(a4, a4, tmp3);
 7384     __ eor(a5, a5, tmp2);
 7385     __ ldr(tmp3, __ post(buf, 8));
 7386     __ eor(a6, a6, tmp3);
 7387 
 7388     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7389     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7390 
 7391     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7392     __ eor(a7, a7, tmp3);
 7393     __ eor(a8, a8, tmp2);
 7394     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7395     __ eor(a9, a9, tmp3);
 7396     __ eor(a10, a10, tmp2);
 7397     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7398     __ eor(a11, a11, tmp3);
 7399     __ eor(a12, a12, tmp2);
 7400     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7401     __ eor(a13, a13, tmp3);
 7402     __ eor(a14, a14, tmp2);
 7403     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7404     __ eor(a15, a15, tmp3);
 7405     __ eor(a16, a16, tmp2);
 7406 
 7407     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7408     __ andw(tmp2, block_size, 48);
 7409     __ cbzw(tmp2, rounds24_preloop);
 7410     __ tbnz(block_size, 5, shake128);
 7411     // block_size == 144, bit5 == 0, SHA3-224
 7412     __ ldr(tmp3, __ post(buf, 8));
 7413     __ eor(a17, a17, tmp3);
 7414     __ b(rounds24_preloop);
 7415 
 7416     __ BIND(shake128);
 7417     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7418     __ eor(a17, a17, tmp3);
 7419     __ eor(a18, a18, tmp2);
 7420     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7421     __ eor(a19, a19, tmp3);
 7422     __ eor(a20, a20, tmp2);
 7423     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7424 
 7425     __ BIND(sha3_512_or_sha3_384);
 7426     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7427     __ eor(a7, a7, tmp3);
 7428     __ eor(a8, a8, tmp2);
 7429     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7430 
 7431     // SHA3-384
 7432     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7433     __ eor(a9, a9, tmp3);
 7434     __ eor(a10, a10, tmp2);
 7435     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7436     __ eor(a11, a11, tmp3);
 7437     __ eor(a12, a12, tmp2);
 7438 
 7439     __ BIND(rounds24_preloop);
 7440     __ fmovs(v0, 24.0); // float loop counter,
 7441     __ fmovs(v1, 1.0);  // exact representation
 7442 
 7443     __ str(buf, Address(sp, 16));
 7444     __ lea(tmp3, ExternalAddress((address) round_consts));
 7445 
 7446     __ BIND(loop_body);
 7447     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7448                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7449                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7450                      tmp0, tmp1, tmp2);
 7451     __ fsubs(v0, v0, v1);
 7452     __ fcmps(v0, 0.0);
 7453     __ br(__ NE, loop_body);
 7454 
 7455     if (multi_block) {
 7456       __ ldrw(block_size, sp); // block_size
 7457       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7458       __ addw(tmp2, tmp2, block_size);
 7459       __ cmpw(tmp2, tmp1);
 7460       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7461       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7462       __ br(Assembler::LE, sha3_loop);
 7463       __ movw(c_rarg0, tmp2); // return offset
 7464     }
 7465     if (can_use_fp && can_use_r18) {
 7466       __ ldp(r18_tls, state, Address(sp, 112));
 7467     } else {
 7468       __ ldr(state, Address(sp, 112));
 7469     }
 7470     // save calculated sha3 state
 7471     __ stp(a0, a1, Address(state));
 7472     __ stp(a2, a3, Address(state, 16));
 7473     __ stp(a4, a5, Address(state, 32));
 7474     __ stp(a6, a7, Address(state, 48));
 7475     __ stp(a8, a9, Address(state, 64));
 7476     __ stp(a10, a11, Address(state, 80));
 7477     __ stp(a12, a13, Address(state, 96));
 7478     __ stp(a14, a15, Address(state, 112));
 7479     __ stp(a16, a17, Address(state, 128));
 7480     __ stp(a18, a19, Address(state, 144));
 7481     __ stp(a20, a21, Address(state, 160));
 7482     __ stp(a22, a23, Address(state, 176));
 7483     __ str(a24, Address(state, 192));
 7484 
 7485     // restore required registers from stack
 7486     __ ldp(r19, r20, Address(sp, 32));
 7487     __ ldp(r21, r22, Address(sp, 48));
 7488     __ ldp(r23, r24, Address(sp, 64));
 7489     __ ldp(r25, r26, Address(sp, 80));
 7490     __ ldp(r27, r28, Address(sp, 96));
 7491     if (can_use_fp && can_use_r18) {
 7492       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 7493     } // else no need to recalculate rfp, since it wasn't changed
 7494 
 7495     __ leave();
 7496 
 7497     __ ret(lr);
 7498 
 7499     return start;
 7500   }
 7501 
 7502   /**
 7503    *  Arguments:
 7504    *
 7505    * Inputs:
 7506    *   c_rarg0   - int crc
 7507    *   c_rarg1   - byte* buf
 7508    *   c_rarg2   - int length
 7509    *
 7510    * Output:
 7511    *       r0   - int crc result
 7512    */
 7513   address generate_updateBytesCRC32() {
 7514     assert(UseCRC32Intrinsics, "what are we doing here?");
 7515 
 7516     __ align(CodeEntryAlignment);
 7517     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 7518     StubCodeMark mark(this, stub_id);
 7519 
 7520     address start = __ pc();
 7521 
 7522     const Register crc   = c_rarg0;  // crc
 7523     const Register buf   = c_rarg1;  // source java byte array address
 7524     const Register len   = c_rarg2;  // length
 7525     const Register table0 = c_rarg3; // crc_table address
 7526     const Register table1 = c_rarg4;
 7527     const Register table2 = c_rarg5;
 7528     const Register table3 = c_rarg6;
 7529     const Register tmp3 = c_rarg7;
 7530 
 7531     BLOCK_COMMENT("Entry:");
 7532     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7533 
 7534     __ kernel_crc32(crc, buf, len,
 7535               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7536 
 7537     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7538     __ ret(lr);
 7539 
 7540     return start;
 7541   }
 7542 
 7543   /**
 7544    *  Arguments:
 7545    *
 7546    * Inputs:
 7547    *   c_rarg0   - int crc
 7548    *   c_rarg1   - byte* buf
 7549    *   c_rarg2   - int length
 7550    *   c_rarg3   - int* table
 7551    *
 7552    * Output:
 7553    *       r0   - int crc result
 7554    */
 7555   address generate_updateBytesCRC32C() {
 7556     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7557 
 7558     __ align(CodeEntryAlignment);
 7559     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 7560     StubCodeMark mark(this, stub_id);
 7561 
 7562     address start = __ pc();
 7563 
 7564     const Register crc   = c_rarg0;  // crc
 7565     const Register buf   = c_rarg1;  // source java byte array address
 7566     const Register len   = c_rarg2;  // length
 7567     const Register table0 = c_rarg3; // crc_table address
 7568     const Register table1 = c_rarg4;
 7569     const Register table2 = c_rarg5;
 7570     const Register table3 = c_rarg6;
 7571     const Register tmp3 = c_rarg7;
 7572 
 7573     BLOCK_COMMENT("Entry:");
 7574     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7575 
 7576     __ kernel_crc32c(crc, buf, len,
 7577               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7578 
 7579     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7580     __ ret(lr);
 7581 
 7582     return start;
 7583   }
 7584 
 7585   /***
 7586    *  Arguments:
 7587    *
 7588    *  Inputs:
 7589    *   c_rarg0   - int   adler
 7590    *   c_rarg1   - byte* buff
 7591    *   c_rarg2   - int   len
 7592    *
 7593    * Output:
 7594    *   c_rarg0   - int adler result
 7595    */
 7596   address generate_updateBytesAdler32() {
 7597     __ align(CodeEntryAlignment);
 7598     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 7599     StubCodeMark mark(this, stub_id);
 7600     address start = __ pc();
 7601 
 7602     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7603 
 7604     // Aliases
 7605     Register adler  = c_rarg0;
 7606     Register s1     = c_rarg0;
 7607     Register s2     = c_rarg3;
 7608     Register buff   = c_rarg1;
 7609     Register len    = c_rarg2;
 7610     Register nmax  = r4;
 7611     Register base  = r5;
 7612     Register count = r6;
 7613     Register temp0 = rscratch1;
 7614     Register temp1 = rscratch2;
 7615     FloatRegister vbytes = v0;
 7616     FloatRegister vs1acc = v1;
 7617     FloatRegister vs2acc = v2;
 7618     FloatRegister vtable = v3;
 7619 
 7620     // Max number of bytes we can process before having to take the mod
 7621     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7622     uint64_t BASE = 0xfff1;
 7623     uint64_t NMAX = 0x15B0;
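
          // The reductions below avoid a division: since 2^16 mod BASE == 15,
          // a sum s can be folded as
          //
          //   s = (s >> 16) * 15 + (s & 0xffff);   // the lsr/lsl/sub/add runs
          //
          // once or twice depending on its magnitude, leaving a value small
          // enough that a single conditional subtract of BASE (subs/csel)
          // completes the modular reduction.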
 7624 
 7625     __ mov(base, BASE);
 7626     __ mov(nmax, NMAX);
 7627 
 7628     // Load accumulation coefficients for the upper 16 bits
 7629     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7630     __ ld1(vtable, __ T16B, Address(temp0));
 7631 
 7632     // s1 is initialized to the lower 16 bits of adler
 7633     // s2 is initialized to the upper 16 bits of adler
 7634     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7635     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7636 
 7637     // The pipelined loop needs at least 16 elements for 1 iteration
 7638     // It does check this, but it is more effective to skip to the cleanup loop
 7639     __ cmp(len, (u1)16);
 7640     __ br(Assembler::HS, L_nmax);
 7641     __ cbz(len, L_combine);
 7642 
 7643     __ bind(L_simple_by1_loop);
 7644     __ ldrb(temp0, Address(__ post(buff, 1)));
 7645     __ add(s1, s1, temp0);
 7646     __ add(s2, s2, s1);
 7647     __ subs(len, len, 1);
 7648     __ br(Assembler::HI, L_simple_by1_loop);
 7649 
 7650     // s1 = s1 % BASE
 7651     __ subs(temp0, s1, base);
 7652     __ csel(s1, temp0, s1, Assembler::HS);
 7653 
 7654     // s2 = s2 % BASE
 7655     __ lsr(temp0, s2, 16);
 7656     __ lsl(temp1, temp0, 4);
 7657     __ sub(temp1, temp1, temp0);
 7658     __ add(s2, temp1, s2, ext::uxth);
 7659 
 7660     __ subs(temp0, s2, base);
 7661     __ csel(s2, temp0, s2, Assembler::HS);
 7662 
 7663     __ b(L_combine);
 7664 
 7665     __ bind(L_nmax);
 7666     __ subs(len, len, nmax);
 7667     __ sub(count, nmax, 16);
 7668     __ br(Assembler::LO, L_by16);
 7669 
 7670     __ bind(L_nmax_loop);
 7671 
 7672     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7673                                       vbytes, vs1acc, vs2acc, vtable);
 7674 
 7675     __ subs(count, count, 16);
 7676     __ br(Assembler::HS, L_nmax_loop);
 7677 
 7678     // s1 = s1 % BASE
 7679     __ lsr(temp0, s1, 16);
 7680     __ lsl(temp1, temp0, 4);
 7681     __ sub(temp1, temp1, temp0);
 7682     __ add(temp1, temp1, s1, ext::uxth);
 7683 
 7684     __ lsr(temp0, temp1, 16);
 7685     __ lsl(s1, temp0, 4);
 7686     __ sub(s1, s1, temp0);
 7687     __ add(s1, s1, temp1, ext::uxth);
 7688 
 7689     __ subs(temp0, s1, base);
 7690     __ csel(s1, temp0, s1, Assembler::HS);
 7691 
 7692     // s2 = s2 % BASE
 7693     __ lsr(temp0, s2, 16);
 7694     __ lsl(temp1, temp0, 4);
 7695     __ sub(temp1, temp1, temp0);
 7696     __ add(temp1, temp1, s2, ext::uxth);
 7697 
 7698     __ lsr(temp0, temp1, 16);
 7699     __ lsl(s2, temp0, 4);
 7700     __ sub(s2, s2, temp0);
 7701     __ add(s2, s2, temp1, ext::uxth);
 7702 
 7703     __ subs(temp0, s2, base);
 7704     __ csel(s2, temp0, s2, Assembler::HS);
 7705 
 7706     __ subs(len, len, nmax);
 7707     __ sub(count, nmax, 16);
 7708     __ br(Assembler::HS, L_nmax_loop);
 7709 
 7710     __ bind(L_by16);
 7711     __ adds(len, len, count);
 7712     __ br(Assembler::LO, L_by1);
 7713 
 7714     __ bind(L_by16_loop);
 7715 
 7716     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7717                                       vbytes, vs1acc, vs2acc, vtable);
 7718 
 7719     __ subs(len, len, 16);
 7720     __ br(Assembler::HS, L_by16_loop);
 7721 
 7722     __ bind(L_by1);
 7723     __ adds(len, len, 15);
 7724     __ br(Assembler::LO, L_do_mod);
 7725 
 7726     __ bind(L_by1_loop);
 7727     __ ldrb(temp0, Address(__ post(buff, 1)));
 7728     __ add(s1, temp0, s1);
 7729     __ add(s2, s2, s1);
 7730     __ subs(len, len, 1);
 7731     __ br(Assembler::HS, L_by1_loop);
 7732 
 7733     __ bind(L_do_mod);
 7734     // s1 = s1 % BASE
 7735     __ lsr(temp0, s1, 16);
 7736     __ lsl(temp1, temp0, 4);
 7737     __ sub(temp1, temp1, temp0);
 7738     __ add(temp1, temp1, s1, ext::uxth);
 7739 
 7740     __ lsr(temp0, temp1, 16);
 7741     __ lsl(s1, temp0, 4);
 7742     __ sub(s1, s1, temp0);
 7743     __ add(s1, s1, temp1, ext::uxth);
 7744 
 7745     __ subs(temp0, s1, base);
 7746     __ csel(s1, temp0, s1, Assembler::HS);
 7747 
 7748     // s2 = s2 % BASE
 7749     __ lsr(temp0, s2, 16);
 7750     __ lsl(temp1, temp0, 4);
 7751     __ sub(temp1, temp1, temp0);
 7752     __ add(temp1, temp1, s2, ext::uxth);
 7753 
 7754     __ lsr(temp0, temp1, 16);
 7755     __ lsl(s2, temp0, 4);
 7756     __ sub(s2, s2, temp0);
 7757     __ add(s2, s2, temp1, ext::uxth);
 7758 
 7759     __ subs(temp0, s2, base);
 7760     __ csel(s2, temp0, s2, Assembler::HS);
 7761 
 7762     // Combine lower bits and higher bits
 7763     __ bind(L_combine);
 7764     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7765 
 7766     __ ret(lr);
 7767 
 7768     return start;
 7769   }
 7770 
 7771   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7772           Register temp0, Register temp1, FloatRegister vbytes,
 7773           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7774     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7775     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7776     // In non-vectorized code, we update s1 and s2 as:
 7777     //   s1 <- s1 + b1
 7778     //   s2 <- s2 + s1
 7779     //   s1 <- s1 + b2
 7780     //   s2 <- s2 + b1
 7781     //   ...
 7782     //   s1 <- s1 + b16
 7783     //   s2 <- s2 + s1
 7784     // Putting above assignments together, we have:
 7785     //   s1_new = s1 + b1 + b2 + ... + b16
 7786     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7787     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7788     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 7789     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7790 
 7791     // s2 = s2 + s1 * 16
 7792     __ add(s2, s2, s1, Assembler::LSL, 4);
 7793 
 7794     // vs1acc = b1 + b2 + b3 + ... + b16
 7795     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7796     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7797     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7798     __ uaddlv(vs1acc, __ T16B, vbytes);
 7799     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7800 
 7801     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7802     __ fmovd(temp0, vs1acc);
 7803     __ fmovd(temp1, vs2acc);
 7804     __ add(s1, s1, temp0);
 7805     __ add(s2, s2, temp1);
 7806   }
 7807 
 7808   /**
 7809    *  Arguments:
 7810    *
 7811    *  Input:
 7812    *    c_rarg0   - x address
 7813    *    c_rarg1   - x length
 7814    *    c_rarg2   - y address
 7815    *    c_rarg3   - y length
 7816    *    c_rarg4   - z address
 7817    */
 7818   address generate_multiplyToLen() {
 7819     __ align(CodeEntryAlignment);
 7820     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 7821     StubCodeMark mark(this, stub_id);
 7822 
 7823     address start = __ pc();
 7824     const Register x     = r0;
 7825     const Register xlen  = r1;
 7826     const Register y     = r2;
 7827     const Register ylen  = r3;
 7828     const Register z     = r4;
 7829 
 7830     const Register tmp0  = r5;
 7831     const Register tmp1  = r10;
 7832     const Register tmp2  = r11;
 7833     const Register tmp3  = r12;
 7834     const Register tmp4  = r13;
 7835     const Register tmp5  = r14;
 7836     const Register tmp6  = r15;
 7837     const Register tmp7  = r16;
 7838 
 7839     BLOCK_COMMENT("Entry:");
 7840     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7841     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7842     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7843     __ ret(lr);
 7844 
 7845     return start;
 7846   }
 7847 
 7848   address generate_squareToLen() {
 7849     // The squareToLen algorithm for sizes 1..127 described in the Java code
 7850     // works faster than multiply_to_len on some CPUs and slower on others,
 7851     // but multiply_to_len shows slightly better overall results.
 7852     __ align(CodeEntryAlignment);
 7853     StubId stub_id = StubId::stubgen_squareToLen_id;
 7854     StubCodeMark mark(this, stub_id);
 7855     address start = __ pc();
 7856 
 7857     const Register x     = r0;
 7858     const Register xlen  = r1;
 7859     const Register z     = r2;
 7860     const Register y     = r4; // == x
 7861     const Register ylen  = r5; // == xlen
 7862 
 7863     const Register tmp0  = r3;
 7864     const Register tmp1  = r10;
 7865     const Register tmp2  = r11;
 7866     const Register tmp3  = r12;
 7867     const Register tmp4  = r13;
 7868     const Register tmp5  = r14;
 7869     const Register tmp6  = r15;
 7870     const Register tmp7  = r16;
 7871 
 7872     RegSet spilled_regs = RegSet::of(y, ylen);
 7873     BLOCK_COMMENT("Entry:");
 7874     __ enter();
 7875     __ push(spilled_regs, sp);
 7876     __ mov(y, x);
 7877     __ mov(ylen, xlen);
 7878     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7879     __ pop(spilled_regs, sp);
 7880     __ leave();
 7881     __ ret(lr);
 7882     return start;
 7883   }
 7884 
 7885   address generate_mulAdd() {
 7886     __ align(CodeEntryAlignment);
 7887     StubId stub_id = StubId::stubgen_mulAdd_id;
 7888     StubCodeMark mark(this, stub_id);
 7889 
 7890     address start = __ pc();
 7891 
 7892     const Register out     = r0;
 7893     const Register in      = r1;
 7894     const Register offset  = r2;
 7895     const Register len     = r3;
 7896     const Register k       = r4;
 7897 
 7898     BLOCK_COMMENT("Entry:");
 7899     __ enter();
 7900     __ mul_add(out, in, offset, len, k);
 7901     __ leave();
 7902     __ ret(lr);
 7903 
 7904     return start;
 7905   }
 7906 
 7907   // Arguments:
 7908   //
 7909   // Input:
 7910   //   c_rarg0   - newArr address
 7911   //   c_rarg1   - oldArr address
 7912   //   c_rarg2   - newIdx
 7913   //   c_rarg3   - shiftCount
 7914   //   c_rarg4   - numIter
 7915   //
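        // A rough scalar model of the per-element computation below
        // (illustrative names; '>>>' denotes an unsigned shift):
        //
        //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
        //                      | (oldArr[i] << (32 - shiftCount));
        //
        // for i in [0, numIter), processed from the highest index down.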
 7916   address generate_bigIntegerRightShift() {
 7917     __ align(CodeEntryAlignment);
 7918     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 7919     StubCodeMark mark(this, stub_id);
 7920     address start = __ pc();
 7921 
 7922     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7923 
 7924     Register newArr        = c_rarg0;
 7925     Register oldArr        = c_rarg1;
 7926     Register newIdx        = c_rarg2;
 7927     Register shiftCount    = c_rarg3;
 7928     Register numIter       = c_rarg4;
 7929     Register idx           = numIter;
 7930 
 7931     Register newArrCur     = rscratch1;
 7932     Register shiftRevCount = rscratch2;
 7933     Register oldArrCur     = r13;
 7934     Register oldArrNext    = r14;
 7935 
 7936     FloatRegister oldElem0        = v0;
 7937     FloatRegister oldElem1        = v1;
 7938     FloatRegister newElem         = v2;
 7939     FloatRegister shiftVCount     = v3;
 7940     FloatRegister shiftVRevCount  = v4;
 7941 
 7942     __ cbz(idx, Exit);
 7943 
 7944     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7945 
 7946     // left shift count
 7947     __ movw(shiftRevCount, 32);
 7948     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7949 
 7950     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar paths
 7951     __ cmp(numIter, (u1)4);
 7952     __ br(Assembler::LT, ShiftThree);
 7953 
 7954     __ dup(shiftVCount,    __ T4S, shiftCount);
 7955     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7956     __ negr(shiftVCount,   __ T4S, shiftVCount);
 7957 
 7958     __ BIND(ShiftSIMDLoop);
 7959 
 7960     // Calculate the load addresses
 7961     __ sub(idx, idx, 4);
 7962     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7963     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7964     __ add(oldArrCur,  oldArrNext, 4);
 7965 
 7966     // Load 4 words and process
 7967     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7968     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7969     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7970     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7971     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7972     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7973 
 7974     __ cmp(idx, (u1)4);
 7975     __ br(Assembler::LT, ShiftTwoLoop);
 7976     __ b(ShiftSIMDLoop);
 7977 
 7978     __ BIND(ShiftTwoLoop);
 7979     __ cbz(idx, Exit);
 7980     __ cmp(idx, (u1)1);
 7981     __ br(Assembler::EQ, ShiftOne);
 7982 
 7983     // Calculate the load addresses
 7984     __ sub(idx, idx, 2);
 7985     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7986     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7987     __ add(oldArrCur,  oldArrNext, 4);
 7988 
 7989     // Load 2 words and process
 7990     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 7991     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 7992     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 7993     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 7994     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 7995     __ st1(newElem,   __ T2S, Address(newArrCur));
 7996     __ b(ShiftTwoLoop);
 7997 
 7998     __ BIND(ShiftThree);
 7999     __ tbz(idx, 1, ShiftOne);
 8000     __ tbz(idx, 0, ShiftTwo);
 8001     __ ldrw(r10,  Address(oldArr, 12));
 8002     __ ldrw(r11,  Address(oldArr, 8));
 8003     __ lsrvw(r10, r10, shiftCount);
 8004     __ lslvw(r11, r11, shiftRevCount);
 8005     __ orrw(r12,  r10, r11);
 8006     __ strw(r12,  Address(newArr, 8));
 8007 
 8008     __ BIND(ShiftTwo);
 8009     __ ldrw(r10,  Address(oldArr, 8));
 8010     __ ldrw(r11,  Address(oldArr, 4));
 8011     __ lsrvw(r10, r10, shiftCount);
 8012     __ lslvw(r11, r11, shiftRevCount);
 8013     __ orrw(r12,  r10, r11);
 8014     __ strw(r12,  Address(newArr, 4));
 8015 
 8016     __ BIND(ShiftOne);
 8017     __ ldrw(r10,  Address(oldArr, 4));
 8018     __ ldrw(r11,  Address(oldArr));
 8019     __ lsrvw(r10, r10, shiftCount);
 8020     __ lslvw(r11, r11, shiftRevCount);
 8021     __ orrw(r12,  r10, r11);
 8022     __ strw(r12,  Address(newArr));
 8023 
 8024     __ BIND(Exit);
 8025     __ ret(lr);
 8026 
 8027     return start;
 8028   }
 8029 
 8030   // Arguments:
 8031   //
 8032   // Input:
 8033   //   c_rarg0   - newArr address
 8034   //   c_rarg1   - oldArr address
 8035   //   c_rarg2   - newIdx
 8036   //   c_rarg3   - shiftCount
 8037   //   c_rarg4   - numIter
 8038   //
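        // A rough scalar model of the per-element computation below
        // (illustrative names; '>>>' denotes an unsigned shift):
        //
        //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
        //                      | (oldArr[i + 1] >>> (32 - shiftCount));
        //
        // for i in [0, numIter), processed from the lowest index up.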
 8039   address generate_bigIntegerLeftShift() {
 8040     __ align(CodeEntryAlignment);
 8041     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 8042     StubCodeMark mark(this, stub_id);
 8043     address start = __ pc();
 8044 
 8045     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8046 
 8047     Register newArr        = c_rarg0;
 8048     Register oldArr        = c_rarg1;
 8049     Register newIdx        = c_rarg2;
 8050     Register shiftCount    = c_rarg3;
 8051     Register numIter       = c_rarg4;
 8052 
 8053     Register shiftRevCount = rscratch1;
 8054     Register oldArrNext    = rscratch2;
 8055 
 8056     FloatRegister oldElem0        = v0;
 8057     FloatRegister oldElem1        = v1;
 8058     FloatRegister newElem         = v2;
 8059     FloatRegister shiftVCount     = v3;
 8060     FloatRegister shiftVRevCount  = v4;
 8061 
 8062     __ cbz(numIter, Exit);
 8063 
 8064     __ add(oldArrNext, oldArr, 4);
 8065     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8066 
 8067     // right shift count
 8068     __ movw(shiftRevCount, 32);
 8069     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8070 
 8071     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar paths
 8072     __ cmp(numIter, (u1)4);
 8073     __ br(Assembler::LT, ShiftThree);
 8074 
 8075     __ dup(shiftVCount,     __ T4S, shiftCount);
 8076     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8077     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8078 
 8079     __ BIND(ShiftSIMDLoop);
 8080 
 8081     // load 4 words and process
 8082     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8083     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8084     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8085     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8086     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8087     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8088     __ sub(numIter,   numIter, 4);
 8089 
 8090     __ cmp(numIter, (u1)4);
 8091     __ br(Assembler::LT, ShiftTwoLoop);
 8092     __ b(ShiftSIMDLoop);
 8093 
 8094     __ BIND(ShiftTwoLoop);
 8095     __ cbz(numIter, Exit);
 8096     __ cmp(numIter, (u1)1);
 8097     __ br(Assembler::EQ, ShiftOne);
 8098 
 8099     // load 2 words and process
 8100     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8101     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8102     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8103     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8104     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8105     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8106     __ sub(numIter,   numIter, 2);
 8107     __ b(ShiftTwoLoop);
 8108 
 8109     __ BIND(ShiftThree);
 8110     __ ldrw(r10,  __ post(oldArr, 4));
 8111     __ ldrw(r11,  __ post(oldArrNext, 4));
 8112     __ lslvw(r10, r10, shiftCount);
 8113     __ lsrvw(r11, r11, shiftRevCount);
 8114     __ orrw(r12,  r10, r11);
 8115     __ strw(r12,  __ post(newArr, 4));
 8116     __ tbz(numIter, 1, Exit);
 8117     __ tbz(numIter, 0, ShiftOne);
 8118 
 8119     __ BIND(ShiftTwo);
 8120     __ ldrw(r10,  __ post(oldArr, 4));
 8121     __ ldrw(r11,  __ post(oldArrNext, 4));
 8122     __ lslvw(r10, r10, shiftCount);
 8123     __ lsrvw(r11, r11, shiftRevCount);
 8124     __ orrw(r12,  r10, r11);
 8125     __ strw(r12,  __ post(newArr, 4));
 8126 
 8127     __ BIND(ShiftOne);
 8128     __ ldrw(r10,  Address(oldArr));
 8129     __ ldrw(r11,  Address(oldArrNext));
 8130     __ lslvw(r10, r10, shiftCount);
 8131     __ lsrvw(r11, r11, shiftRevCount);
 8132     __ orrw(r12,  r10, r11);
 8133     __ strw(r12,  Address(newArr));
 8134 
 8135     __ BIND(Exit);
 8136     __ ret(lr);
 8137 
 8138     return start;
 8139   }
 8140 
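        // Count the number of leading non-negative ("positive") bytes in a byte array.
        //   ary1   = r1 - array address
        //   len    = r2 - length in bytes
        //   result = r0 - a copy of len on entry; the count on exit
        // When a negative byte is found, the stub may under-count down to the start of
        // the chunk in which it was found (see the RET_ADJUST paths below). A scalar
        // sketch of the behaviour it approximates (illustrative only):
        //
        //   int count_positives(const int8_t* ary, int len) {
        //     for (int i = 0; i < len; i++) {
        //       if (ary[i] < 0) return i;   // the stub may return a smaller value here
        //     }
        //     return len;
        //   }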
 8141   address generate_count_positives(address &count_positives_long) {
 8142     const u1 large_loop_size = 64;
 8143     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 8144     int dcache_line = VM_Version::dcache_line_size();
 8145 
 8146     Register ary1 = r1, len = r2, result = r0;
 8147 
 8148     __ align(CodeEntryAlignment);
 8149 
 8150     StubId stub_id = StubId::stubgen_count_positives_id;
 8151     StubCodeMark mark(this, stub_id);
 8152 
 8153     address entry = __ pc();
 8154 
 8155     __ enter();
 8156     // precondition: a copy of len is already in result
 8157     // __ mov(result, len);
 8158 
 8159   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8160         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8161 
 8162   __ cmp(len, (u1)15);
 8163   __ br(Assembler::GT, LEN_OVER_15);
 8164   // The only case in which execution falls into this code is when the pointer is
 8165   // near the end of a memory page and we have to avoid reading the next page.
 8166   __ add(ary1, ary1, len);
 8167   __ subs(len, len, 8);
 8168   __ br(Assembler::GT, LEN_OVER_8);
 8169   __ ldr(rscratch2, Address(ary1, -8));
 8170   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8171   __ lsrv(rscratch2, rscratch2, rscratch1);
 8172   __ tst(rscratch2, UPPER_BIT_MASK);
 8173   __ csel(result, zr, result, Assembler::NE);
 8174   __ leave();
 8175   __ ret(lr);
 8176   __ bind(LEN_OVER_8);
 8177   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 8178   __ sub(len, len, 8); // no data dependency, so the sub can execute while loading
 8179   __ tst(rscratch2, UPPER_BIT_MASK);
 8180   __ br(Assembler::NE, RET_NO_POP);
 8181   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8182   __ lsrv(rscratch1, rscratch1, rscratch2);
 8183   __ tst(rscratch1, UPPER_BIT_MASK);
 8184   __ bind(RET_NO_POP);
 8185   __ csel(result, zr, result, Assembler::NE);
 8186   __ leave();
 8187   __ ret(lr);
 8188 
 8189   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8190   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8191 
 8192   count_positives_long = __ pc(); // 2nd entry point
 8193 
 8194   __ enter();
 8195 
 8196   __ bind(LEN_OVER_15);
 8197     __ push(spilled_regs, sp);
 8198     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8199     __ cbz(rscratch2, ALIGNED);
 8200     __ ldp(tmp6, tmp1, Address(ary1));
 8201     __ mov(tmp5, 16);
 8202     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 8203     __ add(ary1, ary1, rscratch1);
 8204     __ orr(tmp6, tmp6, tmp1);
 8205     __ tst(tmp6, UPPER_BIT_MASK);
 8206     __ br(Assembler::NE, RET_ADJUST);
 8207     __ sub(len, len, rscratch1);
 8208 
 8209   __ bind(ALIGNED);
 8210     __ cmp(len, large_loop_size);
 8211     __ br(Assembler::LT, CHECK_16);
 8212     // Perform a 16-byte load as an early return in the pre-loop to handle the
 8213     // situation when an initially aligned large array has negative values at its
 8214     // starting bytes, in which case LARGE_LOOP would do 4 reads instead of 1 (in
 8215     // the worst case), which is slower. Cases with negative bytes further ahead
 8216     // won't be affected much. In fact, it'll be faster due to early loads, fewer
 8217     // instructions and fewer branches in LARGE_LOOP.
 8218     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8219     __ sub(len, len, 16);
 8220     __ orr(tmp6, tmp6, tmp1);
 8221     __ tst(tmp6, UPPER_BIT_MASK);
 8222     __ br(Assembler::NE, RET_ADJUST_16);
 8223     __ cmp(len, large_loop_size);
 8224     __ br(Assembler::LT, CHECK_16);
 8225 
 8226     if (SoftwarePrefetchHintDistance >= 0
 8227         && SoftwarePrefetchHintDistance >= dcache_line) {
 8228       // initial prefetch
 8229       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8230     }
 8231   __ bind(LARGE_LOOP);
 8232     if (SoftwarePrefetchHintDistance >= 0) {
 8233       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8234     }
 8235     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
 8236     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
 8237     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
 8238     // 3 instructions per iteration and has fewer branches, but this approach disables
 8239     // the early return, so all 64 bytes are loaded and checked every time.
 8240     __ ldp(tmp2, tmp3, Address(ary1));
 8241     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8242     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8243     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8244     __ add(ary1, ary1, large_loop_size);
 8245     __ sub(len, len, large_loop_size);
 8246     __ orr(tmp2, tmp2, tmp3);
 8247     __ orr(tmp4, tmp4, tmp5);
 8248     __ orr(rscratch1, rscratch1, rscratch2);
 8249     __ orr(tmp6, tmp6, tmp1);
 8250     __ orr(tmp2, tmp2, tmp4);
 8251     __ orr(rscratch1, rscratch1, tmp6);
 8252     __ orr(tmp2, tmp2, rscratch1);
 8253     __ tst(tmp2, UPPER_BIT_MASK);
 8254     __ br(Assembler::NE, RET_ADJUST_LONG);
 8255     __ cmp(len, large_loop_size);
 8256     __ br(Assembler::GE, LARGE_LOOP);
 8257 
 8258   __ bind(CHECK_16); // small 16-byte load pre-loop
 8259     __ cmp(len, (u1)16);
 8260     __ br(Assembler::LT, POST_LOOP16);
 8261 
 8262   __ bind(LOOP16); // small 16-byte load loop
 8263     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8264     __ sub(len, len, 16);
 8265     __ orr(tmp2, tmp2, tmp3);
 8266     __ tst(tmp2, UPPER_BIT_MASK);
 8267     __ br(Assembler::NE, RET_ADJUST_16);
 8268     __ cmp(len, (u1)16);
 8269     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8270 
 8271   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8272     __ cmp(len, (u1)8);
 8273     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8274     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8275     __ tst(tmp3, UPPER_BIT_MASK);
 8276     __ br(Assembler::NE, RET_ADJUST);
 8277     __ sub(len, len, 8);
 8278 
 8279   __ bind(POST_LOOP16_LOAD_TAIL);
 8280     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8281     __ ldr(tmp1, Address(ary1));
 8282     __ mov(tmp2, 64);
 8283     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8284     __ lslv(tmp1, tmp1, tmp4);
 8285     __ tst(tmp1, UPPER_BIT_MASK);
 8286     __ br(Assembler::NE, RET_ADJUST);
 8287     // Fallthrough
 8288 
 8289   __ bind(RET_LEN);
 8290     __ pop(spilled_regs, sp);
 8291     __ leave();
 8292     __ ret(lr);
 8293 
 8294     // The difference result - len is the count of bytes that are guaranteed to
 8295     // be positive.
 8296 
 8297   __ bind(RET_ADJUST_LONG);
 8298     __ add(len, len, (u1)(large_loop_size - 16));
 8299   __ bind(RET_ADJUST_16);
 8300     __ add(len, len, 16);
 8301   __ bind(RET_ADJUST);
 8302     __ pop(spilled_regs, sp);
 8303     __ leave();
 8304     __ sub(result, result, len);
 8305     __ ret(lr);
 8306 
 8307     return entry;
 8308   }
 8309 
 8310   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8311         bool usePrefetch, Label &NOT_EQUAL) {
 8312     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8313         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8314         tmp7 = r12, tmp8 = r13;
 8315     Label LOOP;
 8316 
 8317     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8318     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8319     __ bind(LOOP);
 8320     if (usePrefetch) {
 8321       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8322       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8323     }
 8324     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8325     __ eor(tmp1, tmp1, tmp2);
 8326     __ eor(tmp3, tmp3, tmp4);
 8327     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8328     __ orr(tmp1, tmp1, tmp3);
 8329     __ cbnz(tmp1, NOT_EQUAL);
 8330     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8331     __ eor(tmp5, tmp5, tmp6);
 8332     __ eor(tmp7, tmp7, tmp8);
 8333     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8334     __ orr(tmp5, tmp5, tmp7);
 8335     __ cbnz(tmp5, NOT_EQUAL);
 8336     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8337     __ eor(tmp1, tmp1, tmp2);
 8338     __ eor(tmp3, tmp3, tmp4);
 8339     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8340     __ orr(tmp1, tmp1, tmp3);
 8341     __ cbnz(tmp1, NOT_EQUAL);
 8342     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8343     __ eor(tmp5, tmp5, tmp6);
 8344     __ sub(cnt1, cnt1, 8 * wordSize);
 8345     __ eor(tmp7, tmp7, tmp8);
 8346     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8347     // tmp6 is not used. MacroAssembler::subs is used here (rather than
 8348     // cmp) because subs allows an unlimited range of immediate operands.
 8349     __ subs(tmp6, cnt1, loopThreshold);
 8350     __ orr(tmp5, tmp5, tmp7);
 8351     __ cbnz(tmp5, NOT_EQUAL);
 8352     __ br(__ GE, LOOP);
 8353     // post-loop
 8354     __ eor(tmp1, tmp1, tmp2);
 8355     __ eor(tmp3, tmp3, tmp4);
 8356     __ orr(tmp1, tmp1, tmp3);
 8357     __ sub(cnt1, cnt1, 2 * wordSize);
 8358     __ cbnz(tmp1, NOT_EQUAL);
 8359   }
 8360 
 8361   void generate_large_array_equals_loop_simd(int loopThreshold,
 8362         bool usePrefetch, Label &NOT_EQUAL) {
 8363     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8364         tmp2 = rscratch2;
 8365     Label LOOP;
 8366 
 8367     __ bind(LOOP);
 8368     if (usePrefetch) {
 8369       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8370       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8371     }
 8372     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8373     __ sub(cnt1, cnt1, 8 * wordSize);
 8374     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 8375     __ subs(tmp1, cnt1, loopThreshold);
 8376     __ eor(v0, __ T16B, v0, v4);
 8377     __ eor(v1, __ T16B, v1, v5);
 8378     __ eor(v2, __ T16B, v2, v6);
 8379     __ eor(v3, __ T16B, v3, v7);
 8380     __ orr(v0, __ T16B, v0, v1);
 8381     __ orr(v1, __ T16B, v2, v3);
 8382     __ orr(v0, __ T16B, v0, v1);
 8383     __ umov(tmp1, v0, __ D, 0);
 8384     __ umov(tmp2, v0, __ D, 1);
 8385     __ orr(tmp1, tmp1, tmp2);
 8386     __ cbnz(tmp1, NOT_EQUAL);
 8387     __ br(__ GE, LOOP);
 8388   }
 8389 
 8390   // a1 = r1 - array1 address
 8391   // a2 = r2 - array2 address
 8392   // result = r0 - return value. Already contains "false"
 8393   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 8394   // r3-r5 are reserved temporary registers
 8395   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
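        // Semantically the stub finishes comparing two arrays whose first wordSize
        // bytes have already been handled by the caller; a scalar sketch of the whole
        // operation (illustrative only, not the generated code):
        //
        //   bool arrays_equal(const uint8_t* a1, const uint8_t* a2, size_t len) {
        //     return memcmp(a1, a2, len) == 0;
        //   }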
 8396   address generate_large_array_equals() {
 8397     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8398         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8399         tmp7 = r12, tmp8 = r13;
 8400     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 8401         SMALL_LOOP, POST_LOOP;
 8402     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 8403     // threshold ensuring that at least 32 of the prefetched bytes are actually used
 8404     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 8405     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 8406     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 8407     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 8408         tmp5, tmp6, tmp7, tmp8);
 8409 
 8410     __ align(CodeEntryAlignment);
 8411 
 8412     StubId stub_id = StubId::stubgen_large_array_equals_id;
 8413     StubCodeMark mark(this, stub_id);
 8414 
 8415     address entry = __ pc();
 8416     __ enter();
 8417     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 8418     // also advance pointers to use post-increment instead of pre-increment
 8419     __ add(a1, a1, wordSize);
 8420     __ add(a2, a2, wordSize);
 8421     if (AvoidUnalignedAccesses) {
 8422       // Both implementations (SIMD/non-SIMD) use relatively large load
 8423       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
 8424       // time) on some CPUs when the address is not at least 16-byte aligned.
 8425       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
 8426       // load if needed for the 1st address to make it 16-byte aligned.
 8427       Label ALIGNED16;
 8428       __ tbz(a1, 3, ALIGNED16);
 8429       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8430       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8431       __ sub(cnt1, cnt1, wordSize);
 8432       __ eor(tmp1, tmp1, tmp2);
 8433       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 8434       __ bind(ALIGNED16);
 8435     }
 8436     if (UseSIMDForArrayEquals) {
 8437       if (SoftwarePrefetchHintDistance >= 0) {
 8438         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8439         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8440         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 8441             /* prfm = */ true, NOT_EQUAL);
 8442         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8443         __ br(__ LT, TAIL);
 8444       }
 8445       __ bind(NO_PREFETCH_LARGE_LOOP);
 8446       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 8447           /* prfm = */ false, NOT_EQUAL);
 8448     } else {
 8449       __ push(spilled_regs, sp);
 8450       if (SoftwarePrefetchHintDistance >= 0) {
 8451         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8452         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8453         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 8454             /* prfm = */ true, NOT_EQUAL);
 8455         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8456         __ br(__ LT, TAIL);
 8457       }
 8458       __ bind(NO_PREFETCH_LARGE_LOOP);
 8459       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 8460           /* prfm = */ false, NOT_EQUAL);
 8461     }
 8462     __ bind(TAIL);
 8463       __ cbz(cnt1, EQUAL);
 8464       __ subs(cnt1, cnt1, wordSize);
 8465       __ br(__ LE, POST_LOOP);
 8466     __ bind(SMALL_LOOP);
 8467       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8468       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8469       __ subs(cnt1, cnt1, wordSize);
 8470       __ eor(tmp1, tmp1, tmp2);
 8471       __ cbnz(tmp1, NOT_EQUAL);
 8472       __ br(__ GT, SMALL_LOOP);
 8473     __ bind(POST_LOOP);
 8474       __ ldr(tmp1, Address(a1, cnt1));
 8475       __ ldr(tmp2, Address(a2, cnt1));
 8476       __ eor(tmp1, tmp1, tmp2);
 8477       __ cbnz(tmp1, NOT_EQUAL);
 8478     __ bind(EQUAL);
 8479       __ mov(result, true);
 8480     __ bind(NOT_EQUAL);
 8481       if (!UseSIMDForArrayEquals) {
 8482         __ pop(spilled_regs, sp);
 8483       }
 8484     __ bind(NOT_EQUAL_NO_POP);
 8485     __ leave();
 8486     __ ret(lr);
 8487     return entry;
 8488   }
 8489 
 8490   // result = r0 - return value. Contains initial hashcode value on entry.
 8491   // ary = r1 - array address
 8492   // cnt = r2 - elements count
 8493   // Clobbers: v0-v13, rscratch1, rscratch2
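        // The stub folds the array into the standard Java polynomial hash, starting
        // from the incoming hash value; a scalar sketch (illustrative only):
        //
        //   int h = result;                  // initial hash passed in r0
        //   for (int i = 0; i < cnt; i++) {
        //     h = 31 * h + ary[i];           // element widened according to eltype
        //   }
        //   return h;                        // returned in r0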
 8494   address generate_large_arrays_hashcode(BasicType eltype) {
 8495     const Register result = r0, ary = r1, cnt = r2;
 8496     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 8497     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 8498     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 8499     const FloatRegister vpowm = v13;
 8500 
 8501     ARRAYS_HASHCODE_REGISTERS;
 8502 
 8503     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 8504 
 8505     unsigned int vf; // vectorization factor
 8506     bool multiply_by_halves;
 8507     Assembler::SIMD_Arrangement load_arrangement;
 8508     switch (eltype) {
 8509     case T_BOOLEAN:
 8510     case T_BYTE:
 8511       load_arrangement = Assembler::T8B;
 8512       multiply_by_halves = true;
 8513       vf = 8;
 8514       break;
 8515     case T_CHAR:
 8516     case T_SHORT:
 8517       load_arrangement = Assembler::T8H;
 8518       multiply_by_halves = true;
 8519       vf = 8;
 8520       break;
 8521     case T_INT:
 8522       load_arrangement = Assembler::T4S;
 8523       multiply_by_halves = false;
 8524       vf = 4;
 8525       break;
 8526     default:
 8527       ShouldNotReachHere();
 8528     }
 8529 
 8530     // Unroll factor
 8531     const unsigned uf = 4;
 8532 
 8533     // Effective vectorization factor
 8534     const unsigned evf = vf * uf;
 8535 
 8536     __ align(CodeEntryAlignment);
 8537 
 8538     StubId stub_id;
 8539     switch (eltype) {
 8540     case T_BOOLEAN:
 8541       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 8542       break;
 8543     case T_BYTE:
 8544       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 8545       break;
 8546     case T_CHAR:
 8547       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 8548       break;
 8549     case T_SHORT:
 8550       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 8551       break;
 8552     case T_INT:
 8553       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 8554       break;
 8555     default:
 8556       stub_id = StubId::NO_STUBID;
 8557       ShouldNotReachHere();
 8558     };
 8559 
 8560     StubCodeMark mark(this, stub_id);
 8561 
 8562     address entry = __ pc();
 8563     __ enter();
 8564 
 8565     // Put the 0th to 3rd powers of 31 together into a single SIMD register. The register will be used in
 8566     // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
 8567     // value shouldn't change throughout both loops.
 8568     __ movw(rscratch1, intpow(31U, 3));
 8569     __ mov(vpow, Assembler::S, 0, rscratch1);
 8570     __ movw(rscratch1, intpow(31U, 2));
 8571     __ mov(vpow, Assembler::S, 1, rscratch1);
 8572     __ movw(rscratch1, intpow(31U, 1));
 8573     __ mov(vpow, Assembler::S, 2, rscratch1);
 8574     __ movw(rscratch1, intpow(31U, 0));
 8575     __ mov(vpow, Assembler::S, 3, rscratch1);
 8576 
 8577     __ mov(vmul0, Assembler::T16B, 0);
 8578     __ mov(vmul0, Assembler::S, 3, result);
 8579 
 8580     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8581     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8582 
 8583     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8584     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8585 
 8586     // SMALL LOOP
 8587     __ bind(SMALL_LOOP);
 8588 
 8589     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8590     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8591     __ subsw(rscratch2, rscratch2, vf);
 8592 
 8593     if (load_arrangement == Assembler::T8B) {
 8594       // Extend 8B to 8H to be able to use vector multiply
 8595       // instructions
 8596       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8597       if (is_signed_subword_type(eltype)) {
 8598         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8599       } else {
 8600         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8601       }
 8602     }
 8603 
 8604     switch (load_arrangement) {
 8605     case Assembler::T4S:
 8606       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8607       break;
 8608     case Assembler::T8B:
 8609     case Assembler::T8H:
 8610       assert(is_subword_type(eltype), "subword type expected");
 8611       if (is_signed_subword_type(eltype)) {
 8612         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8613       } else {
 8614         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8615       }
 8616       break;
 8617     default:
 8618       __ should_not_reach_here();
 8619     }
 8620 
 8621     // Process the upper half of a vector
 8622     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8623       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8624       if (is_signed_subword_type(eltype)) {
 8625         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8626       } else {
 8627         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8628       }
 8629     }
 8630 
 8631     __ br(Assembler::HI, SMALL_LOOP);
 8632 
 8633     // SMALL LOOP'S EPILOGUE
 8634     __ lsr(rscratch2, cnt, exact_log2(evf));
 8635     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8636 
 8637     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8638     __ addv(vmul0, Assembler::T4S, vmul0);
 8639     __ umov(result, vmul0, Assembler::S, 0);
 8640 
 8641     // TAIL
 8642     __ bind(TAIL);
 8643 
 8644     // The andr computes cnt % vf. The shifted subtract places the branch target cnt % vf
 8645     // pairs of load + madd insns before BR_BASE, so only cnt % vf pairs are executed.
 8646     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 8647     __ andr(rscratch2, cnt, vf - 1);
 8648     __ bind(TAIL_SHORTCUT);
 8649     __ adr(rscratch1, BR_BASE);
 8650     // For Cortex-A53 the shift is 4 because 2 nops are generated, giving 4 instructions per pair.
 8651     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 8652     __ movw(rscratch2, 0x1f);
 8653     __ br(rscratch1);
 8654 
 8655     for (size_t i = 0; i < vf - 1; ++i) {
 8656       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8657                                    eltype);
 8658       __ maddw(result, result, rscratch2, rscratch1);
 8659       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 8660       // Generate 2nd nop to have 4 instructions per iteration.
 8661       if (VM_Version::supports_a53mac()) {
 8662         __ nop();
 8663       }
 8664     }
 8665     __ bind(BR_BASE);
 8666 
 8667     __ leave();
 8668     __ ret(lr);
 8669 
 8670     // LARGE LOOP
 8671     __ bind(LARGE_LOOP_PREHEADER);
 8672 
 8673     __ lsr(rscratch2, cnt, exact_log2(evf));
 8674 
 8675     if (multiply_by_halves) {
 8676       // 31^4 - multiplier between lower and upper parts of a register
 8677       __ movw(rscratch1, intpow(31U, vf / 2));
 8678       __ mov(vpowm, Assembler::S, 1, rscratch1);
 8679       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 8680       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8681       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8682     } else {
 8683       // 31^16
 8684       __ movw(rscratch1, intpow(31U, evf));
 8685       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8686     }
 8687 
 8688     __ mov(vmul3, Assembler::T16B, 0);
 8689     __ mov(vmul2, Assembler::T16B, 0);
 8690     __ mov(vmul1, Assembler::T16B, 0);
 8691 
 8692     __ bind(LARGE_LOOP);
 8693 
 8694     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8695     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8696     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8697     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8698 
 8699     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8700            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8701 
 8702     if (load_arrangement == Assembler::T8B) {
 8703       // Extend 8B to 8H to be able to use vector multiply
 8704       // instructions
 8705       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8706       if (is_signed_subword_type(eltype)) {
 8707         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8708         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8709         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8710         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8711       } else {
 8712         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8713         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8714         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8715         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8716       }
 8717     }
 8718 
 8719     switch (load_arrangement) {
 8720     case Assembler::T4S:
 8721       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8722       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8723       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8724       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8725       break;
 8726     case Assembler::T8B:
 8727     case Assembler::T8H:
 8728       assert(is_subword_type(eltype), "subword type expected");
 8729       if (is_signed_subword_type(eltype)) {
 8730         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8731         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8732         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8733         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8734       } else {
 8735         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8736         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8737         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8738         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8739       }
 8740       break;
 8741     default:
 8742       __ should_not_reach_here();
 8743     }
 8744 
 8745     // Process the upper half of a vector
 8746     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8747       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8748       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8749       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8750       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8751       if (is_signed_subword_type(eltype)) {
 8752         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8753         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8754         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8755         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8756       } else {
 8757         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8758         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8759         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8760         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8761       }
 8762     }
 8763 
 8764     __ subsw(rscratch2, rscratch2, 1);
 8765     __ br(Assembler::HI, LARGE_LOOP);
 8766 
 8767     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8768     __ addv(vmul3, Assembler::T4S, vmul3);
 8769     __ umov(result, vmul3, Assembler::S, 0);
 8770 
 8771     __ mov(rscratch2, intpow(31U, vf));
 8772 
 8773     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8774     __ addv(vmul2, Assembler::T4S, vmul2);
 8775     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8776     __ maddw(result, result, rscratch2, rscratch1);
 8777 
 8778     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8779     __ addv(vmul1, Assembler::T4S, vmul1);
 8780     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8781     __ maddw(result, result, rscratch2, rscratch1);
 8782 
 8783     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8784     __ addv(vmul0, Assembler::T4S, vmul0);
 8785     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8786     __ maddw(result, result, rscratch2, rscratch1);
 8787 
 8788     __ andr(rscratch2, cnt, vf - 1);
 8789     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8790 
 8791     __ leave();
 8792     __ ret(lr);
 8793 
 8794     return entry;
 8795   }
 8796 
 8797   address generate_dsin_dcos(bool isCos) {
 8798     __ align(CodeEntryAlignment);
 8799     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 8800     StubCodeMark mark(this, stub_id);
 8801     address start = __ pc();
 8802     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8803         (address)StubRoutines::aarch64::_two_over_pi,
 8804         (address)StubRoutines::aarch64::_pio2,
 8805         (address)StubRoutines::aarch64::_dsin_coef,
 8806         (address)StubRoutines::aarch64::_dcos_coef);
 8807     return start;
 8808   }
 8809 
 8810   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings.
 8811   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8812       Label &DIFF2) {
 8813     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8814     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8815 
 8816     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8817     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8818     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8819     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 8820 
 8821     __ fmovd(tmpL, vtmp3);
 8822     __ eor(rscratch2, tmp3, tmpL);
 8823     __ cbnz(rscratch2, DIFF2);
 8824 
 8825     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8826     __ umov(tmpL, vtmp3, __ D, 1);
 8827     __ eor(rscratch2, tmpU, tmpL);
 8828     __ cbnz(rscratch2, DIFF1);
 8829 
 8830     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8831     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8832     __ fmovd(tmpL, vtmp);
 8833     __ eor(rscratch2, tmp3, tmpL);
 8834     __ cbnz(rscratch2, DIFF2);
 8835 
 8836     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8837     __ umov(tmpL, vtmp, __ D, 1);
 8838     __ eor(rscratch2, tmpU, tmpL);
 8839     __ cbnz(rscratch2, DIFF1);
 8840   }
 8841 
 8842   // r0  = result
 8843   // r1  = str1
 8844   // r2  = cnt1
 8845   // r3  = str2
 8846   // r4  = cnt2
 8847   // r10 = tmp1
 8848   // r11 = tmp2
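        // One operand is a Latin1 string and the other is UTF-16 (isLU selects which);
        // at the first differing character pair the stub returns their difference,
        // otherwise the incoming value of result is returned unchanged. A rough scalar
        // sketch of the mismatch case (illustrative only):
        //
        //   for (int i = 0; i < charsToCompare; i++) {
        //     int d = (int)str1[i] - (int)str2[i];   // characters zero-extended to int
        //     if (d != 0) return d;                  // first difference wins
        //   }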
 8849   address generate_compare_long_string_different_encoding(bool isLU) {
 8850     __ align(CodeEntryAlignment);
 8851     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 8852     StubCodeMark mark(this, stub_id);
 8853     address entry = __ pc();
 8854     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8855         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8856         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8857     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8858         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8859     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8860     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8861 
 8862     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8863 
 8864     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
 8865     // cnt2 == amount of characters left to compare
 8866     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
 8867     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8868     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8869     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8870     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 8871     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
 8872     __ eor(rscratch2, tmp1, tmp2);
 8873     __ mov(rscratch1, tmp2);
 8874     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8875     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8876              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8877     __ push(spilled_regs, sp);
 8878     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8879     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8880 
 8881     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8882 
 8883     if (SoftwarePrefetchHintDistance >= 0) {
 8884       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8885       __ br(__ LT, NO_PREFETCH);
 8886       __ bind(LARGE_LOOP_PREFETCH);
 8887         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8888         __ mov(tmp4, 2);
 8889         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8890         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8891           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8892           __ subs(tmp4, tmp4, 1);
 8893           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8894           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8895           __ mov(tmp4, 2);
 8896         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8897           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8898           __ subs(tmp4, tmp4, 1);
 8899           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8900           __ sub(cnt2, cnt2, 64);
 8901           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8902           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8903     }
 8904     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8905     __ bind(NO_PREFETCH);
 8906     __ subs(cnt2, cnt2, 16);
 8907     __ br(__ LT, TAIL);
 8908     __ align(OptoLoopAlignment);
 8909     __ bind(SMALL_LOOP); // smaller loop
 8910       __ subs(cnt2, cnt2, 16);
 8911       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8912       __ br(__ GE, SMALL_LOOP);
 8913       __ cmn(cnt2, (u1)16);
 8914       __ br(__ EQ, LOAD_LAST);
 8915     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8916       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8917       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8918       __ ldr(tmp3, Address(cnt1, -8));
 8919       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8920       __ b(LOAD_LAST);
 8921     __ bind(DIFF2);
 8922       __ mov(tmpU, tmp3);
 8923     __ bind(DIFF1);
 8924       __ pop(spilled_regs, sp);
 8925       __ b(CALCULATE_DIFFERENCE);
 8926     __ bind(LOAD_LAST);
 8927       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
 8928       // No need to load them again
 8929       __ mov(tmpU, tmp3);
 8930       __ pop(spilled_regs, sp);
 8931 
 8932       // tmp2 points to the address of the last 4 Latin1 characters right now
 8933       __ ldrs(vtmp, Address(tmp2));
 8934       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8935       __ fmovd(tmpL, vtmp);
 8936 
 8937       __ eor(rscratch2, tmpU, tmpL);
 8938       __ cbz(rscratch2, DONE);
 8939 
 8940     // Find the first different characters in the longwords and
 8941     // compute their difference.
 8942     __ bind(CALCULATE_DIFFERENCE);
 8943       __ rev(rscratch2, rscratch2);
 8944       __ clz(rscratch2, rscratch2);
 8945       __ andr(rscratch2, rscratch2, -16);
 8946       __ lsrv(tmp1, tmp1, rscratch2);
 8947       __ uxthw(tmp1, tmp1);
 8948       __ lsrv(rscratch1, rscratch1, rscratch2);
 8949       __ uxthw(rscratch1, rscratch1);
 8950       __ subw(result, tmp1, rscratch1);
 8951     __ bind(DONE);
 8952       __ ret(lr);
 8953     return entry;
 8954   }
 8955 
 8956   // r0 = input (float16)
 8957   // v0 = result (float)
 8958   // v1 = temporary float register
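        // For example, the binary16 bit pattern 0x3C00 widens to 1.0f and 0xC000
        // widens to -2.0f (IEEE 754 half precision to single precision).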
 8959   address generate_float16ToFloat() {
 8960     __ align(CodeEntryAlignment);
 8961     StubId stub_id = StubId::stubgen_hf2f_id;
 8962     StubCodeMark mark(this, stub_id);
 8963     address entry = __ pc();
 8964     BLOCK_COMMENT("Entry:");
 8965     __ flt16_to_flt(v0, r0, v1);
 8966     __ ret(lr);
 8967     return entry;
 8968   }
 8969 
 8970   // v0 = input (float)
 8971   // r0 = result (float16)
 8972   // v1 = temporary float register
 8973   address generate_floatToFloat16() {
 8974     __ align(CodeEntryAlignment);
 8975     StubId stub_id = StubId::stubgen_f2hf_id;
 8976     StubCodeMark mark(this, stub_id);
 8977     address entry = __ pc();
 8978     BLOCK_COMMENT("Entry:");
 8979     __ flt_to_flt16(r0, v0, v1);
 8980     __ ret(lr);
 8981     return entry;
 8982   }
 8983 
 8984   address generate_method_entry_barrier() {
 8985     __ align(CodeEntryAlignment);
 8986     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 8987     StubCodeMark mark(this, stub_id);
 8988 
 8989     Label deoptimize_label;
 8990 
 8991     address start = __ pc();
 8992 
 8993     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 8994 
 8995     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 8996       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 8997       // We can get here despite the nmethod being good, if we have not
 8998       // yet applied our cross modification fence (or data fence).
 8999       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 9000       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 9001       __ ldrw(rscratch2, rscratch2);
 9002       __ strw(rscratch2, thread_epoch_addr);
 9003       __ isb();
 9004       __ membar(__ LoadLoad);
 9005     }
 9006 
 9007     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 9008 
 9009     __ enter();
 9010     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 9011 
 9012     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 9013 
 9014     __ push_call_clobbered_registers();
 9015 
 9016     __ mov(c_rarg0, rscratch2);
 9017     __ call_VM_leaf
 9018          (CAST_FROM_FN_PTR
 9019           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 9020 
 9021     __ reset_last_Java_frame(true);
 9022 
 9023     __ mov(rscratch1, r0);
 9024 
 9025     __ pop_call_clobbered_registers();
 9026 
 9027     __ cbnz(rscratch1, deoptimize_label);
 9028 
 9029     __ leave();
 9030     __ ret(lr);
 9031 
 9032     __ BIND(deoptimize_label);
 9033 
 9034     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 9035     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 9036 
 9037     __ mov(sp, rscratch1);
 9038     __ br(rscratch2);
 9039 
 9040     return start;
 9041   }
 9042 
 9043   // r0  = result
 9044   // r1  = str1
 9045   // r2  = cnt1
 9046   // r3  = str2
 9047   // r4  = cnt2
 9048   // r10 = tmp1
 9049   // r11 = tmp2
 9050   address generate_compare_long_string_same_encoding(bool isLL) {
 9051     __ align(CodeEntryAlignment);
 9052     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 9053     StubCodeMark mark(this, stub_id);
 9054     address entry = __ pc();
 9055     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9056         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9057 
 9058     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9059 
 9060     // Exit the large loop when fewer than 64 bytes are left to read or we're about
 9061     // to prefetch memory beyond the array border.
 9062     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 9063 
 9064     // The caller has already pre-loaded 8 bytes before jumping to the stub, so compare them directly.
 9065     __ eor(rscratch2, tmp1, tmp2);
 9066     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9067 
 9068     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9069     // update pointers, because of previous read
 9070     __ add(str1, str1, wordSize);
 9071     __ add(str2, str2, wordSize);
 9072     if (SoftwarePrefetchHintDistance >= 0) {
 9073       __ align(OptoLoopAlignment);
 9074       __ bind(LARGE_LOOP_PREFETCH);
 9075         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9076         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9077 
 9078         for (int i = 0; i < 4; i++) {
 9079           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9080           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9081           __ cmp(tmp1, tmp2);
 9082           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9083           __ br(Assembler::NE, DIFF);
 9084         }
 9085         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9086         __ add(str1, str1, 64);
 9087         __ add(str2, str2, 64);
 9088         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9089         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9090         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9091     }
 9092 
 9093     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9094     __ br(Assembler::LE, LESS16);
 9095     __ align(OptoLoopAlignment);
 9096     __ bind(LOOP_COMPARE16);
 9097       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9098       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9099       __ cmp(tmp1, tmp2);
 9100       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9101       __ br(Assembler::NE, DIFF);
 9102       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9103       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9104       __ br(Assembler::LT, LESS16);
 9105 
 9106       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9107       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9108       __ cmp(tmp1, tmp2);
 9109       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9110       __ br(Assembler::NE, DIFF);
 9111       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9112       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9113       __ br(Assembler::GE, LOOP_COMPARE16);
 9114       __ cbz(cnt2, LENGTH_DIFF);
 9115 
 9116     __ bind(LESS16);
 9117       // compare in 8-byte chunks
 9118       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9119       __ br(Assembler::LE, LESS8);
 9120       __ ldr(tmp1, Address(__ post(str1, 8)));
 9121       __ ldr(tmp2, Address(__ post(str2, 8)));
 9122       __ eor(rscratch2, tmp1, tmp2);
 9123       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9124       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9125 
 9126     __ bind(LESS8); // directly load last 8 bytes
 9127       if (!isLL) {
 9128         __ add(cnt2, cnt2, cnt2);
 9129       }
 9130       __ ldr(tmp1, Address(str1, cnt2));
 9131       __ ldr(tmp2, Address(str2, cnt2));
 9132       __ eor(rscratch2, tmp1, tmp2);
 9133       __ cbz(rscratch2, LENGTH_DIFF);
 9134       __ b(CAL_DIFFERENCE);
 9135 
 9136     __ bind(DIFF);
 9137       __ cmp(tmp1, tmp2);
 9138       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9139       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9140       // reuse rscratch2 register for the result of eor instruction
 9141       __ eor(rscratch2, tmp1, tmp2);
 9142 
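          // Locate the first differing element inside the mismatching 64-bit words with
          // the rev + clz trick below; a scalar sketch for the LL case (illustrative,
          // with x = tmp1 ^ tmp2 known to be non-zero at this point):
          //
          //   int bit = __builtin_clzll(__builtin_bswap64(x)) & ~7; // offset of first differing byte
          //   return (int)((tmp1 >> bit) & 0xff) - (int)((tmp2 >> bit) & 0xff);
          //
          // For the UU case the mask is ~15 and the extracted elements are 16 bits wide.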
 9143     __ bind(CAL_DIFFERENCE);
 9144       __ rev(rscratch2, rscratch2);
 9145       __ clz(rscratch2, rscratch2);
 9146       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9147       __ lsrv(tmp1, tmp1, rscratch2);
 9148       __ lsrv(tmp2, tmp2, rscratch2);
 9149       if (isLL) {
 9150         __ uxtbw(tmp1, tmp1);
 9151         __ uxtbw(tmp2, tmp2);
 9152       } else {
 9153         __ uxthw(tmp1, tmp1);
 9154         __ uxthw(tmp2, tmp2);
 9155       }
 9156       __ subw(result, tmp1, tmp2);
 9157 
 9158     __ bind(LENGTH_DIFF);
 9159       __ ret(lr);
 9160     return entry;
 9161   }
 9162 
 9163   enum string_compare_mode {
 9164     LL,
 9165     LU,
 9166     UL,
 9167     UU,
 9168   };
 9169 
 9170   // The following registers are declared in aarch64.ad
 9171   // r0  = result
 9172   // r1  = str1
 9173   // r2  = cnt1
 9174   // r3  = str2
 9175   // r4  = cnt2
 9176   // r10 = tmp1
 9177   // r11 = tmp2
 9178   // z0  = ztmp1
 9179   // z1  = ztmp2
 9180   // p0  = pgtmp1
 9181   // p1  = pgtmp2
 9182   address generate_compare_long_string_sve(string_compare_mode mode) {
 9183     StubId stub_id;
 9184     switch (mode) {
 9185       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9186       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9187       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9188       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9189       default: ShouldNotReachHere();
 9190     }
 9191 
 9192     __ align(CodeEntryAlignment);
 9193     address entry = __ pc();
 9194     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9195              tmp1 = r10, tmp2 = r11;
 9196 
 9197     Label LOOP, DONE, MISMATCH;
 9198     Register vec_len = tmp1;
 9199     Register idx = tmp2;
 9200     // The minimum of the string lengths has been stored in cnt2.
 9201     Register cnt = cnt2;
 9202     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9203     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9204 
 9205 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9206     switch (mode) {                                                            \
 9207       case LL:                                                                 \
 9208         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9209         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9210         break;                                                                 \
 9211       case LU:                                                                 \
 9212         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9213         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9214         break;                                                                 \
 9215       case UL:                                                                 \
 9216         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9217         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9218         break;                                                                 \
 9219       case UU:                                                                 \
 9220         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9221         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9222         break;                                                                 \
 9223       default:                                                                 \
 9224         ShouldNotReachHere();                                                  \
 9225     }
 9226 
 9227     StubCodeMark mark(this, stub_id);
 9228 
 9229     __ mov(idx, 0);
 9230     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9231 
 9232     if (mode == LL) {
 9233       __ sve_cntb(vec_len);
 9234     } else {
 9235       __ sve_cnth(vec_len);
 9236     }
 9237 
 9238     __ sub(rscratch1, cnt, vec_len);
 9239 
 9240     __ bind(LOOP);
 9241 
 9242       // main loop
 9243       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9244       __ add(idx, idx, vec_len);
 9245       // Compare strings.
 9246       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9247       __ br(__ NE, MISMATCH);
 9248       __ cmp(idx, rscratch1);
 9249       __ br(__ LT, LOOP);
 9250 
 9251     // post loop, last iteration
 9252     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9253 
 9254     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9255     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9256     __ br(__ EQ, DONE);
 9257 
 9258     __ bind(MISMATCH);
 9259 
 9260     // Build a predicate covering the lanes before the first mismatch.
 9261     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9262     // Extract the first different characters of each string.
 9263     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9264     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9265 
 9266     // Compute the difference of the first different characters.
 9267     __ sub(result, rscratch1, rscratch2);
 9268 
 9269     __ bind(DONE);
 9270     __ ret(lr);
 9271 #undef LOAD_PAIR
 9272     return entry;
 9273   }
 9274 
 9275   void generate_compare_long_strings() {
 9276     if (UseSVE == 0) {
 9277       StubRoutines::aarch64::_compare_long_string_LL
 9278           = generate_compare_long_string_same_encoding(true);
 9279       StubRoutines::aarch64::_compare_long_string_UU
 9280           = generate_compare_long_string_same_encoding(false);
 9281       StubRoutines::aarch64::_compare_long_string_LU
 9282           = generate_compare_long_string_different_encoding(true);
 9283       StubRoutines::aarch64::_compare_long_string_UL
 9284           = generate_compare_long_string_different_encoding(false);
 9285     } else {
 9286       StubRoutines::aarch64::_compare_long_string_LL
 9287           = generate_compare_long_string_sve(LL);
 9288       StubRoutines::aarch64::_compare_long_string_UU
 9289           = generate_compare_long_string_sve(UU);
 9290       StubRoutines::aarch64::_compare_long_string_LU
 9291           = generate_compare_long_string_sve(LU);
 9292       StubRoutines::aarch64::_compare_long_string_UL
 9293           = generate_compare_long_string_sve(UL);
 9294     }
 9295   }
 9296 
 9297   // R0 = result
 9298   // R1 = str2
 9299   // R2 = cnt1
 9300   // R3 = str1
 9301   // R4 = cnt2
 9302   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 9303   //
 9304   // This generic linear code uses a few additional ideas which make it faster:
 9305   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
 9306   // in order to skip the initial load (helps on systems with 1 load pipeline)
 9307   // 2) we can use a "fast" algorithm for finding a single character to search for the
 9308   // first symbol, with fewer branches (1 branch per loaded register instead of a
 9309   // branch per symbol); this is where constants like 0x0101...01, 0x00010001...0001,
 9310   // 0x7f7f...7f, 0x7fff7fff...7fff come from (see the SWAR sketch below)
 9311   // 3) after loading and analyzing the 1st register of the source string, it can be
 9312   // reused to search for every occurrence of the 1st character, saving a few loads
 9313   // compared with a "simpler-but-slower" implementation
 9314   // 4) in order to avoid lots of push/pop operations, the code below heavily
 9315   // reuses/reinitializes/compresses register values, which makes the code larger
 9316   // and a bit less readable; however, most of the extra operations are issued
 9317   // during loads or branches, so the penalty is minimal
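        // The "fast" first-character search in (2) is the classic SWAR zero-detection
        // test applied to (chunk ^ splat(first)); a sketch for the byte case
        // (illustrative only; the code below uses the equivalent orr/bics form):
        //
        //   uint64_t x    = chunk ^ (first * 0x0101010101010101ull);
        //   uint64_t hits = (x - 0x0101010101010101ull) & ~x & 0x8080808080808080ull;
        //   // the lowest 0x80 bit set in 'hits' marks the first byte equal to 'first';
        //   // higher bits may be false positives, filtered out by the compare loops
        //
        // The 16-bit variant uses 0x0001000100010001 and 0x7fff7fff7fff7fff instead.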
 9318   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 9319     StubId stub_id;
 9320     if (str1_isL) {
 9321       if (str2_isL) {
 9322         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
 9323       } else {
 9324         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
 9325       }
 9326     } else {
 9327       if (str2_isL) {
 9328         ShouldNotReachHere();
 9329       } else {
 9330         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
 9331       }
 9332     }
 9333     __ align(CodeEntryAlignment);
 9334     StubCodeMark mark(this, stub_id);
 9335     address entry = __ pc();
 9336 
 9337     int str1_chr_size = str1_isL ? 1 : 2;
 9338     int str2_chr_size = str2_isL ? 1 : 2;
 9339     int str1_chr_shift = str1_isL ? 0 : 1;
 9340     int str2_chr_shift = str2_isL ? 0 : 1;
 9341     bool isL = str1_isL && str2_isL;
 9342    // parameters
 9343     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 9344     // temporary registers
 9345     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 9346     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 9347     // redefinitions
 9348     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 9349 
 9350     __ push(spilled_regs, sp);
 9351     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 9352         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 9353         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 9354         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 9355         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 9356         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 9357     // Read whole register from str1. It is safe, because length >=8 here
 9358     __ ldr(ch1, Address(str1));
 9359     // Read whole register from str2. It is safe, because length >=8 here
 9360     __ ldr(ch2, Address(str2));
 9361     __ sub(cnt2, cnt2, cnt1);
 9362     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 9363     if (str1_isL != str2_isL) {
 9364       __ eor(v0, __ T16B, v0, v0);
 9365     }
 9366     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 9367     __ mul(first, first, tmp1);
 9368     // check if we have less than 1 register to check
 9369     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 9370     if (str1_isL != str2_isL) {
 9371       __ fmovd(v1, ch1);
 9372     }
 9373     __ br(__ LE, L_SMALL);
 9374     __ eor(ch2, first, ch2);
 9375     if (str1_isL != str2_isL) {
 9376       __ zip1(v1, __ T16B, v1, v0);
 9377     }
 9378     __ sub(tmp2, ch2, tmp1);
 9379     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9380     __ bics(tmp2, tmp2, ch2);
 9381     if (str1_isL != str2_isL) {
 9382       __ fmovd(ch1, v1);
 9383     }
 9384     __ br(__ NE, L_HAS_ZERO);
 9385     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9386     __ add(result, result, wordSize/str2_chr_size);
 9387     __ add(str2, str2, wordSize);
 9388     __ br(__ LT, L_POST_LOOP);
 9389     __ BIND(L_LOOP);
 9390       __ ldr(ch2, Address(str2));
 9391       __ eor(ch2, first, ch2);
 9392       __ sub(tmp2, ch2, tmp1);
 9393       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9394       __ bics(tmp2, tmp2, ch2);
 9395       __ br(__ NE, L_HAS_ZERO);
 9396     __ BIND(L_LOOP_PROCEED);
 9397       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9398       __ add(str2, str2, wordSize);
 9399       __ add(result, result, wordSize/str2_chr_size);
 9400       __ br(__ GE, L_LOOP);
 9401     __ BIND(L_POST_LOOP);
 9402       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 9403       __ br(__ LE, NOMATCH);
 9404       __ ldr(ch2, Address(str2));
 9405       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9406       __ eor(ch2, first, ch2);
 9407       __ sub(tmp2, ch2, tmp1);
 9408       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9409       __ mov(tmp4, -1); // all bits set
 9410       __ b(L_SMALL_PROCEED);
 9411     __ align(OptoLoopAlignment);
 9412     __ BIND(L_SMALL);
 9413       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9414       __ eor(ch2, first, ch2);
 9415       if (str1_isL != str2_isL) {
 9416         __ zip1(v1, __ T16B, v1, v0);
 9417       }
 9418       __ sub(tmp2, ch2, tmp1);
 9419       __ mov(tmp4, -1); // all bits set
 9420       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9421       if (str1_isL != str2_isL) {
 9422         __ fmovd(ch1, v1); // move converted 4 symbols
 9423       }
 9424     __ BIND(L_SMALL_PROCEED);
 9425       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 9426       __ bic(tmp2, tmp2, ch2);
 9427       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9428       __ rbit(tmp2, tmp2);
 9429       __ br(__ EQ, NOMATCH);
 9430     __ BIND(L_SMALL_HAS_ZERO_LOOP);
 9431       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 9432       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9433       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9434       if (str2_isL) { // LL
 9435         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9436         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9437         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9438         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9439         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9440       } else {
 9441         __ mov(ch2, 0xE); // all bits in byte set except last one
 9442         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9443         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9444         __ lslv(tmp2, tmp2, tmp4);
 9445         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9446         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9447         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9448         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9449       }
 9450       __ cmp(ch1, ch2);
 9451       __ mov(tmp4, wordSize/str2_chr_size);
 9452       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9453     __ BIND(L_SMALL_CMP_LOOP);
 9454       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9455                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9456       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9457                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9458       __ add(tmp4, tmp4, 1);
 9459       __ cmp(tmp4, cnt1);
 9460       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9461       __ cmp(first, ch2);
 9462       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9463     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9464       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9465       __ clz(tmp4, tmp2);
 9466       __ add(result, result, 1); // advance index
 9467       __ add(str2, str2, str2_chr_size); // advance pointer
 9468       __ b(L_SMALL_HAS_ZERO_LOOP);
 9469     __ align(OptoLoopAlignment);
 9470     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9471       __ cmp(first, ch2);
 9472       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9473       __ b(DONE);
 9474     __ align(OptoLoopAlignment);
 9475     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9476       if (str2_isL) { // LL
 9477         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9478         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9479         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9480         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9481         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9482       } else {
 9483         __ mov(ch2, 0xE); // all bits in byte set except last one
 9484         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9485         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9486         __ lslv(tmp2, tmp2, tmp4);
 9487         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9488         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9489         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9490         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9491       }
 9492       __ cmp(ch1, ch2);
 9493       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9494       __ b(DONE);
 9495     __ align(OptoLoopAlignment);
 9496     __ BIND(L_HAS_ZERO);
 9497       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long; up to 4 cycles on some CPUs
      // Now compress the two counters (cnt1 and cnt2) into one register. This
      // is fine because both counters are 32-bit values and neither is changed
      // in this loop; they are restored on exit. cnt1 can therefore be re-used
      // inside the loop.
 9502       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 9503       __ sub(result, result, 1);
 9504     __ BIND(L_HAS_ZERO_LOOP);
 9505       __ mov(cnt1, wordSize/str2_chr_size);
 9506       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9507       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9508       if (str2_isL) {
 9509         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9510         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9511         __ lslv(tmp2, tmp2, tmp4);
 9512         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9513         __ add(tmp4, tmp4, 1);
 9514         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9515         __ lsl(tmp2, tmp2, 1);
 9516         __ mov(tmp4, wordSize/str2_chr_size);
 9517       } else {
 9518         __ mov(ch2, 0xE);
 9519         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9520         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9521         __ lslv(tmp2, tmp2, tmp4);
 9522         __ add(tmp4, tmp4, 1);
 9523         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9524         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9525         __ lsl(tmp2, tmp2, 1);
 9526         __ mov(tmp4, wordSize/str2_chr_size);
 9527         __ sub(str2, str2, str2_chr_size);
 9528       }
 9529       __ cmp(ch1, ch2);
 9530       __ mov(tmp4, wordSize/str2_chr_size);
 9531       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9532     __ BIND(L_CMP_LOOP);
 9533       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9534                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9535       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9536                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9537       __ add(tmp4, tmp4, 1);
 9538       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9539       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9540       __ cmp(cnt1, ch2);
 9541       __ br(__ EQ, L_CMP_LOOP);
 9542     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at the current position
 9544       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9545       __ clz(tmp4, tmp2);
 9546       __ add(str2, str2, str2_chr_size); // advance pointer
 9547       __ b(L_HAS_ZERO_LOOP);
 9548     __ align(OptoLoopAlignment);
 9549     __ BIND(L_CMP_LOOP_LAST_CMP);
 9550       __ cmp(cnt1, ch2);
 9551       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9552       __ b(DONE);
 9553     __ align(OptoLoopAlignment);
 9554     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9555       if (str2_isL) {
 9556         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9557         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9558         __ lslv(tmp2, tmp2, tmp4);
 9559         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9560         __ add(tmp4, tmp4, 1);
 9561         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9562         __ lsl(tmp2, tmp2, 1);
 9563       } else {
 9564         __ mov(ch2, 0xE);
 9565         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9566         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9567         __ lslv(tmp2, tmp2, tmp4);
 9568         __ add(tmp4, tmp4, 1);
 9569         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9570         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9571         __ lsl(tmp2, tmp2, 1);
 9572         __ sub(str2, str2, str2_chr_size);
 9573       }
 9574       __ cmp(ch1, ch2);
 9575       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9576       __ b(DONE);
 9577     __ align(OptoLoopAlignment);
 9578     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. Before the L_HAS_ZERO block the index
      //    was a multiple of wordSize/str2_chr_size. One 8-byte chunk (octet)
      //    of str2 was analyzed in L_HAS_ZERO_LOOP, so result was increased by
      //    at most wordSize/str2_chr_size - 1 and the higher bits are
      //    unchanged. L_LOOP_PROCEED will advance result by the number of
      //    characters analyzed, so it is enough to clear the lower bits of
      //    result here: 2 bits for UU/UL and 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Rewind str2 to the start of the current octet: result & 7 (LL) or
      //    result & 3 (UU/UL) is the index of the last analyzed position
      //    inside the current octet, and L_LOOP_PROCEED then advances str2 to
      //    the next octet.
 9589       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9590       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9591       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9592       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9593       __ movw(cnt2, cnt2);
 9594       __ b(L_LOOP_PROCEED);
 9595     __ align(OptoLoopAlignment);
 9596     __ BIND(NOMATCH);
 9597       __ mov(result, -1);
 9598     __ BIND(DONE);
 9599       __ pop(spilled_regs, sp);
 9600       __ ret(lr);
 9601     return entry;
 9602   }
 9603 
 9604   void generate_string_indexof_stubs() {
 9605     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9606     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9607     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9608   }
 9609 
 9610   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9611       FloatRegister src1, FloatRegister src2) {
 9612     Register dst = r1;
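    // zip1/zip2 against v0 (which the caller keeps zeroed) interleave each
    // data byte with a zero byte, which is exactly the little-endian widening
    // of a Latin-1 byte to a UTF-16 code unit. The four results cover the 32
    // input bytes in src1/src2 and are stored as 64 bytes at dst.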
 9613     __ zip1(v1, __ T16B, src1, v0);
 9614     __ zip2(v2, __ T16B, src1, v0);
 9615     if (generatePrfm) {
 9616       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9617     }
 9618     __ zip1(v3, __ T16B, src2, v0);
 9619     __ zip2(v4, __ T16B, src2, v0);
 9620     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9621   }
 9622 
 9623   // R0 = src
 9624   // R1 = dst
 9625   // R2 = len
 9626   // R3 = len >> 3
 9627   // V0 = 0
 9628   // v1 = loaded 8 bytes
 9629   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 9630   address generate_large_byte_array_inflate() {
 9631     __ align(CodeEntryAlignment);
 9632     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
 9633     StubCodeMark mark(this, stub_id);
 9634     address entry = __ pc();
 9635     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9636     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9637     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9638 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
 9641     __ ldrd(v2, __ post(src, 8));
 9642     __ sub(octetCounter, octetCounter, 2);
 9643     __ zip1(v1, __ T16B, v1, v0);
 9644     __ zip1(v2, __ T16B, v2, v0);
 9645     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9646     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9647     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9648     __ br(__ LE, LOOP_START);
 9649     __ b(LOOP_PRFM_START);
 9650     __ bind(LOOP_PRFM);
 9651       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9652     __ bind(LOOP_PRFM_START);
 9653       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9654       __ sub(octetCounter, octetCounter, 8);
 9655       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9656       inflate_and_store_2_fp_registers(true, v3, v4);
 9657       inflate_and_store_2_fp_registers(true, v5, v6);
 9658       __ br(__ GT, LOOP_PRFM);
 9659       __ cmp(octetCounter, (u1)8);
 9660       __ br(__ LT, DONE);
 9661     __ bind(LOOP);
 9662       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9663       __ bind(LOOP_START);
 9664       __ sub(octetCounter, octetCounter, 8);
 9665       __ cmp(octetCounter, (u1)8);
 9666       inflate_and_store_2_fp_registers(false, v3, v4);
 9667       inflate_and_store_2_fp_registers(false, v5, v6);
 9668       __ br(__ GE, LOOP);
 9669     __ bind(DONE);
 9670       __ ret(lr);
 9671     return entry;
 9672   }
 9673 
 9674   /**
 9675    *  Arguments:
 9676    *
 9677    *  Input:
 9678    *  c_rarg0   - current state address
 9679    *  c_rarg1   - H key address
 9680    *  c_rarg2   - data address
 9681    *  c_rarg3   - number of blocks
 9682    *
 9683    *  Output:
 9684    *  Updated state at c_rarg0
 9685    */
 9686   address generate_ghash_processBlocks() {
 9687     // Bafflingly, GCM uses little-endian for the byte order, but
 9688     // big-endian for the bit order.  For example, the polynomial 1 is
 9689     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9690     //
 9691     // So, we must either reverse the bytes in each word and do
 9692     // everything big-endian or reverse the bits in each byte and do
 9693     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9694     // the bits in each byte (we have an instruction, RBIT, to do
 9695     // that) and keep the data in little-endian bit order through the
 9696     // calculation, bit-reversing the inputs and outputs.
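    //
    // In other words, for each 16-byte block the stub computes the standard
    // GHASH recurrence  state <- (state ^ block) * H  in GF(2^128), keeping
    // all values in the bit-reversed representation described above.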
 9697 
 9698     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
 9699     StubCodeMark mark(this, stub_id);
 9700     Label polynomial; // local data generated at end of stub
 9701     __ align(CodeEntryAlignment);
 9702     address start = __ pc();
 9703 
 9704     Register state   = c_rarg0;
 9705     Register subkeyH = c_rarg1;
 9706     Register data    = c_rarg2;
 9707     Register blocks  = c_rarg3;
 9708 
 9709     FloatRegister vzr = v30;
 9710     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9711 
 9712     __ adr(rscratch1, polynomial);
 9713     __ ldrq(v24, rscratch1);    // The field polynomial
 9714 
 9715     __ ldrq(v0, Address(state));
 9716     __ ldrq(v1, Address(subkeyH));
 9717 
 9718     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9719     __ rbit(v0, __ T16B, v0);
 9720     __ rev64(v1, __ T16B, v1);
 9721     __ rbit(v1, __ T16B, v1);
 9722 
 9723     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 9724     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 9725 
 9726     {
 9727       Label L_ghash_loop;
 9728       __ bind(L_ghash_loop);
 9729 
 9730       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9731                                                  // reversing each byte
 9732       __ rbit(v2, __ T16B, v2);
 9733       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9734 
 9735       // Multiply state in v2 by subkey in v1
 9736       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9737                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9738                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9739       // Reduce v7:v5 by the field polynomial
 9740       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9741 
 9742       __ sub(blocks, blocks, 1);
 9743       __ cbnz(blocks, L_ghash_loop);
 9744     }
 9745 
 9746     // The bit-reversed result is at this point in v0
 9747     __ rev64(v0, __ T16B, v0);
 9748     __ rbit(v0, __ T16B, v0);
 9749 
 9750     __ st1(v0, __ T16B, state);
 9751     __ ret(lr);
 9752 
 9753     // bind label and generate local polynomial data
 9754     __ align(wordSize * 2);
 9755     __ bind(polynomial);
 9756     __ emit_int64(0x87);  // The low-order bits of the field
 9757                           // polynomial (i.e. p = z^7+z^2+z+1)
 9758                           // repeated in the low and high parts of a
 9759                           // 128-bit vector
 9760     __ emit_int64(0x87);
 9761 
 9762     return start;
 9763   }
 9764 
 9765   address generate_ghash_processBlocks_wide() {
 9766     address small = generate_ghash_processBlocks();
 9767 
 9768     StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
 9769     StubCodeMark mark(this, stub_id);
 9770     Label polynomial;           // local data generated after stub
 9771     __ align(CodeEntryAlignment);
 9772     address start = __ pc();
 9773 
 9774     Register state   = c_rarg0;
 9775     Register subkeyH = c_rarg1;
 9776     Register data    = c_rarg2;
 9777     Register blocks  = c_rarg3;
 9778 
 9779     const int unroll = 4;
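    // Block counts below unroll * 2 are handed off to the non-wide stub
    // generated above.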
 9780 
 9781     __ cmp(blocks, (unsigned char)(unroll * 2));
 9782     __ br(__ LT, small);
 9783 
 9784     if (unroll > 1) {
      // Save the SIMD registers (v8..v15) that the ABI requires us to preserve
 9786       __ sub(sp, sp, 4 * 16);
 9787       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9788       __ sub(sp, sp, 4 * 16);
 9789       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9790     }
 9791 
 9792     __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
 9793 
 9794     if (unroll > 1) {
 9795       // And restore state
 9796       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9797       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9798     }
 9799 
 9800     __ cmp(blocks, (unsigned char)0);
 9801     __ br(__ GT, small);
 9802 
 9803     __ ret(lr);
 9804 
 9805     // bind label and generate polynomial data
 9806     __ align(wordSize * 2);
 9807     __ bind(polynomial);
 9808     __ emit_int64(0x87);  // The low-order bits of the field
 9809                           // polynomial (i.e. p = z^7+z^2+z+1)
 9810                           // repeated in the low and high parts of a
 9811                           // 128-bit vector
 9812     __ emit_int64(0x87);
 9813 
 9814     return start;
 9815 
 9816   }
 9817 
 9818   void generate_base64_encode_simdround(Register src, Register dst,
 9819         FloatRegister codec, u8 size) {
 9820 
 9821     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9822     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9823     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
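    // One SIMD round of the encoder: ld3 de-interleaves 3 * size input bytes
    // into in0/in1/in2, the shift/or sequence below regroups every 3 bytes
    // (24 bits) into four 6-bit indices, tbl maps each index through the
    // 64-byte codec table held in the 4 vector registers starting at 'codec',
    // and st4 interleaves the 4 * size output bytes back into memory.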
 9824 
 9825     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9826 
 9827     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 9828 
 9829     __ ushr(ind0, arrangement, in0,  2);
 9830 
 9831     __ ushr(ind1, arrangement, in1,  2);
 9832     __ shl(in0,   arrangement, in0,  6);
 9833     __ orr(ind1,  arrangement, ind1, in0);
 9834     __ ushr(ind1, arrangement, ind1, 2);
 9835 
 9836     __ ushr(ind2, arrangement, in2,  4);
 9837     __ shl(in1,   arrangement, in1,  4);
 9838     __ orr(ind2,  arrangement, in1,  ind2);
 9839     __ ushr(ind2, arrangement, ind2, 2);
 9840 
 9841     __ shl(ind3,  arrangement, in2,  2);
 9842     __ ushr(ind3, arrangement, ind3, 2);
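    // For example, for the input bytes 'M','a','n' (0x4D, 0x61, 0x6E) the four
    // 6-bit indices are 19, 22, 5 and 46, which the codec table maps to "TWFu".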
 9843 
 9844     __ tbl(out0,  arrangement, codec,  4, ind0);
 9845     __ tbl(out1,  arrangement, codec,  4, ind1);
 9846     __ tbl(out2,  arrangement, codec,  4, ind2);
 9847     __ tbl(out3,  arrangement, codec,  4, ind3);
 9848 
 9849     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9850   }
 9851 
 9852    /**
 9853    *  Arguments:
 9854    *
 9855    *  Input:
 9856    *  c_rarg0   - src_start
 9857    *  c_rarg1   - src_offset
 9858    *  c_rarg2   - src_length
 9859    *  c_rarg3   - dest_start
 9860    *  c_rarg4   - dest_offset
 9861    *  c_rarg5   - isURL
 9862    *
 9863    */
 9864   address generate_base64_encodeBlock() {
 9865 
 9866     static const char toBase64[64] = {
 9867       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9868       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9869       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9870       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9871       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9872     };
 9873 
 9874     static const char toBase64URL[64] = {
 9875       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9876       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9877       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9878       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9879       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9880     };
 9881 
 9882     __ align(CodeEntryAlignment);
 9883     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
 9884     StubCodeMark mark(this, stub_id);
 9885     address start = __ pc();
 9886 
 9887     Register src   = c_rarg0;  // source array
 9888     Register soff  = c_rarg1;  // source start offset
 9889     Register send  = c_rarg2;  // source end offset
 9890     Register dst   = c_rarg3;  // dest array
 9891     Register doff  = c_rarg4;  // position for writing to dest array
 9892     Register isURL = c_rarg5;  // Base64 or URL character set
 9893 
 9894     // c_rarg6 and c_rarg7 are free to use as temps
 9895     Register codec  = c_rarg6;
 9896     Register length = c_rarg7;
 9897 
 9898     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9899 
 9900     __ add(src, src, soff);
 9901     __ add(dst, dst, doff);
 9902     __ sub(length, send, soff);
 9903 
 9904     // load the codec base address
 9905     __ lea(codec, ExternalAddress((address) toBase64));
 9906     __ cbz(isURL, ProcessData);
 9907     __ lea(codec, ExternalAddress((address) toBase64URL));
 9908 
 9909     __ BIND(ProcessData);
 9910 
    // too short to set up a SIMD loop; fall back to the byte-at-a-time path
 9912     __ cmp(length, (u1)24);
 9913     __ br(Assembler::LT, Process3B);
 9914 
 9915     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9916 
 9917     __ BIND(Process48B);
 9918     __ cmp(length, (u1)48);
 9919     __ br(Assembler::LT, Process24B);
 9920     generate_base64_encode_simdround(src, dst, v0, 16);
 9921     __ sub(length, length, 48);
 9922     __ b(Process48B);
 9923 
 9924     __ BIND(Process24B);
 9925     __ cmp(length, (u1)24);
 9926     __ br(Assembler::LT, SIMDExit);
 9927     generate_base64_encode_simdround(src, dst, v0, 8);
 9928     __ sub(length, length, 24);
 9929 
 9930     __ BIND(SIMDExit);
 9931     __ cbz(length, Exit);
 9932 
 9933     __ BIND(Process3B);
 9934     //  3 src bytes, 24 bits
 9935     __ ldrb(r10, __ post(src, 1));
 9936     __ ldrb(r11, __ post(src, 1));
 9937     __ ldrb(r12, __ post(src, 1));
 9938     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9939     __ orrw(r12, r12, r11, Assembler::LSL, 8);
 9940     // codec index
 9941     __ ubfmw(r15, r12, 18, 23);
 9942     __ ubfmw(r14, r12, 12, 17);
 9943     __ ubfmw(r13, r12, 6,  11);
 9944     __ andw(r12,  r12, 63);
 9945     // get the code based on the codec
 9946     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9947     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9948     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9949     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9950     __ strb(r15, __ post(dst, 1));
 9951     __ strb(r14, __ post(dst, 1));
 9952     __ strb(r13, __ post(dst, 1));
 9953     __ strb(r12, __ post(dst, 1));
 9954     __ sub(length, length, 3);
 9955     __ cbnz(length, Process3B);
 9956 
 9957     __ BIND(Exit);
 9958     __ ret(lr);
 9959 
 9960     return start;
 9961   }
 9962 
 9963   void generate_base64_decode_simdround(Register src, Register dst,
 9964         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9965 
 9966     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9967     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9968 
 9969     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9970     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9971 
 9972     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9973 
 9974     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9975 
 9976     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9977 
    // We need an unsigned saturating subtract so that all input values in the
    // range [0, 63] produce a 0 index for the higher-half lookup.
 9980     __ uqsubv(decH0, __ T16B, in0, v27);
 9981     __ uqsubv(decH1, __ T16B, in1, v27);
 9982     __ uqsubv(decH2, __ T16B, in2, v27);
 9983     __ uqsubv(decH3, __ T16B, in3, v27);
 9984 
 9985     // lower half lookup
 9986     __ tbl(decL0, arrangement, codecL, 4, in0);
 9987     __ tbl(decL1, arrangement, codecL, 4, in1);
 9988     __ tbl(decL2, arrangement, codecL, 4, in2);
 9989     __ tbl(decL3, arrangement, codecL, 4, in3);
 9990 
 9991     // higher half lookup
 9992     __ tbx(decH0, arrangement, codecH, 4, decH0);
 9993     __ tbx(decH1, arrangement, codecH, 4, decH1);
 9994     __ tbx(decH2, arrangement, codecH, 4, decH2);
 9995     __ tbx(decH3, arrangement, codecH, 4, decH3);
 9996 
 9997     // combine lower and higher
 9998     __ orr(decL0, arrangement, decL0, decH0);
 9999     __ orr(decL1, arrangement, decL1, decH1);
10000     __ orr(decL2, arrangement, decL2, decH2);
10001     __ orr(decL3, arrangement, decL3, decH3);
10002 
    // check for illegal inputs: any value larger than 63 (the maximum for 6 bits)
10004     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10005     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10006     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10007     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10008     __ orr(in0, arrangement, decH0, decH1);
10009     __ orr(in1, arrangement, decH2, decH3);
10010     __ orr(in2, arrangement, in0,   in1);
10011     __ umaxv(in3, arrangement, in2);
10012     __ umov(rscratch2, in3, __ B, 0);
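    // rscratch2 is now non-zero iff some lane decoded to a value greater
    // than 63, i.e. at least one input byte was not a valid Base64 character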
10013 
10014     // get the data to output
10015     __ shl(out0,  arrangement, decL0, 2);
10016     __ ushr(out1, arrangement, decL1, 4);
10017     __ orr(out0,  arrangement, out0,  out1);
10018     __ shl(out1,  arrangement, decL1, 4);
10019     __ ushr(out2, arrangement, decL2, 2);
10020     __ orr(out1,  arrangement, out1,  out2);
10021     __ shl(out2,  arrangement, decL2, 6);
10022     __ orr(out2,  arrangement, out2,  decL3);
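    // out0..out2 now hold the three decoded bytes for each group of four
    // input characters: out0 = a<<2 | b>>4, out1 = b<<4 | c>>2, out2 = c<<6 | d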
10023 
10024     __ cbz(rscratch2, NoIllegalData);
10025 
10026     // handle illegal input
10027     __ umov(r10, in2, __ D, 0);
10028     if (size == 16) {
10029       __ cbnz(r10, ErrorInLowerHalf);
10030 
10031       // illegal input is in higher half, store the lower half now.
10032       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10033 
10034       __ umov(r10, in2,  __ D, 1);
10035       __ umov(r11, out0, __ D, 1);
10036       __ umov(r12, out1, __ D, 1);
10037       __ umov(r13, out2, __ D, 1);
10038       __ b(StoreLegalData);
10039 
10040       __ BIND(ErrorInLowerHalf);
10041     }
10042     __ umov(r11, out0, __ D, 0);
10043     __ umov(r12, out1, __ D, 0);
10044     __ umov(r13, out2, __ D, 0);
10045 
10046     __ BIND(StoreLegalData);
10047     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10048     __ strb(r11, __ post(dst, 1));
10049     __ strb(r12, __ post(dst, 1));
10050     __ strb(r13, __ post(dst, 1));
10051     __ lsr(r10, r10, 8);
10052     __ lsr(r11, r11, 8);
10053     __ lsr(r12, r12, 8);
10054     __ lsr(r13, r13, 8);
10055     __ b(StoreLegalData);
10056 
10057     __ BIND(NoIllegalData);
10058     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10059   }
10060 
10061 
10062    /**
10063    *  Arguments:
10064    *
10065    *  Input:
10066    *  c_rarg0   - src_start
10067    *  c_rarg1   - src_offset
10068    *  c_rarg2   - src_length
10069    *  c_rarg3   - dest_start
10070    *  c_rarg4   - dest_offset
10071    *  c_rarg5   - isURL
10072    *  c_rarg6   - isMIME
10073    *
10074    */
10075   address generate_base64_decodeBlock() {
10076 
    // The SIMD part of this Base64 decode intrinsic is based on the algorithm
    // outlined at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords,
    // in the section titled "Base64 decoding".

    // The non-SIMD lookup tables are mostly copied from the fromBase64 array used
    // in java.util.Base64, except that the padding character '=' is also treated
    // as an illegal value in this intrinsic: java.util.Base64.fromBase64['='] is -2,
    // while fromBase(URL)64ForNoSIMD['='] is 255u here.
10084     static const uint8_t fromBase64ForNoSIMD[256] = {
10085       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10086       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10087       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10088        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10089       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10090        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
10091       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10092        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10093       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10094       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10095       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10096       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10097       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10098       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10099       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10100       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10101     };
10102 
10103     static const uint8_t fromBase64URLForNoSIMD[256] = {
10104       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10105       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10106       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10107        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10108       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10109        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
10110       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10111        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10112       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10113       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10114       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10115       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10116       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10117       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10118       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10119       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10120     };
10121 
    // A legal Base64 code value is in the range [0, 127]. We need two table
    // lookups with tbl/tbx and combine their results to get the decoded data.
    // The first lookup uses tbl: out-of-range indices produce 0 in the
    // destination. The second lookup uses tbx: out-of-range indices leave the
    // destination unchanged. Inputs in [64, 126] map to positions [65, 127] of
    // the table in the second lookup, and position 64 holds 0, so a zero there
    // tells us the first lookup already produced the decoded data.
10129     static const uint8_t fromBase64ForSIMD[128] = {
10130       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10131       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10132       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10133        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10134         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10135        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10136       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10137        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10138     };
10139 
10140     static const uint8_t fromBase64URLForSIMD[128] = {
10141       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10142       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10143       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10144        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10145         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10146        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10147        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10148        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10149     };
10150 
10151     __ align(CodeEntryAlignment);
10152     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10153     StubCodeMark mark(this, stub_id);
10154     address start = __ pc();
10155 
10156     Register src    = c_rarg0;  // source array
10157     Register soff   = c_rarg1;  // source start offset
10158     Register send   = c_rarg2;  // source end offset
10159     Register dst    = c_rarg3;  // dest array
10160     Register doff   = c_rarg4;  // position for writing to dest array
10161     Register isURL  = c_rarg5;  // Base64 or URL character set
10162     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10163 
10164     Register length = send;    // reuse send as length of source data to process
10165 
10166     Register simd_codec   = c_rarg6;
10167     Register nosimd_codec = c_rarg7;
10168 
10169     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10170 
10171     __ enter();
10172 
10173     __ add(src, src, soff);
10174     __ add(dst, dst, doff);
10175 
10176     __ mov(doff, dst);
10177 
10178     __ sub(length, send, soff);
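    // round the length down to a multiple of 4: the decoder consumes the
    // input four Base64 characters at a time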
10179     __ bfm(length, zr, 0, 1);
10180 
10181     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10182     __ cbz(isURL, ProcessData);
10183     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10184 
10185     __ BIND(ProcessData);
10186     __ mov(rscratch1, length);
10187     __ cmp(length, (u1)144); // 144 = 80 + 64
10188     __ br(Assembler::LT, Process4B);
10189 
10190     // In the MIME case, the line length cannot be more than 76
10191     // bytes (see RFC 2045). This is too short a block for SIMD
10192     // to be worthwhile, so we use non-SIMD here.
10193     __ movw(rscratch1, 79);
10194 
10195     __ BIND(Process4B);
10196     __ ldrw(r14, __ post(src, 4));
10197     __ ubfxw(r10, r14, 0,  8);
10198     __ ubfxw(r11, r14, 8,  8);
10199     __ ubfxw(r12, r14, 16, 8);
10200     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
10202     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10203     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10204     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10205     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10206     // error detection, 255u indicates an illegal input
10207     __ orrw(r14, r10, r11);
10208     __ orrw(r15, r12, r13);
10209     __ orrw(r14, r14, r15);
10210     __ tbnz(r14, 7, Exit);
10211     // recover the data
10212     __ lslw(r14, r10, 10);
10213     __ bfiw(r14, r11, 4, 6);
10214     __ bfmw(r14, r12, 2, 5);
10215     __ rev16w(r14, r14);
10216     __ bfiw(r13, r12, 6, 2);
10217     __ strh(r14, __ post(dst, 2));
10218     __ strb(r13, __ post(dst, 1));
10219     // non-simd loop
10220     __ subsw(rscratch1, rscratch1, 4);
10221     __ br(Assembler::GT, Process4B);
10222 
    // If we exited from the 80-byte MIME pre-processing pass above (where
    // rscratch1 started at 79), rscratch1 == -1; otherwise rscratch1 == 0.
10225     __ cbzw(rscratch1, Exit);
10226     __ sub(length, length, 80);
10227 
10228     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10229     __ cbz(isURL, SIMDEnter);
10230     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10231 
10232     __ BIND(SIMDEnter);
10233     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10234     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10235     __ mov(rscratch1, 63);
10236     __ dup(v27, __ T16B, rscratch1);
10237 
10238     __ BIND(Process64B);
10239     __ cmp(length, (u1)64);
10240     __ br(Assembler::LT, Process32B);
10241     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10242     __ sub(length, length, 64);
10243     __ b(Process64B);
10244 
10245     __ BIND(Process32B);
10246     __ cmp(length, (u1)32);
10247     __ br(Assembler::LT, SIMDExit);
10248     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10249     __ sub(length, length, 32);
10250     __ b(Process32B);
10251 
10252     __ BIND(SIMDExit);
10253     __ cbz(length, Exit);
10254     __ movw(rscratch1, length);
10255     __ b(Process4B);
10256 
10257     __ BIND(Exit);
10258     __ sub(c_rarg0, dst, doff);
10259 
10260     __ leave();
10261     __ ret(lr);
10262 
10263     return start;
10264   }
10265 
10266   // Support for spin waits.
10267   address generate_spin_wait() {
10268     __ align(CodeEntryAlignment);
10269     StubId stub_id = StubId::stubgen_spin_wait_id;
10270     StubCodeMark mark(this, stub_id);
10271     address start = __ pc();
10272 
10273     __ spin_wait();
10274     __ ret(lr);
10275 
10276     return start;
10277   }
10278 
10279   void generate_lookup_secondary_supers_table_stub() {
10280     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10281     StubCodeMark mark(this, stub_id);
10282 
10283     const Register
10284       r_super_klass  = r0,
10285       r_array_base   = r1,
10286       r_array_length = r2,
10287       r_array_index  = r3,
10288       r_sub_klass    = r4,
10289       r_bitmap       = rscratch2,
10290       result         = r5;
10291     const FloatRegister
10292       vtemp          = v0;
10293 
10294     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10295       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10296       Label L_success;
10297       __ enter();
10298       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10299                                              r_array_base, r_array_length, r_array_index,
10300                                              vtemp, result, slot,
10301                                              /*stub_is_near*/true);
10302       __ leave();
10303       __ ret(lr);
10304     }
10305   }
10306 
10307   // Slow path implementation for UseSecondarySupersTable.
10308   address generate_lookup_secondary_supers_table_slow_path_stub() {
10309     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10310     StubCodeMark mark(this, stub_id);
10311 
10312     address start = __ pc();
10313     const Register
10314       r_super_klass  = r0,        // argument
10315       r_array_base   = r1,        // argument
10316       temp1          = r2,        // temp
10317       r_array_index  = r3,        // argument
10318       r_bitmap       = rscratch2, // argument
10319       result         = r5;        // argument
10320 
10321     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10322     __ ret(lr);
10323 
10324     return start;
10325   }
10326 
10327 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10328 
10329   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10330   //
10331   // If LSE is in use, generate LSE versions of all the stubs. The
10332   // non-LSE versions are in atomic_aarch64.S.
10333 
10334   // class AtomicStubMark records the entry point of a stub and the
10335   // stub pointer which will point to it. The stub pointer is set to
10336   // the entry point when ~AtomicStubMark() is called, which must be
10337   // after ICache::invalidate_range. This ensures safe publication of
10338   // the generated code.
10339   class AtomicStubMark {
10340     address _entry_point;
10341     aarch64_atomic_stub_t *_stub;
10342     MacroAssembler *_masm;
10343   public:
10344     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10345       _masm = masm;
10346       __ align(32);
10347       _entry_point = __ pc();
10348       _stub = stub;
10349     }
10350     ~AtomicStubMark() {
10351       *_stub = (aarch64_atomic_stub_t)_entry_point;
10352     }
10353   };
10354 
10355   // NB: For memory_order_conservative we need a trailing membar after
10356   // LSE atomic operations but not a leading membar.
10357   //
10358   // We don't need a leading membar because a clause in the Arm ARM
10359   // says:
10360   //
10361   //   Barrier-ordered-before
10362   //
10363   //   Barrier instructions order prior Memory effects before subsequent
10364   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10368   //   instruction with both Acquire and Release semantics.
10369   //
10370   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10371   // and Release semantics, therefore we don't need a leading
10372   // barrier. However, there is no corresponding Barrier-ordered-after
10373   // relationship, therefore we need a trailing membar to prevent a
10374   // later store or load from being reordered with the store in an
10375   // atomic instruction.
10376   //
10377   // This was checked by using the herd7 consistency model simulator
10378   // (http://diy.inria.fr/) with this test case:
10379   //
10380   // AArch64 LseCas
10381   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10382   // P0 | P1;
10383   // LDR W4, [X2] | MOV W3, #0;
10384   // DMB LD       | MOV W4, #1;
10385   // LDR W3, [X1] | CASAL W3, W4, [X1];
10386   //              | DMB ISH;
10387   //              | STR W4, [X2];
10388   // exists
10389   // (0:X3=0 /\ 0:X4=1)
10390   //
10391   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10392   // with the store to x in P1. Without the DMB in P1 this may happen.
10393   //
10394   // At the time of writing we don't know of any AArch64 hardware that
10395   // reorders stores in this way, but the Reference Manual permits it.
10396 
10397   void gen_cas_entry(Assembler::operand_size size,
10398                      atomic_memory_order order) {
10399     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10400       exchange_val = c_rarg2;
10401     bool acquire, release;
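    // Map the requested memory order onto the CAS acquire/release flags:
    // relaxed sets neither, release sets release only, and anything stronger
    // (acquire, acq_rel, seq_cst, conservative) sets both.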
10402     switch (order) {
10403       case memory_order_relaxed:
10404         acquire = false;
10405         release = false;
10406         break;
10407       case memory_order_release:
10408         acquire = false;
10409         release = true;
10410         break;
10411       default:
10412         acquire = true;
10413         release = true;
10414         break;
10415     }
10416     __ mov(prev, compare_val);
10417     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10418     if (order == memory_order_conservative) {
10419       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10420     }
10421     if (size == Assembler::xword) {
10422       __ mov(r0, prev);
10423     } else {
10424       __ movw(r0, prev);
10425     }
10426     __ ret(lr);
10427   }
10428 
10429   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10430     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10431     // If not relaxed, then default to conservative.  Relaxed is the only
10432     // case we use enough to be worth specializing.
10433     if (order == memory_order_relaxed) {
10434       __ ldadd(size, incr, prev, addr);
10435     } else {
10436       __ ldaddal(size, incr, prev, addr);
10437       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10438     }
10439     if (size == Assembler::xword) {
10440       __ mov(r0, prev);
10441     } else {
10442       __ movw(r0, prev);
10443     }
10444     __ ret(lr);
10445   }
10446 
10447   void gen_swpal_entry(Assembler::operand_size size) {
10448     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10449     __ swpal(size, incr, prev, addr);
10450     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10451     if (size == Assembler::xword) {
10452       __ mov(r0, prev);
10453     } else {
10454       __ movw(r0, prev);
10455     }
10456     __ ret(lr);
10457   }
10458 
10459   void generate_atomic_entry_points() {
10460     if (! UseLSE) {
10461       return;
10462     }
10463     __ align(CodeEntryAlignment);
10464     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10465     StubCodeMark mark(this, stub_id);
10466     address first_entry = __ pc();
10467 
10468     // ADD, memory_order_conservative
10469     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10470     gen_ldadd_entry(Assembler::word, memory_order_conservative);
10471     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10472     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10473 
10474     // ADD, memory_order_relaxed
10475     AtomicStubMark mark_fetch_add_4_relaxed
10476       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10477     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10478     AtomicStubMark mark_fetch_add_8_relaxed
10479       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10480     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10481 
10482     // XCHG, memory_order_conservative
10483     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10484     gen_swpal_entry(Assembler::word);
10485     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10486     gen_swpal_entry(Assembler::xword);
10487 
10488     // CAS, memory_order_conservative
10489     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10490     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10491     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10492     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10493     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10494     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10495 
10496     // CAS, memory_order_relaxed
10497     AtomicStubMark mark_cmpxchg_1_relaxed
10498       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10499     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10500     AtomicStubMark mark_cmpxchg_4_relaxed
10501       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10502     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10503     AtomicStubMark mark_cmpxchg_8_relaxed
10504       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10505     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10506 
10507     AtomicStubMark mark_cmpxchg_4_release
10508       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10509     gen_cas_entry(MacroAssembler::word, memory_order_release);
10510     AtomicStubMark mark_cmpxchg_8_release
10511       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10512     gen_cas_entry(MacroAssembler::xword, memory_order_release);
10513 
10514     AtomicStubMark mark_cmpxchg_4_seq_cst
10515       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10516     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10517     AtomicStubMark mark_cmpxchg_8_seq_cst
10518       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10519     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10520 
10521     ICache::invalidate_range(first_entry, __ pc() - first_entry);
10522   }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
10524 
10525   address generate_cont_thaw(Continuation::thaw_kind kind) {
10526     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10527     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10528 
10529     address start = __ pc();
10530 
10531     if (return_barrier) {
10532       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10533       __ mov(sp, rscratch1);
10534     }
10535     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10536 
10537     if (return_barrier) {
10538       // preserve possible return value from a method returning to the return barrier
10539       __ fmovd(rscratch1, v0);
10540       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10541     }
10542 
10543     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10544     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10545     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10546 
10547     if (return_barrier) {
10548       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10549       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10550       __ fmovd(v0, rscratch1);
10551     }
10552     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10553 
10554 
10555     Label thaw_success;
10556     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10557     __ cbnz(rscratch2, thaw_success);
10558     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10559     __ br(rscratch1);
10560     __ bind(thaw_success);
10561 
10562     // make room for the thawed frames
10563     __ sub(rscratch1, sp, rscratch2);
10564     __ andr(rscratch1, rscratch1, -16); // align
10565     __ mov(sp, rscratch1);
10566 
10567     if (return_barrier) {
10568       // save original return value -- again
10569       __ fmovd(rscratch1, v0);
10570       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10571     }
10572 
10573     // If we want, we can templatize thaw by kind, and have three different entries
10574     __ movw(c_rarg1, (uint32_t)kind);
10575 
10576     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10577     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10578 
10579     if (return_barrier) {
10580       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10581       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10582       __ fmovd(v0, rscratch1);
10583     } else {
10584       __ mov(r0, zr); // return 0 (success) from doYield
10585     }
10586 
    // we're now on the yield frame (which is at a higher address than us because sp has been pushed down)
10588     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10589     __ mov(rfp, sp);
10590 
10591     if (return_barrier_exception) {
10592       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10593       __ authenticate_return_address(c_rarg1);
10594       __ verify_oop(r0);
10595       // save return value containing the exception oop in callee-saved R19
10596       __ mov(r19, r0);
10597 
10598       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10599 
10600       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10601       // __ reinitialize_ptrue();
10602 
10603       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10604 
10605       __ mov(r1, r0); // the exception handler
10606       __ mov(r0, r19); // restore return value containing the exception oop
10607       __ verify_oop(r0);
10608 
10609       __ leave();
10610       __ mov(r3, lr);
10611       __ br(r1); // the exception handler
10612     } else {
10613       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10614       __ leave();
10615       __ ret(lr);
10616     }
10617 
10618     return start;
10619   }
10620 
10621   address generate_cont_thaw() {
10622     if (!Continuations::enabled()) return nullptr;
10623 
10624     StubId stub_id = StubId::stubgen_cont_thaw_id;
10625     StubCodeMark mark(this, stub_id);
10626     address start = __ pc();
10627     generate_cont_thaw(Continuation::thaw_top);
10628     return start;
10629   }
10630 
10631   address generate_cont_returnBarrier() {
10632     if (!Continuations::enabled()) return nullptr;
10633 
10634     // TODO: will probably need multiple return barriers depending on return type
10635     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10636     StubCodeMark mark(this, stub_id);
10637     address start = __ pc();
10638 
10639     generate_cont_thaw(Continuation::thaw_return_barrier);
10640 
10641     return start;
10642   }
10643 
10644   address generate_cont_returnBarrier_exception() {
10645     if (!Continuations::enabled()) return nullptr;
10646 
10647     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10648     StubCodeMark mark(this, stub_id);
10649     address start = __ pc();
10650 
10651     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10652 
10653     return start;
10654   }
10655 
10656   address generate_cont_preempt_stub() {
10657     if (!Continuations::enabled()) return nullptr;
10658     StubId stub_id = StubId::stubgen_cont_preempt_id;
10659     StubCodeMark mark(this, stub_id);
10660     address start = __ pc();
10661 
10662     __ reset_last_Java_frame(true);
10663 
10664     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10665     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10666     __ mov(sp, rscratch2);
10667 
10668     Label preemption_cancelled;
10669     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10670     __ cbnz(rscratch1, preemption_cancelled);
10671 
10672     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10673     SharedRuntime::continuation_enter_cleanup(_masm);
10674     __ leave();
10675     __ ret(lr);
10676 
10677     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10678     __ bind(preemption_cancelled);
10679     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10680     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10681     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10682     __ ldr(rscratch1, Address(rscratch1));
10683     __ br(rscratch1);
10684 
10685     return start;
10686   }
10687 
10688   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10689   // are represented as long[5], with BITS_PER_LIMB = 26.
10690   // Pack five 26-bit limbs into three 64-bit registers.
10691   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10692     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10693     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10694     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10695     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10696 
10697     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10698     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10699     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10700     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10701 
10702     if (dest2->is_valid()) {
10703       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10704     } else {
10705 #ifdef ASSERT
10706       Label OK;
10707       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10708       __ br(__ EQ, OK);
10709       __ stop("high bits of Poly1305 integer should be zero");
10710       __ should_not_reach_here();
10711       __ bind(OK);
10712 #endif
10713     }
10714   }
10715 
10716   // As above, but return only a 128-bit integer, packed into two
10717   // 64-bit registers.
10718   void pack_26(Register dest0, Register dest1, Register src) {
10719     pack_26(dest0, dest1, noreg, src);
10720   }
10721 
10722   // Multiply and multiply-accumulate unsigned 64-bit registers.
10723   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10724     __ mul(prod_lo, n, m);
10725     __ umulh(prod_hi, n, m);
10726   }
10727   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10728     wide_mul(rscratch1, rscratch2, n, m);
10729     __ adds(sum_lo, sum_lo, rscratch1);
10730     __ adc(sum_hi, sum_hi, rscratch2);
10731   }
10732 
10733   // Poly1305, RFC 7539
10734 
10735   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10736   // description of the tricks used to simplify and accelerate this
10737   // computation.
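  //
  // In outline, the stub below computes the standard Poly1305 block
  // recurrence (a reference sketch only: u320, load_le128, pack26 and
  // store26 are hypothetical helpers, and the generated code keeps the
  // accumulator only partially reduced, i.e. congruent to acc mod p):
  //
  //   u320 p   = ((u320)1 << 130) - 5;        // the Poly1305 prime
  //   u320 acc = pack26(acc_start);
  //   u320 r   = pack26(r_start);             // clamped key
  //   while (length >= 16) {
  //     u320 block = load_le128(input) + ((u320)1 << 128);
  //     acc = (acc + block) * r % p;
  //     input += 16; length -= 16;
  //   }
  //   store26(acc_start, acc);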
10738 
10739   address generate_poly1305_processBlocks() {
10740     __ align(CodeEntryAlignment);
10741     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10742     StubCodeMark mark(this, stub_id);
10743     address start = __ pc();
10744     Label here;
10745     __ enter();
10746     RegSet callee_saved = RegSet::range(r19, r28);
10747     __ push(callee_saved, sp);
10748 
10749     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10750 
10751     // Arguments
10752     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10753 
10754     // R_n is the 128-bit randomly-generated key, packed into two
10755     // registers.  The caller passes this key to us as long[5], with
10756     // BITS_PER_LIMB = 26.
10757     const Register R_0 = *++regs, R_1 = *++regs;
10758     pack_26(R_0, R_1, r_start);
10759 
10760     // RR_n is (R_n >> 2) * 5
10761     const Register RR_0 = *++regs, RR_1 = *++regs;
10762     __ lsr(RR_0, R_0, 2);
10763     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10764     __ lsr(RR_1, R_1, 2);
10765     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10766 
10767     // U_n is the current checksum
10768     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10769     pack_26(U_0, U_1, U_2, acc_start);
10770 
10771     static constexpr int BLOCK_LENGTH = 16;
10772     Label DONE, LOOP;
10773 
10774     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10775     __ br(Assembler::LT, DONE); {
10776       __ bind(LOOP);
10777 
10778       // S_n is to be the sum of U_n and the next block of data
10779       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10780       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10781       __ adds(S_0, U_0, S_0);
10782       __ adcs(S_1, U_1, S_1);
10783       __ adc(S_2, U_2, zr);
10784       __ add(S_2, S_2, 1);
10785 
10786       const Register U_0HI = *++regs, U_1HI = *++regs;
10787 
10788       // NB: this logic depends on some of the special properties of
10789       // Poly1305 keys. In particular, because we know that the top
10790       // four bits of R_0 and R_1 are zero, we can add together
10791       // partial products without any risk of needing to propagate a
10792       // carry out.
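      // (Informally: clamping leaves R_0, R_1 < 2^60 and hence RR_0, RR_1 < 2^61,
      // so each 128-bit partial product is below 2^125 and the sum of three of
      // them stays below 2^126, well within 128 bits.)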
10793       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10794       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10795       __ andr(U_2, R_0, 3);
10796       __ mul(U_2, S_2, U_2);
10797 
10798       // Recycle registers S_0, S_1, S_2
10799       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10800 
10801       // Partial reduction mod 2**130 - 5
10802       __ adds(U_1, U_0HI, U_1);
10803       __ adc(U_2, U_1HI, U_2);
10804       // Sum now in U_2:U_1:U_0.
10805       // Dead: U_0HI, U_1HI.
10806       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10807 
10808       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10809 
10810       // First, U_2:U_1:U_0 += (U_2 >> 2)
10811       __ lsr(rscratch1, U_2, 2);
10812       __ andr(U_2, U_2, (u8)3);
10813       __ adds(U_0, U_0, rscratch1);
10814       __ adcs(U_1, U_1, zr);
10815       __ adc(U_2, U_2, zr);
10816       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10817       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10818       __ adcs(U_1, U_1, zr);
10819       __ adc(U_2, U_2, zr);
10820 
10821       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10822       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10823       __ br(~ Assembler::LT, LOOP);
10824     }
10825 
10826     // Further reduce modulo 2^130 - 5
10827     __ lsr(rscratch1, U_2, 2);
10828     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10829     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10830     __ adcs(U_1, U_1, zr);
10831     __ andr(U_2, U_2, (u1)3);
10832     __ adc(U_2, U_2, zr);
10833 
10834     // Unpack the sum into five 26-bit limbs and write to memory.
10835     __ ubfiz(rscratch1, U_0, 0, 26);
10836     __ ubfx(rscratch2, U_0, 26, 26);
10837     __ stp(rscratch1, rscratch2, Address(acc_start));
10838     __ ubfx(rscratch1, U_0, 52, 12);
10839     __ bfi(rscratch1, U_1, 12, 14);
10840     __ ubfx(rscratch2, U_1, 14, 26);
10841     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10842     __ ubfx(rscratch1, U_1, 40, 24);
10843     __ bfi(rscratch1, U_2, 24, 3);
10844     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10845 
10846     __ bind(DONE);
10847     __ pop(callee_saved, sp);
10848     __ leave();
10849     __ ret(lr);
10850 
10851     return start;
10852   }
10853 
10854   // exception handler for upcall stubs
10855   address generate_upcall_stub_exception_handler() {
10856     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10857     StubCodeMark mark(this, stub_id);
10858     address start = __ pc();
10859 
10860     // Native caller has no idea how to handle exceptions,
    // so we just crash here. It is up to the callee to catch exceptions.
10862     __ verify_oop(r0);
10863     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10864     __ blr(rscratch1);
10865     __ should_not_reach_here();
10866 
10867     return start;
10868   }
10869 
10870   // load Method* target of MethodHandle
10871   // j_rarg0 = jobject receiver
10872   // rmethod = result
10873   address generate_upcall_stub_load_target() {
10874     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10875     StubCodeMark mark(this, stub_id);
10876     address start = __ pc();
10877 
10878     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
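    // In Java terms, the chain of loads below is approximately:
    //   rmethod = ((MethodHandle)receiver).form.vmentry.method.vmtarget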
    // Load target method from receiver
10880     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10881     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10882     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10883     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10884                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10885                       noreg, noreg);
10886     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10887 
10888     __ ret(lr);
10889 
10890     return start;
10891   }
10892 
10893 #undef __
10894 #define __ masm->
10895 
10896   class MontgomeryMultiplyGenerator : public MacroAssembler {
10897 
10898     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10899       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10900 
10901     RegSet _toSave;
10902     bool _squaring;
10903 
10904   public:
10905     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10906       : MacroAssembler(as->code()), _squaring(squaring) {
10907 
10908       // Register allocation
10909 
10910       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10911       Pa_base = *regs;       // Argument registers
10912       if (squaring)
10913         Pb_base = Pa_base;
10914       else
10915         Pb_base = *++regs;
10916       Pn_base = *++regs;
      Rlen = *++regs;
10918       inv = *++regs;
10919       Pm_base = *++regs;
10920 
10921                           // Working registers:
10922       Ra =  *++regs;        // The current digit of a, b, n, and m.
10923       Rb =  *++regs;
10924       Rm =  *++regs;
10925       Rn =  *++regs;
10926 
10927       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10928       Pb =  *++regs;
10929       Pm =  *++regs;
10930       Pn =  *++regs;
10931 
10932       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
10934       t2 =  *++regs;
10935 
10936       Ri =  *++regs;        // Inner and outer loop indexes.
10937       Rj =  *++regs;
10938 
10939       Rhi_ab = *++regs;     // Product registers: low and high parts
10940       Rlo_ab = *++regs;     // of a*b and m*n.
10941       Rhi_mn = *++regs;
10942       Rlo_mn = *++regs;
10943 
10944       // r19 and up are callee-saved.
10945       _toSave = RegSet::range(r19, *regs) + Pm_base;
10946     }
10947 
10948   private:
10949     void save_regs() {
10950       push(_toSave, sp);
10951     }
10952 
10953     void restore_regs() {
10954       pop(_toSave, sp);
10955     }
10956 
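    // Emit the code generated by `block` so that it executes `count` times:
    // the body is unrolled twice, an odd count enters at the second copy,
    // and a zero count skips the loop entirely.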
10957     template <typename T>
10958     void unroll_2(Register count, T block) {
10959       Label loop, end, odd;
10960       tbnz(count, 0, odd);
10961       cbz(count, end);
10962       align(16);
10963       bind(loop);
10964       (this->*block)();
10965       bind(odd);
10966       (this->*block)();
10967       subs(count, count, 2);
10968       br(Assembler::GT, loop);
10969       bind(end);
10970     }
10971 
10972     template <typename T>
10973     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10974       Label loop, end, odd;
10975       tbnz(count, 0, odd);
10976       cbz(count, end);
10977       align(16);
10978       bind(loop);
10979       (this->*block)(d, s, tmp);
10980       bind(odd);
10981       (this->*block)(d, s, tmp);
10982       subs(count, count, 2);
10983       br(Assembler::GT, loop);
10984       bind(end);
10985     }
10986 
10987     void pre1(RegisterOrConstant i) {
10988       block_comment("pre1");
10989       // Pa = Pa_base;
10990       // Pb = Pb_base + i;
10991       // Pm = Pm_base;
10992       // Pn = Pn_base + i;
10993       // Ra = *Pa;
10994       // Rb = *Pb;
10995       // Rm = *Pm;
10996       // Rn = *Pn;
10997       ldr(Ra, Address(Pa_base));
10998       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10999       ldr(Rm, Address(Pm_base));
11000       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11001       lea(Pa, Address(Pa_base));
11002       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11003       lea(Pm, Address(Pm_base));
11004       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11005 
11006       // Zero the m*n result.
11007       mov(Rhi_mn, zr);
11008       mov(Rlo_mn, zr);
11009     }
11010 
11011     // The core multiply-accumulate step of a Montgomery
11012     // multiplication.  The idea is to schedule operations as a
11013     // pipeline so that instructions with long latencies (loads and
11014     // multiplies) have time to complete before their results are
11015     // used.  This most benefits in-order implementations of the
11016     // architecture but out-of-order ones also benefit.
11017     void step() {
11018       block_comment("step");
11019       // MACC(Ra, Rb, t0, t1, t2);
11020       // Ra = *++Pa;
11021       // Rb = *--Pb;
11022       umulh(Rhi_ab, Ra, Rb);
11023       mul(Rlo_ab, Ra, Rb);
11024       ldr(Ra, pre(Pa, wordSize));
11025       ldr(Rb, pre(Pb, -wordSize));
11026       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11027                                        // previous iteration.
11028       // MACC(Rm, Rn, t0, t1, t2);
11029       // Rm = *++Pm;
11030       // Rn = *--Pn;
11031       umulh(Rhi_mn, Rm, Rn);
11032       mul(Rlo_mn, Rm, Rn);
11033       ldr(Rm, pre(Pm, wordSize));
11034       ldr(Rn, pre(Pn, -wordSize));
11035       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11036     }
11037 
11038     void post1() {
11039       block_comment("post1");
11040 
11041       // MACC(Ra, Rb, t0, t1, t2);
11042       // Ra = *++Pa;
11043       // Rb = *--Pb;
11044       umulh(Rhi_ab, Ra, Rb);
11045       mul(Rlo_ab, Ra, Rb);
11046       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11047       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11048 
11049       // *Pm = Rm = t0 * inv;
11050       mul(Rm, t0, inv);
11051       str(Rm, Address(Pm));
11052 
11053       // MACC(Rm, Rn, t0, t1, t2);
11054       // t0 = t1; t1 = t2; t2 = 0;
11055       umulh(Rhi_mn, Rm, Rn);
11056 
11057 #ifndef PRODUCT
11058       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11059       {
11060         mul(Rlo_mn, Rm, Rn);
11061         add(Rlo_mn, t0, Rlo_mn);
11062         Label ok;
11063         cbz(Rlo_mn, ok); {
11064           stop("broken Montgomery multiply");
11065         } bind(ok);
11066       }
11067 #endif
11068       // We have very carefully set things up so that
11069       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11070       // the lower half of Rm * Rn because we know the result already:
11071       // it must be -t0.  t0 + (-t0) must generate a carry iff
11072       // t0 != 0.  So, rather than do a mul and an adds we just set
11073       // the carry flag iff t0 is nonzero.
11074       //
11075       // mul(Rlo_mn, Rm, Rn);
11076       // adds(zr, t0, Rlo_mn);
11077       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11078       adcs(t0, t1, Rhi_mn);
11079       adc(t1, t2, zr);
11080       mov(t2, zr);
11081     }
11082 
11083     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11084       block_comment("pre2");
11085       // Pa = Pa_base + i-len;
11086       // Pb = Pb_base + len;
11087       // Pm = Pm_base + i-len;
11088       // Pn = Pn_base + len;
11089 
11090       if (i.is_register()) {
11091         sub(Rj, i.as_register(), len);
11092       } else {
11093         mov(Rj, i.as_constant());
11094         sub(Rj, Rj, len);
11095       }
11096       // Rj == i-len
11097 
11098       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11099       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11100       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11101       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11102 
11103       // Ra = *++Pa;
11104       // Rb = *--Pb;
11105       // Rm = *++Pm;
11106       // Rn = *--Pn;
11107       ldr(Ra, pre(Pa, wordSize));
11108       ldr(Rb, pre(Pb, -wordSize));
11109       ldr(Rm, pre(Pm, wordSize));
11110       ldr(Rn, pre(Pn, -wordSize));
11111 
11112       mov(Rhi_mn, zr);
11113       mov(Rlo_mn, zr);
11114     }
11115 
11116     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11117       block_comment("post2");
11118       if (i.is_constant()) {
11119         mov(Rj, i.as_constant()-len.as_constant());
11120       } else {
11121         sub(Rj, i.as_register(), len);
11122       }
11123 
11124       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11125 
11126       // As soon as we know the least significant digit of our result,
11127       // store it.
11128       // Pm_base[i-len] = t0;
11129       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11130 
11131       // t0 = t1; t1 = t2; t2 = 0;
11132       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11133       adc(t1, t2, zr);
11134       mov(t2, zr);
11135     }
11136 
11137     // A carry in t0 after Montgomery multiplication means that we
11138     // should subtract multiples of n from our result in m.  We'll
11139     // keep doing that until there is no carry.
11140     void normalize(RegisterOrConstant len) {
11141       block_comment("normalize");
11142       // while (t0)
11143       //   t0 = sub(Pm_base, Pn_base, t0, len);
11144       Label loop, post, again;
11145       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11146       cbz(t0, post); {
11147         bind(again); {
11148           mov(i, zr);
11149           mov(cnt, len);
11150           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11151           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11152           subs(zr, zr, zr); // set carry flag, i.e. no borrow
11153           align(16);
11154           bind(loop); {
11155             sbcs(Rm, Rm, Rn);
11156             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11157             add(i, i, 1);
11158             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11159             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11160             sub(cnt, cnt, 1);
11161           } cbnz(cnt, loop);
11162           sbc(t0, t0, zr);
11163         } cbnz(t0, again);
11164       } bind(post);
11165     }
11166 
11167     // Move memory at s to d, reversing words.
11168     //    Increments d to end of copied memory
11169     //    Destroys tmp1, tmp2
11170     //    Preserves len
11171     //    Leaves s pointing to the address which was in d at start
11172     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11173       assert(tmp1->encoding() < r19->encoding(), "register corruption");
11174       assert(tmp2->encoding() < r19->encoding(), "register corruption");
11175 
11176       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11177       mov(tmp1, len);
11178       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11179       sub(s, d, len, ext::uxtw, LogBytesPerWord);
11180     }
11181     // where
11182     void reverse1(Register d, Register s, Register tmp) {
11183       ldr(tmp, pre(s, -wordSize));
11184       ror(tmp, tmp, 32);
11185       str(tmp, post(d, wordSize));
11186     }
11187 
11188     void step_squaring() {
11189       // An extra ACC
11190       step();
11191       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11192     }
11193 
11194     void last_squaring(RegisterOrConstant i) {
11195       Label dont;
11196       // if ((i & 1) == 0) {
11197       tbnz(i.as_register(), 0, dont); {
11198         // MACC(Ra, Rb, t0, t1, t2);
11199         // Ra = *++Pa;
11200         // Rb = *--Pb;
11201         umulh(Rhi_ab, Ra, Rb);
11202         mul(Rlo_ab, Ra, Rb);
11203         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11204       } bind(dont);
11205     }
11206 
11207     void extra_step_squaring() {
11208       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11209 
11210       // MACC(Rm, Rn, t0, t1, t2);
11211       // Rm = *++Pm;
11212       // Rn = *--Pn;
11213       umulh(Rhi_mn, Rm, Rn);
11214       mul(Rlo_mn, Rm, Rn);
11215       ldr(Rm, pre(Pm, wordSize));
11216       ldr(Rn, pre(Pn, -wordSize));
11217     }
11218 
11219     void post1_squaring() {
11220       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11221 
11222       // *Pm = Rm = t0 * inv;
11223       mul(Rm, t0, inv);
11224       str(Rm, Address(Pm));
11225 
11226       // MACC(Rm, Rn, t0, t1, t2);
11227       // t0 = t1; t1 = t2; t2 = 0;
11228       umulh(Rhi_mn, Rm, Rn);
11229 
11230 #ifndef PRODUCT
11231       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11232       {
11233         mul(Rlo_mn, Rm, Rn);
11234         add(Rlo_mn, t0, Rlo_mn);
11235         Label ok;
11236         cbz(Rlo_mn, ok); {
11237           stop("broken Montgomery multiply");
11238         } bind(ok);
11239       }
11240 #endif
11241       // We have very carefully set things up so that
11242       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11243       // the lower half of Rm * Rn because we know the result already:
11244       // it must be -t0.  t0 + (-t0) must generate a carry iff
11245       // t0 != 0.  So, rather than do a mul and an adds we just set
11246       // the carry flag iff t0 is nonzero.
11247       //
11248       // mul(Rlo_mn, Rm, Rn);
11249       // adds(zr, t0, Rlo_mn);
11250       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11251       adcs(t0, t1, Rhi_mn);
11252       adc(t1, t2, zr);
11253       mov(t2, zr);
11254     }
11255 
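    // t2:t1:t0 += Rhi:Rlo -- fold a 128-bit product into the
    // triple-precision accumulator.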
11256     void acc(Register Rhi, Register Rlo,
11257              Register t0, Register t1, Register t2) {
11258       adds(t0, t0, Rlo);
11259       adcs(t1, t1, Rhi);
11260       adc(t2, t2, zr);
11261     }
11262 
11263   public:
11264     /**
11265      * Fast Montgomery multiplication.  The derivation of the
11266      * algorithm is in A Cryptographic Library for the Motorola
11267      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11268      *
11269      * Arguments:
11270      *
11271      * Inputs for multiplication:
11272      *   c_rarg0   - int array elements a
11273      *   c_rarg1   - int array elements b
11274      *   c_rarg2   - int array elements n (the modulus)
11275      *   c_rarg3   - int length
11276      *   c_rarg4   - int inv
11277      *   c_rarg5   - int array elements m (the result)
11278      *
11279      * Inputs for squaring:
11280      *   c_rarg0   - int array elements a
11281      *   c_rarg1   - int array elements n (the modulus)
11282      *   c_rarg2   - int length
11283      *   c_rarg3   - int inv
11284      *   c_rarg4   - int array elements m (the result)
11285      *
11286      */
11287     address generate_multiply() {
11288       Label argh, nothing;
11289       bind(argh);
11290       stop("MontgomeryMultiply total_allocation must be <= 8192");
11291 
11292       align(CodeEntryAlignment);
11293       address entry = pc();
11294 
11295       cbzw(Rlen, nothing);
11296 
11297       enter();
11298 
11299       // Make room.
11300       cmpw(Rlen, 512);
11301       br(Assembler::HI, argh);
11302       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11303       andr(sp, Ra, -2 * wordSize);
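      // In effect: sp = align_down(sp - Rlen * 4 * sizeof(jint), 2 * wordSize),
      // i.e. room for reversed copies of a, b and n plus the result m.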
11304 
11305       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11306 
11307       {
11308         // Copy input args, reversing as we go.  We use Ra as a
11309         // temporary variable.
11310         reverse(Ra, Pa_base, Rlen, t0, t1);
11311         if (!_squaring)
11312           reverse(Ra, Pb_base, Rlen, t0, t1);
11313         reverse(Ra, Pn_base, Rlen, t0, t1);
11314       }
11315 
11316       // Push all call-saved registers and also Pm_base which we'll need
11317       // at the end.
11318       save_regs();
11319 
11320 #ifndef PRODUCT
11321       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11322       {
11323         ldr(Rn, Address(Pn_base, 0));
11324         mul(Rlo_mn, Rn, inv);
11325         subs(zr, Rlo_mn, -1);
11326         Label ok;
11327         br(EQ, ok); {
11328           stop("broken inverse in Montgomery multiply");
11329         } bind(ok);
11330       }
11331 #endif
11332 
11333       mov(Pm_base, Ra);
11334 
11335       mov(t0, zr);
11336       mov(t1, zr);
11337       mov(t2, zr);
11338 
11339       block_comment("for (int i = 0; i < len; i++) {");
11340       mov(Ri, zr); {
11341         Label loop, end;
11342         cmpw(Ri, Rlen);
11343         br(Assembler::GE, end);
11344 
11345         bind(loop);
11346         pre1(Ri);
11347 
11348         block_comment("  for (j = i; j; j--) {"); {
11349           movw(Rj, Ri);
11350           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11351         } block_comment("  } // j");
11352 
11353         post1();
11354         addw(Ri, Ri, 1);
11355         cmpw(Ri, Rlen);
11356         br(Assembler::LT, loop);
11357         bind(end);
11358         block_comment("} // i");
11359       }
11360 
11361       block_comment("for (int i = len; i < 2*len; i++) {");
11362       mov(Ri, Rlen); {
11363         Label loop, end;
11364         cmpw(Ri, Rlen, Assembler::LSL, 1);
11365         br(Assembler::GE, end);
11366 
11367         bind(loop);
11368         pre2(Ri, Rlen);
11369 
11370         block_comment("  for (j = len*2-i-1; j; j--) {"); {
11371           lslw(Rj, Rlen, 1);
11372           subw(Rj, Rj, Ri);
11373           subw(Rj, Rj, 1);
11374           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11375         } block_comment("  } // j");
11376 
11377         post2(Ri, Rlen);
11378         addw(Ri, Ri, 1);
11379         cmpw(Ri, Rlen, Assembler::LSL, 1);
11380         br(Assembler::LT, loop);
11381         bind(end);
11382       }
11383       block_comment("} // i");
11384 
11385       normalize(Rlen);
11386 
11387       mov(Ra, Pm_base);  // Save Pm_base in Ra
11388       restore_regs();  // Restore caller's Pm_base
11389 
11390       // Copy our result into caller's Pm_base
11391       reverse(Pm_base, Ra, Rlen, t0, t1);
11392 
11393       leave();
11394       bind(nothing);
11395       ret(lr);
11396 
11397       return entry;
11398     }
11399     // In C, approximately:
11400 
11401     // void
11402     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11403     //                     julong Pn_base[], julong Pm_base[],
11404     //                     julong inv, int len) {
11405     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11406     //   julong *Pa, *Pb, *Pn, *Pm;
11407     //   julong Ra, Rb, Rn, Rm;
11408 
11409     //   int i;
11410 
11411     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11412 
11413     //   for (i = 0; i < len; i++) {
11414     //     int j;
11415 
11416     //     Pa = Pa_base;
11417     //     Pb = Pb_base + i;
11418     //     Pm = Pm_base;
11419     //     Pn = Pn_base + i;
11420 
11421     //     Ra = *Pa;
11422     //     Rb = *Pb;
11423     //     Rm = *Pm;
11424     //     Rn = *Pn;
11425 
11426     //     int iters = i;
11427     //     for (j = 0; iters--; j++) {
11428     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11429     //       MACC(Ra, Rb, t0, t1, t2);
11430     //       Ra = *++Pa;
11431     //       Rb = *--Pb;
11432     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11433     //       MACC(Rm, Rn, t0, t1, t2);
11434     //       Rm = *++Pm;
11435     //       Rn = *--Pn;
11436     //     }
11437 
11438     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11439     //     MACC(Ra, Rb, t0, t1, t2);
11440     //     *Pm = Rm = t0 * inv;
11441     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11442     //     MACC(Rm, Rn, t0, t1, t2);
11443 
11444     //     assert(t0 == 0, "broken Montgomery multiply");
11445 
11446     //     t0 = t1; t1 = t2; t2 = 0;
11447     //   }
11448 
11449     //   for (i = len; i < 2*len; i++) {
11450     //     int j;
11451 
11452     //     Pa = Pa_base + i-len;
11453     //     Pb = Pb_base + len;
11454     //     Pm = Pm_base + i-len;
11455     //     Pn = Pn_base + len;
11456 
11457     //     Ra = *++Pa;
11458     //     Rb = *--Pb;
11459     //     Rm = *++Pm;
11460     //     Rn = *--Pn;
11461 
11462     //     int iters = len*2-i-1;
11463     //     for (j = i-len+1; iters--; j++) {
11464     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11465     //       MACC(Ra, Rb, t0, t1, t2);
11466     //       Ra = *++Pa;
11467     //       Rb = *--Pb;
11468     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11469     //       MACC(Rm, Rn, t0, t1, t2);
11470     //       Rm = *++Pm;
11471     //       Rn = *--Pn;
11472     //     }
11473 
11474     //     Pm_base[i-len] = t0;
11475     //     t0 = t1; t1 = t2; t2 = 0;
11476     //   }
11477 
11478     //   while (t0)
11479     //     t0 = sub(Pm_base, Pn_base, t0, len);
11480     // }
11481 
11482     /**
11483      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
11484      * multiplies than Montgomery multiplication so it should be up to
11485      * 25% faster.  However, its loop control is more complex and it
11486      * may actually run slower on some machines.
11487      *
11488      * Arguments:
11489      *
11490      * Inputs:
11491      *   c_rarg0   - int array elements a
11492      *   c_rarg1   - int array elements n (the modulus)
11493      *   c_rarg2   - int length
11494      *   c_rarg3   - int inv
11495      *   c_rarg4   - int array elements m (the result)
11496      *
11497      */
11498     address generate_square() {
11499       Label argh;
11500       bind(argh);
11501       stop("MontgomeryMultiply total_allocation must be <= 8192");
11502 
11503       align(CodeEntryAlignment);
11504       address entry = pc();
11505 
11506       enter();
11507 
11508       // Make room.
11509       cmpw(Rlen, 512);
11510       br(Assembler::HI, argh);
11511       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11512       andr(sp, Ra, -2 * wordSize);
11513 
11514       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11515 
11516       {
11517         // Copy input args, reversing as we go.  We use Ra as a
11518         // temporary variable.
11519         reverse(Ra, Pa_base, Rlen, t0, t1);
11520         reverse(Ra, Pn_base, Rlen, t0, t1);
11521       }
11522 
11523       // Push all call-saved registers and also Pm_base which we'll need
11524       // at the end.
11525       save_regs();
11526 
11527       mov(Pm_base, Ra);
11528 
11529       mov(t0, zr);
11530       mov(t1, zr);
11531       mov(t2, zr);
11532 
11533       block_comment("for (int i = 0; i < len; i++) {");
11534       mov(Ri, zr); {
11535         Label loop, end;
11536         bind(loop);
11537         cmp(Ri, Rlen);
11538         br(Assembler::GE, end);
11539 
11540         pre1(Ri);
11541 
11542         block_comment("for (j = (i+1)/2; j; j--) {"); {
11543           add(Rj, Ri, 1);
11544           lsr(Rj, Rj, 1);
11545           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11546         } block_comment("  } // j");
11547 
11548         last_squaring(Ri);
11549 
11550         block_comment("  for (j = i/2; j; j--) {"); {
11551           lsr(Rj, Ri, 1);
11552           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11553         } block_comment("  } // j");
11554 
11555         post1_squaring();
11556         add(Ri, Ri, 1);
11557         cmp(Ri, Rlen);
11558         br(Assembler::LT, loop);
11559 
11560         bind(end);
11561         block_comment("} // i");
11562       }
11563 
11564       block_comment("for (int i = len; i < 2*len; i++) {");
11565       mov(Ri, Rlen); {
11566         Label loop, end;
11567         bind(loop);
11568         cmp(Ri, Rlen, Assembler::LSL, 1);
11569         br(Assembler::GE, end);
11570 
11571         pre2(Ri, Rlen);
11572 
11573         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11574           lsl(Rj, Rlen, 1);
11575           sub(Rj, Rj, Ri);
11576           sub(Rj, Rj, 1);
11577           lsr(Rj, Rj, 1);
11578           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11579         } block_comment("  } // j");
11580 
11581         last_squaring(Ri);
11582 
11583         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11584           lsl(Rj, Rlen, 1);
11585           sub(Rj, Rj, Ri);
11586           lsr(Rj, Rj, 1);
11587           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11588         } block_comment("  } // j");
11589 
11590         post2(Ri, Rlen);
11591         add(Ri, Ri, 1);
11592         cmp(Ri, Rlen, Assembler::LSL, 1);
11593 
11594         br(Assembler::LT, loop);
11595         bind(end);
11596         block_comment("} // i");
11597       }
11598 
11599       normalize(Rlen);
11600 
11601       mov(Ra, Pm_base);  // Save Pm_base in Ra
11602       restore_regs();  // Restore caller's Pm_base
11603 
11604       // Copy our result into caller's Pm_base
11605       reverse(Pm_base, Ra, Rlen, t0, t1);
11606 
11607       leave();
11608       ret(lr);
11609 
11610       return entry;
11611     }
11612     // In C, approximately:
11613 
11614     // void
11615     // montgomery_square(julong Pa_base[], julong Pn_base[],
11616     //                   julong Pm_base[], julong inv, int len) {
11617     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11618     //   julong *Pa, *Pb, *Pn, *Pm;
11619     //   julong Ra, Rb, Rn, Rm;
11620 
11621     //   int i;
11622 
11623     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11624 
11625     //   for (i = 0; i < len; i++) {
11626     //     int j;
11627 
11628     //     Pa = Pa_base;
11629     //     Pb = Pa_base + i;
11630     //     Pm = Pm_base;
11631     //     Pn = Pn_base + i;
11632 
11633     //     Ra = *Pa;
11634     //     Rb = *Pb;
11635     //     Rm = *Pm;
11636     //     Rn = *Pn;
11637 
11638     //     int iters = (i+1)/2;
11639     //     for (j = 0; iters--; j++) {
11640     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11641     //       MACC2(Ra, Rb, t0, t1, t2);
11642     //       Ra = *++Pa;
11643     //       Rb = *--Pb;
11644     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11645     //       MACC(Rm, Rn, t0, t1, t2);
11646     //       Rm = *++Pm;
11647     //       Rn = *--Pn;
11648     //     }
11649     //     if ((i & 1) == 0) {
11650     //       assert(Ra == Pa_base[j], "must be");
11651     //       MACC(Ra, Ra, t0, t1, t2);
11652     //     }
11653     //     iters = i/2;
11654     //     assert(iters == i-j, "must be");
11655     //     for (; iters--; j++) {
11656     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11657     //       MACC(Rm, Rn, t0, t1, t2);
11658     //       Rm = *++Pm;
11659     //       Rn = *--Pn;
11660     //     }
11661 
11662     //     *Pm = Rm = t0 * inv;
11663     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11664     //     MACC(Rm, Rn, t0, t1, t2);
11665 
11666     //     assert(t0 == 0, "broken Montgomery multiply");
11667 
11668     //     t0 = t1; t1 = t2; t2 = 0;
11669     //   }
11670 
11671     //   for (i = len; i < 2*len; i++) {
11672     //     int start = i-len+1;
11673     //     int end = start + (len - start)/2;
11674     //     int j;
11675 
11676     //     Pa = Pa_base + i-len;
11677     //     Pb = Pa_base + len;
11678     //     Pm = Pm_base + i-len;
11679     //     Pn = Pn_base + len;
11680 
11681     //     Ra = *++Pa;
11682     //     Rb = *--Pb;
11683     //     Rm = *++Pm;
11684     //     Rn = *--Pn;
11685 
11686     //     int iters = (2*len-i-1)/2;
11687     //     assert(iters == end-start, "must be");
11688     //     for (j = start; iters--; j++) {
11689     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11690     //       MACC2(Ra, Rb, t0, t1, t2);
11691     //       Ra = *++Pa;
11692     //       Rb = *--Pb;
11693     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11694     //       MACC(Rm, Rn, t0, t1, t2);
11695     //       Rm = *++Pm;
11696     //       Rn = *--Pn;
11697     //     }
11698     //     if ((i & 1) == 0) {
11699     //       assert(Ra == Pa_base[j], "must be");
11700     //       MACC(Ra, Ra, t0, t1, t2);
11701     //     }
11702     //     iters =  (2*len-i)/2;
11703     //     assert(iters == len-j, "must be");
11704     //     for (; iters--; j++) {
11705     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11706     //       MACC(Rm, Rn, t0, t1, t2);
11707     //       Rm = *++Pm;
11708     //       Rn = *--Pn;
11709     //     }
11710     //     Pm_base[i-len] = t0;
11711     //     t0 = t1; t1 = t2; t2 = 0;
11712     //   }
11713 
11714     //   while (t0)
11715     //     t0 = sub(Pm_base, Pn_base, t0, len);
11716     // }
11717   };
11718 
11719   // Initialization
11720   void generate_preuniverse_stubs() {
11721     // preuniverse stubs are not needed for aarch64
11722   }
11723 
11724   void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.
11732 
11733     StubRoutines::_forward_exception_entry = generate_forward_exception();
11734 
11735     StubRoutines::_call_stub_entry =
11736       generate_call_stub(StubRoutines::_call_stub_return_address);
11737 
11738     // is referenced by megamorphic call
11739     StubRoutines::_catch_exception_entry = generate_catch_exception();
11740 
11741     // Initialize table for copy memory (arraycopy) check.
11742     if (UnsafeMemoryAccess::_table == nullptr) {
11743       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11744     }
11745 
11746     if (UseCRC32Intrinsics) {
11747       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11748     }
11749 
11750     if (UseCRC32CIntrinsics) {
11751       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11752     }
11753 
11754     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11755       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11756     }
11757 
11758     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11759       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11760     }
11761 
11762     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11763         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11764       StubRoutines::_hf2f = generate_float16ToFloat();
11765       StubRoutines::_f2hf = generate_floatToFloat16();
11766     }
11767   }
11768 
11769   void generate_continuation_stubs() {
11770     // Continuation stubs:
11771     StubRoutines::_cont_thaw          = generate_cont_thaw();
11772     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11773     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11774     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11775   }
11776 
11777   void generate_final_stubs() {
11778     // support for verify_oop (must happen after universe_init)
11779     if (VerifyOops) {
11780       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11781     }
11782 
11783     // arraycopy stubs used by compilers
11784     generate_arraycopy_stubs();
11785 
11786     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11787 
11788     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11789 
11790     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11791     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11792 
11793 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11794 
11795     generate_atomic_entry_points();
11796 
11797 #endif // LINUX
11798 
11799 #ifdef COMPILER2
11800     if (UseSecondarySupersTable) {
11801       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11802       if (! InlineSecondarySupersTest) {
11803         generate_lookup_secondary_supers_table_stub();
11804       }
11805     }
11806 #endif
11807 
11808     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11809 
    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11811   }
11812 
11813   void generate_compiler_stubs() {
11814 #if COMPILER2_OR_JVMCI
11815 
11816     if (UseSVE == 0) {
11817       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11818     }
11819 
11820     // array equals stub for large arrays.
11821     if (!UseSimpleArrayEquals) {
11822       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11823     }
11824 
    // arrays_hashcode stub for large arrays.
11826     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11827     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11828     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11829     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11830     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11831 
11832     // byte_array_inflate stub for large arrays.
11833     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11834 
11835     // countPositives stub for large arrays.
11836     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11837 
11838     generate_compare_long_strings();
11839 
11840     generate_string_indexof_stubs();
11841 
11842 #ifdef COMPILER2
11843     if (UseMultiplyToLenIntrinsic) {
11844       StubRoutines::_multiplyToLen = generate_multiplyToLen();
11845     }
11846 
11847     if (UseSquareToLenIntrinsic) {
11848       StubRoutines::_squareToLen = generate_squareToLen();
11849     }
11850 
11851     if (UseMulAddIntrinsic) {
11852       StubRoutines::_mulAdd = generate_mulAdd();
11853     }
11854 
11855     if (UseSIMDForBigIntegerShiftIntrinsics) {
11856       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11857       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
11858     }
11859 
11860     if (UseMontgomeryMultiplyIntrinsic) {
11861       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
11862       StubCodeMark mark(this, stub_id);
11863       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11864       StubRoutines::_montgomeryMultiply = g.generate_multiply();
11865     }
11866 
11867     if (UseMontgomerySquareIntrinsic) {
11868       StubId stub_id = StubId::stubgen_montgomerySquare_id;
11869       StubCodeMark mark(this, stub_id);
11870       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11871       // We use generate_multiply() rather than generate_square()
11872       // because it's faster for the sizes of modulus we care about.
11873       StubRoutines::_montgomerySquare = g.generate_multiply();
11874     }
11875 
11876 #endif // COMPILER2
11877 
11878     if (UseChaCha20Intrinsics) {
11879       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11880     }
11881 
11882     if (UseKyberIntrinsics) {
11883       StubRoutines::_kyberNtt = generate_kyberNtt();
11884       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11885       StubRoutines::_kyberNttMult = generate_kyberNttMult();
11886       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11887       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11888       StubRoutines::_kyber12To16 = generate_kyber12To16();
11889       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11890     }
11891 
11892     if (UseDilithiumIntrinsics) {
11893       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11894       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11895       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11896       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11897       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11898     }
11899 
11900     if (UseBASE64Intrinsics) {
11901         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11902         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11903     }
11904 
11905     // data cache line writeback
11906     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11907     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11908 
11909     if (UseAESIntrinsics) {
11910       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11911       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11912       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11913       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11914       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11915     }
11916     if (UseGHASHIntrinsics) {
11917       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11918       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11919     }
11920     if (UseAESIntrinsics && UseGHASHIntrinsics) {
11921       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11922     }
11923 
11924     if (UseMD5Intrinsics) {
11925       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
11926       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
11927     }
11928     if (UseSHA1Intrinsics) {
11929       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
11930       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
11931     }
11932     if (UseSHA256Intrinsics) {
11933       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
11934       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
11935     }
11936     if (UseSHA512Intrinsics) {
11937       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
11938       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
11939     }
11940     if (UseSHA3Intrinsics) {
11941 
11942       StubRoutines::_double_keccak         = generate_double_keccak();
11943       if (UseSIMDForSHA3Intrinsic) {
11944          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
11945          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
11946       } else {
11947          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
11948          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
11949       }
11950     }
11951 
11952     if (UsePoly1305Intrinsics) {
11953       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11954     }
11955 
11956     // generate Adler32 intrinsics code
11957     if (UseAdler32Intrinsics) {
11958       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11959     }
11960 
11961 #endif // COMPILER2_OR_JVMCI
11962   }
11963 
11964  public:
11965   StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
11966     switch(blob_id) {
11967     case BlobId::stubgen_preuniverse_id:
11968       generate_preuniverse_stubs();
11969       break;
11970     case BlobId::stubgen_initial_id:
11971       generate_initial_stubs();
11972       break;
    case BlobId::stubgen_continuation_id:
11974       generate_continuation_stubs();
11975       break;
11976     case BlobId::stubgen_compiler_id:
11977       generate_compiler_stubs();
11978       break;
11979     case BlobId::stubgen_final_id:
11980       generate_final_stubs();
11981       break;
11982     default:
11983       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
11984       break;
11985     };
11986   }
11987 }; // end class declaration
11988 
11989 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
11990   StubGenerator g(code, blob_id);
11991 }
11992 
11993 
11994 #if defined (LINUX)
11995 
11996 // Define pointers to atomic stubs and initialize them to point to the
11997 // code in atomic_aarch64.S.
11998 
11999 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
12000   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12001     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
12002   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12003     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
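
// For example, DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) expands to:
//
//   extern "C" uint64_t aarch64_atomic_cmpxchg_4_release_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_release_impl
//     = aarch64_atomic_cmpxchg_4_release_default_impl;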
12004 
12005 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12006 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12007 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12008 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12009 DEFAULT_ATOMIC_OP(xchg, 4, )
12010 DEFAULT_ATOMIC_OP(xchg, 8, )
12011 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12012 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12013 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12014 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12015 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12016 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12017 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12018 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12019 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12020 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12021 
12022 #undef DEFAULT_ATOMIC_OP
12023 
12024 #endif // LINUX