/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/intpow.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
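  //
  // For reference, the VM reaches this stub from C++ through a
  // function pointer; a simplified sketch of the call site in
  // JavaCalls::call_helper (runtime/javaCalls.cpp) looks roughly like
  //
  //   StubRoutines::call_stub()(
  //     (address)&link,             // c_rarg0: call wrapper
  //     result_val_address,         // c_rarg1: where to store the result
  //     result_type,                // c_rarg2: BasicType of the result
  //     method(),                   // c_rarg3: Method*
  //     entry_point,                // c_rarg4: (interpreter) entry point
  //     parameter_address,          // c_rarg5: first parameter word
  //     args->size_of_parameters(), // c_rarg6: count in words
  //     CHECK);                     // c_rarg7: current thread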
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread (r7)          ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -28,

    fpcr_off           = sp_after_call_off,
    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubGenStubId stub_id = StubGenStubId::call_stub_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);
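    // (FPCR field layout, per the Arm architecture manual: RMode is
    // bits 23:22, FZ is bit 24 and DN is bit 25, so the first bfi
    // clears bits 25:22; the trap-enable bits IOE/DZE/OFE/UFE/IXE
    // occupy bits 12:8, which the second bfi clears.)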
    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);
    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,  d15_save);
    __ ldpd(v13, v12,  d13_save);
    __ ldpd(v11, v10,  d11_save);
    __ ldpd(v9,  v8,   d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    // restore fpcr
    __ ldr(rscratch1,  fpcr_save);
    __ set_fpcr(rscratch1);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubGenStubId stub_id = StubGenStubId::catch_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubGenStubId stub_id = StubGenStubId::forward_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to r19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {
    StubGenStubId stub_id = StubGenStubId::verify_oop_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // Generate indices for iota vector.
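  // Each 16-byte row below is a constant index vector: loaded into a
  // SIMD register it yields lane i == i for the given element size
  // (B/H/S/D), while the FP rows hold 0.0f, 1.0f, ... and 0.0d, 1.0d
  // instead.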
  address generate_iota_indices(StubGenStubId stub_id) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.
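  //
  // Where the CPU allows it, the bulk zeroing is done with DC ZVA,
  // which zeroes a whole aligned block of VM_Version::zva_length()
  // bytes (typically 64) per instruction rather than with discrete
  // stp stores.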

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs.
  class ArrayCopyBarrierSetHelper : StackObj {
    BarrierSetAssembler* _bs_asm;
    MacroAssembler* _masm;
    DecoratorSet _decorators;
    BasicType _type;
    Register _gct1;
    Register _gct2;
    Register _gct3;
    FloatRegister _gcvt1;
    FloatRegister _gcvt2;
    FloatRegister _gcvt3;

  public:
    ArrayCopyBarrierSetHelper(MacroAssembler* masm,
                              DecoratorSet decorators,
                              BasicType type,
                              Register gct1,
                              Register gct2,
                              Register gct3,
                              FloatRegister gcvt1,
                              FloatRegister gcvt2,
                              FloatRegister gcvt3)
      : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
        _masm(masm),
        _decorators(decorators),
        _type(type),
        _gct1(gct1),
        _gct2(gct2),
        _gct3(gct3),
        _gcvt1(gcvt1),
        _gcvt2(gcvt2),
        _gcvt3(gcvt3) {
    }

    void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
                            dst1, dst2, src,
                            _gct1, _gct2, _gcvt1);
    }

    void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
    }

    void copy_load_at_16(Register dst1, Register dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
                            dst1, dst2, src,
                            _gct1);
    }

    void copy_store_at_16(Address dst, Register src1, Register src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3);
    }

    void copy_load_at_8(Register dst, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
                            dst, noreg, src,
                            _gct1);
    }

    void copy_store_at_8(Address dst, Register src) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
                             dst, src, noreg,
                             _gct1, _gct2, _gct3);
    }
  };
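
  // Typical use (see generate_copy_longs and copy_memory below): build
  // one helper per stub with the stub's DecoratorSet and scratch
  // registers, then issue bs.copy_load_at_16/bs.copy_store_at_16 (or
  // the 8- and 32-byte forms) instead of raw ldp/stp so that a
  // collector with load or store barriers (e.g. ZGC) can instrument
  // each access.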

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
    BasicType type;
    copy_direction direction;

    switch (stub_id) {
    case copy_byte_f_id:
      direction = copy_forwards;
      type = T_BYTE;
      break;
    case copy_byte_b_id:
      direction = copy_backwards;
      type = T_BYTE;
      break;
    case copy_oop_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case copy_oop_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    case copy_oop_uninit_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case copy_oop_uninit_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    default:
      ShouldNotReachHere();
    }

    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
       use_stride = prefetch > 256;
       prefetch = -prefetch;
       if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1, which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
         use_stride = prefetch > 256;
         prefetch = -prefetch;
         if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
       // allowing for the offset of -8 the store instructions place
       // registers into the target 64 bit block at the following
       // offsets
       //
       // t0 at offset 0
       // t1 at offset 8,  t2 at offset 16
       // t3 at offset 24, t4 at offset 32
       // t5 at offset 40, t6 at offset 48
       // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
       // d was not offset when we started so the registers are
       // written into the 64 bit block preceding d with the following
       // offsets
       //
       // t1 at offset -8
       // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
       // t7 at offset -56, t4 at offset -48
       //                   t6 at offset -64
       //
       // note that this matches the offsets previously noted for the
       // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining partial block, which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
       // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
       // but note that the offsets and registers still follow the
       // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
       // this is the same as above but copying only 2 longs hence
       // there is no intervening stp between the str instructions
       // but note that the offset and register patterns are still
       // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

       // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written

       if (direction == copy_forwards) {
         __ add(s, s, 16);
         __ add(d, d, 8);
       }

      }

      __ ret(lr);
      }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.
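  //
  // For example, with byte granularity a residual count of 13 (0b1101)
  // copies 8 + 4 + 1 bytes: the tbz chain below tests one bit per
  // size, taking the word, int and byte branches and skipping the
  // short one.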

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;
  Label copy_obj_f, copy_obj_b;
  Label copy_obj_uninit_f, copy_obj_uninit_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //
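  // For example, copying an int[] (granularity 4) of 20 elements moves
  // 80 bytes, which is handled entirely on the inline paths below;
  // only copies larger than 80 (96 with SIMD) bytes fall through to
  // copy_big and the bulk loop.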

  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes are copied inline. Direction doesn't matter because we
    // always load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    const Register send = r17, dend = r16;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
 1364           // byte 3 times.
 1365           __ lsr(count, count, 1);
 1366           __ ldrb(t0, Address(s, 0));
 1367           __ ldrb(t1, Address(send, -1));
 1368           __ ldrb(t2, Address(s, count));
 1369           __ strb(t0, Address(d, 0));
 1370           __ strb(t1, Address(dend, -1));
 1371           __ strb(t2, Address(d, count));
 1372         }
 1373         __ b(finish);
 1374       }
 1375     }
 1376 
 1377     __ bind(copy_big);
 1378     if (is_backwards) {
 1379       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1380       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1381     }
 1382 
 1383     // Now we've got the small case out of the way we can align the
 1384     // source address on a 2-word boundary.
 1385 
 1386     // Here we will materialize a count in r15, which is used by copy_memory_small
 1387     // and the various generate_copy_longs stubs that we use for 2-word-aligned copies.
 1388     // Up until here, we have used t9, which aliases r15, but from here on, that
 1389     // register cannot be used as a temp register, as it contains the count.
 1390 
 1391     Label aligned;
 1392 
 1393     if (is_aligned) {
 1394       // We may have to adjust by 1 word to get s 2-word-aligned.
 1395       __ tbz(s, exact_log2(wordSize), aligned);
 1396       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1397       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1398       __ sub(count, count, wordSize/granularity);
 1399     } else {
 1400       if (is_backwards) {
 1401         __ andr(r15, s, 2 * wordSize - 1);
 1402       } else {
 1403         __ neg(r15, s);
 1404         __ andr(r15, r15, 2 * wordSize - 1);
 1405       }
 1406       // r15 is the byte adjustment needed to align s.
 1407       __ cbz(r15, aligned);
 1408       int shift = exact_log2(granularity);
 1409       if (shift > 0) {
 1410         __ lsr(r15, r15, shift);
 1411       }
 1412       __ sub(count, count, r15);
 1413 
 1414 #if 0
 1415       // ?? This code is only correct for a disjoint copy.  It may or
 1416       // may not make sense to use it in that case.
 1417 
 1418       // Copy the first pair; s and d may not be aligned.
 1419       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1420       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1421 
 1422       // Align s and d, adjust count
 1423       if (is_backwards) {
 1424         __ sub(s, s, r15);
 1425         __ sub(d, d, r15);
 1426       } else {
 1427         __ add(s, s, r15);
 1428         __ add(d, d, r15);
 1429       }
 1430 #else
 1431       copy_memory_small(decorators, type, s, d, r15, step);
 1432 #endif
 1433     }
 1434 
 1435     __ bind(aligned);
 1436 
 1437     // s is now 2-word-aligned.
 1438 
 1439     // We have a count of units and some trailing bytes. Adjust the
 1440     // count and do a bulk copy of words. If the shift is zero,
 1441     // perform a move instead to benefit from zero-latency moves.
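          // e.g. for a jshort copy (granularity == 2): shift == exact_log2(8/2) == 2,
          // so r15 == count >> 2, the number of 8-byte words for the bulk copy.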
 1442     int shift = exact_log2(wordSize/granularity);
 1443     if (shift > 0) {
 1444       __ lsr(r15, count, shift);
 1445     } else {
 1446       __ mov(r15, count);
 1447     }
 1448     if (direction == copy_forwards) {
 1449       if (type != T_OBJECT) {
 1450         __ bl(copy_f);
 1451       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1452         __ bl(copy_obj_uninit_f);
 1453       } else {
 1454         __ bl(copy_obj_f);
 1455       }
 1456     } else {
 1457       if (type != T_OBJECT) {
 1458         __ bl(copy_b);
 1459       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1460         __ bl(copy_obj_uninit_b);
 1461       } else {
 1462         __ bl(copy_obj_b);
 1463       }
 1464     }
 1465 
 1466     // And the tail.
 1467     copy_memory_small(decorators, type, s, d, count, step);
 1468 
 1469     if (granularity >= 8) __ bind(copy8);
 1470     if (granularity >= 4) __ bind(copy4);
 1471     __ bind(finish);
 1472   }
 1473 
 1474 
 1475   void clobber_registers() {
 1476 #ifdef ASSERT
 1477     RegSet clobbered
 1478       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1479     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1480     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1481     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1482       __ mov(*it, rscratch1);
 1483     }
 1484 #endif
 1486   }
 1487 
 1488   // Scan over array at a for count oops, verifying each one.
 1489   // Preserves a and count; clobbers temp, rscratch1 and rscratch2.
 1490   void verify_oop_array(int size, Register a, Register count, Register temp) {
 1491     Label loop, end;
 1492     __ mov(rscratch1, a);
 1493     __ mov(rscratch2, zr);
 1494     __ bind(loop);
 1495     __ cmp(rscratch2, count);
 1496     __ br(Assembler::HS, end);
 1497     if (size == wordSize) {
 1498       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1499       __ verify_oop(temp);
 1500     } else {
 1501       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1502       __ decode_heap_oop(temp); // calls verify_oop
 1503     }
 1504     __ add(rscratch2, rscratch2, 1);
 1505     __ b(loop);
 1506     __ bind(end);
 1507   }
 1508 
 1509   // Arguments:
 1510   //   stub_id - is used to name the stub and identify all details of
 1511   //             how to perform the copy.
 1512   //
 1513   //   entry - if non-null, *entry is set to the stub's post-push
 1514   //           entry point
 1515   //
 1516   // Inputs:
 1517   //   c_rarg0   - source array address
 1518   //   c_rarg1   - destination array address
 1519   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1520   //
 1521   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1522   // the hardware handle it.  The two dwords within qwords that span
 1523   // cache line boundaries will still be loaded and stored atomically.
 1524   //
 1525   // Side Effects: entry is set to the (post-push) entry point so it
 1526   //               can be used by the corresponding conjoint copy
 1527   //               method
 1528   //
 1529   address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
 1530     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1531     RegSet saved_reg = RegSet::of(s, d, count);
 1532     int size;
 1533     bool aligned;
 1534     bool is_oop;
 1535     bool dest_uninitialized;
 1536     switch (stub_id) {
 1537     case jbyte_disjoint_arraycopy_id:
 1538       size = sizeof(jbyte);
 1539       aligned = false;
 1540       is_oop = false;
 1541       dest_uninitialized = false;
 1542       break;
 1543     case arrayof_jbyte_disjoint_arraycopy_id:
 1544       size = sizeof(jbyte);
 1545       aligned = true;
 1546       is_oop = false;
 1547       dest_uninitialized = false;
 1548       break;
 1549     case jshort_disjoint_arraycopy_id:
 1550       size = sizeof(jshort);
 1551       aligned = false;
 1552       is_oop = false;
 1553       dest_uninitialized = false;
 1554       break;
 1555     case arrayof_jshort_disjoint_arraycopy_id:
 1556       size = sizeof(jshort);
 1557       aligned = true;
 1558       is_oop = false;
 1559       dest_uninitialized = false;
 1560       break;
 1561     case jint_disjoint_arraycopy_id:
 1562       size = sizeof(jint);
 1563       aligned = false;
 1564       is_oop = false;
 1565       dest_uninitialized = false;
 1566       break;
 1567     case arrayof_jint_disjoint_arraycopy_id:
 1568       size = sizeof(jint);
 1569       aligned = true;
 1570       is_oop = false;
 1571       dest_uninitialized = false;
 1572       break;
 1573     case jlong_disjoint_arraycopy_id:
 1574       // since this is always aligned, we can (should!) use the same
 1575       // stub as for case arrayof_jlong_disjoint_arraycopy
 1576       ShouldNotReachHere();
 1577       break;
 1578     case arrayof_jlong_disjoint_arraycopy_id:
 1579       size = sizeof(jlong);
 1580       aligned = true;
 1581       is_oop = false;
 1582       dest_uninitialized = false;
 1583       break;
 1584     case oop_disjoint_arraycopy_id:
 1585       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1586       aligned = !UseCompressedOops;
 1587       is_oop = true;
 1588       dest_uninitialized = false;
 1589       break;
 1590     case arrayof_oop_disjoint_arraycopy_id:
 1591       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1592       aligned = !UseCompressedOops;
 1593       is_oop = true;
 1594       dest_uninitialized = false;
 1595       break;
 1596     case oop_disjoint_arraycopy_uninit_id:
 1597       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1598       aligned = !UseCompressedOops;
 1599       is_oop = true;
 1600       dest_uninitialized = true;
 1601       break;
 1602     case arrayof_oop_disjoint_arraycopy_uninit_id:
 1603       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1604       aligned = !UseCompressedOops;
 1605       is_oop = true;
 1606       dest_uninitialized = true;
 1607       break;
 1608     default:
 1609       ShouldNotReachHere();
 1610       break;
 1611     }
 1612 
 1613     __ align(CodeEntryAlignment);
 1614     StubCodeMark mark(this, stub_id);
 1615     address start = __ pc();
 1616     __ enter();
 1617 
 1618     if (entry != nullptr) {
 1619       *entry = __ pc();
 1620       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1621       BLOCK_COMMENT("Entry:");
 1622     }
 1623 
 1624     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1625     if (dest_uninitialized) {
 1626       decorators |= IS_DEST_UNINITIALIZED;
 1627     }
 1628     if (aligned) {
 1629       decorators |= ARRAYCOPY_ALIGNED;
 1630     }
 1631 
 1632     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1633     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1634 
 1635     if (is_oop) {
 1636       // save regs before copy_memory
 1637       __ push(RegSet::of(d, count), sp);
 1638     }
 1639     {
 1640       // UnsafeMemoryAccess page error: continue after unsafe access
 1641       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1642       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1643       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1644     }
 1645 
 1646     if (is_oop) {
 1647       __ pop(RegSet::of(d, count), sp);
 1648       if (VerifyOops)
 1649         verify_oop_array(size, d, count, r16);
 1650     }
 1651 
 1652     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1653 
 1654     __ leave();
 1655     __ mov(r0, zr); // return 0
 1656     __ ret(lr);
 1657     return start;
 1658   }
 1659 
 1660   // Arguments:
 1661   //   stub_id - is used to name the stub and identify all details of
 1662   //             how to perform the copy.
 1663   //
 1664   //   nooverlap_target - identifies the (post-push) entry for the
 1665   //             corresponding disjoint copy routine, which can be
 1666   //             jumped to if the ranges do not actually overlap
 1667   //
 1668   //   entry - if non-null, *entry is set to the stub's post-push
 1669   //           entry point
 1670   //
 1672   // Inputs:
 1673   //   c_rarg0   - source array address
 1674   //   c_rarg1   - destination array address
 1675   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1676   //
 1677   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1678   // the hardware handle it.  The two dwords within qwords that span
 1679   // cache line boundaries will still be loaded and stored atomically.
 1680   //
 1681   // Side Effects:
 1682   //   entry is set to the no-overlap entry point so it can be used by
 1683   //   some other conjoint copy method
 1684   //
 1685   address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
 1686     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1687     RegSet saved_regs = RegSet::of(s, d, count);
 1688     int size;
 1689     bool aligned;
 1690     bool is_oop;
 1691     bool dest_uninitialized;
 1692     switch (stub_id) {
 1693     case jbyte_arraycopy_id:
 1694       size = sizeof(jbyte);
 1695       aligned = false;
 1696       is_oop = false;
 1697       dest_uninitialized = false;
 1698       break;
 1699     case arrayof_jbyte_arraycopy_id:
 1700       size = sizeof(jbyte);
 1701       aligned = true;
 1702       is_oop = false;
 1703       dest_uninitialized = false;
 1704       break;
 1705     case jshort_arraycopy_id:
 1706       size = sizeof(jshort);
 1707       aligned = false;
 1708       is_oop = false;
 1709       dest_uninitialized = false;
 1710       break;
 1711     case arrayof_jshort_arraycopy_id:
 1712       size = sizeof(jshort);
 1713       aligned = true;
 1714       is_oop = false;
 1715       dest_uninitialized = false;
 1716       break;
 1717     case jint_arraycopy_id:
 1718       size = sizeof(jint);
 1719       aligned = false;
 1720       is_oop = false;
 1721       dest_uninitialized = false;
 1722       break;
 1723     case arrayof_jint_arraycopy_id:
 1724       size = sizeof(jint);
 1725       aligned = true;
 1726       is_oop = false;
 1727       dest_uninitialized = false;
 1728       break;
 1729     case jlong_arraycopy_id:
 1730       // since this is always aligned, we can (should!) use the same
 1731       // stub as for case arrayof_jlong_arraycopy
 1732       ShouldNotReachHere();
 1733       break;
 1734     case arrayof_jlong_arraycopy_id:
 1735       size = sizeof(jlong);
 1736       aligned = true;
 1737       is_oop = false;
 1738       dest_uninitialized = false;
 1739       break;
 1740     case oop_arraycopy_id:
 1741       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1742       aligned = !UseCompressedOops;
 1743       is_oop = true;
 1744       dest_uninitialized = false;
 1745       break;
 1746     case arrayof_oop_arraycopy_id:
 1747       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1748       aligned = !UseCompressedOops;
 1749       is_oop = true;
 1750       dest_uninitialized = false;
 1751       break;
 1752     case oop_arraycopy_uninit_id:
 1753       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1754       aligned = !UseCompressedOops;
 1755       is_oop = true;
 1756       dest_uninitialized = true;
 1757       break;
 1758     case arrayof_oop_arraycopy_uninit_id:
 1759       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1760       aligned = !UseCompressedOops;
 1761       is_oop = true;
 1762       dest_uninitialized = true;
 1763       break;
 1764     default:
 1765       ShouldNotReachHere();
 1766     }
 1767 
 1768     StubCodeMark mark(this, stub_id);
 1769     address start = __ pc();
 1770     __ enter();
 1771 
 1772     if (entry != nullptr) {
 1773       *entry = __ pc();
 1774       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1775       BLOCK_COMMENT("Entry:");
 1776     }
 1777 
 1778     // use fwd copy when (d-s) above_equal (count*size)
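          // This unsigned compare also covers d < s: the subtraction wraps to
          // a large unsigned value that is above_equal count*size, so we still
          // take the disjoint (forward) copy, which is safe when the
          // destination precedes the source.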
 1779     __ sub(rscratch1, d, s);
 1780     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1781     __ br(Assembler::HS, nooverlap_target);
 1782 
 1783     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1784     if (dest_uninitialized) {
 1785       decorators |= IS_DEST_UNINITIALIZED;
 1786     }
 1787     if (aligned) {
 1788       decorators |= ARRAYCOPY_ALIGNED;
 1789     }
 1790 
 1791     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1792     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1793 
 1794     if (is_oop) {
 1795       // save regs before copy_memory
 1796       __ push(RegSet::of(d, count), sp);
 1797     }
 1798     {
 1799       // UnsafeMemoryAccess page error: continue after unsafe access
 1800       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1801       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1802       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1803     }
 1804     if (is_oop) {
 1805       __ pop(RegSet::of(d, count), sp);
 1806       if (VerifyOops)
 1807         verify_oop_array(size, d, count, r16);
 1808     }
 1809     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1810     __ leave();
 1811     __ mov(r0, zr); // return 0
 1812     __ ret(lr);
 1813     return start;
 1814   }
 1815 
 1816   // Helper for generating a dynamic type check.
 1817   // Smashes rscratch1, rscratch2.
 1818   void generate_type_check(Register sub_klass,
 1819                            Register super_check_offset,
 1820                            Register super_klass,
 1821                            Register temp1,
 1822                            Register temp2,
 1823                            Register result,
 1824                            Label& L_success) {
 1825     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1826 
 1827     BLOCK_COMMENT("type_check:");
 1828 
 1829     Label L_miss;
 1830 
 1831     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1832                                      super_check_offset);
 1833     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1834 
 1835     // Fall through on failure!
 1836     __ BIND(L_miss);
 1837   }
 1838 
 1839   //
 1840   //  Generate checkcasting array copy stub
 1841   //
 1842   //  Input:
 1843   //    c_rarg0   - source array address
 1844   //    c_rarg1   - destination array address
 1845   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1846   //    c_rarg3   - size_t ckoff (super_check_offset)
 1847   //    c_rarg4   - oop ckval (super_klass)
 1848   //
 1849   //  Output:
 1850   //    r0 ==  0  -  success
 1851   //    r0 == -1^K - failure, where K is partial transfer count
 1852   //
 1853   address generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
 1854     bool dest_uninitialized;
 1855     switch (stub_id) {
 1856     case checkcast_arraycopy_id:
 1857       dest_uninitialized = false;
 1858       break;
 1859     case checkcast_arraycopy_uninit_id:
 1860       dest_uninitialized = true;
 1861       break;
 1862     default:
 1863       ShouldNotReachHere();
 1864     }
 1865 
 1866     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1867 
 1868     // Input registers (after setup_arg_regs)
 1869     const Register from        = c_rarg0;   // source array address
 1870     const Register to          = c_rarg1;   // destination array address
 1871     const Register count       = c_rarg2;   // elements count
 1872     const Register ckoff       = c_rarg3;   // super_check_offset
 1873     const Register ckval       = c_rarg4;   // super_klass
 1874 
 1875     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1876     RegSet wb_post_saved_regs = RegSet::of(count);
 1877 
 1878     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1879     const Register copied_oop  = r22;       // actual oop copied
 1880     const Register count_save  = r21;       // orig elements count
 1881     const Register start_to    = r20;       // destination array start address
 1882     const Register r19_klass   = r19;       // oop._klass
 1883 
 1884     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1885     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1886 
 1887     //---------------------------------------------------------------
 1888     // Assembler stub will be used for this call to arraycopy
 1889     // if the two arrays are subtypes of Object[] but the
 1890     // destination array type is not equal to or a supertype
 1891     // of the source type.  Each element must be separately
 1892     // checked.
 1893 
 1894     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1895                                copied_oop, r19_klass, count_save);
 1896 
 1897     __ align(CodeEntryAlignment);
 1898     StubCodeMark mark(this, stub_id);
 1899     address start = __ pc();
 1900 
 1901     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1902 
 1903 #ifdef ASSERT
 1904     // caller guarantees that the arrays really are different
 1905     // otherwise, we would have to make conjoint checks
 1906     { Label L;
 1907       __ b(L);                  // conjoint check not yet implemented
 1908       __ stop("checkcast_copy within a single array");
 1909       __ bind(L);
 1910     }
 1911 #endif //ASSERT
 1912 
 1913     // Caller of this entry point must set up the argument registers.
 1914     if (entry != nullptr) {
 1915       *entry = __ pc();
 1916       BLOCK_COMMENT("Entry:");
 1917     }
 1918 
 1919     // Empty array: nothing to do.
 1920     __ cbz(count, L_done);
 1921     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1922 
 1923 #ifdef ASSERT
 1924     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1925     // The ckoff and ckval must be mutually consistent,
 1926     // even though caller generates both.
 1927     { Label L;
 1928       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1929       __ ldrw(start_to, Address(ckval, sco_offset));
 1930       __ cmpw(ckoff, start_to);
 1931       __ br(Assembler::EQ, L);
 1932       __ stop("super_check_offset inconsistent");
 1933       __ bind(L);
 1934     }
 1935 #endif //ASSERT
 1936 
 1937     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1938     bool is_oop = true;
 1939     int element_size = UseCompressedOops ? 4 : 8;
 1940     if (dest_uninitialized) {
 1941       decorators |= IS_DEST_UNINITIALIZED;
 1942     }
 1943 
 1944     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1945     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1946 
 1947     // save the original count
 1948     __ mov(count_save, count);
 1949 
 1950     // Copy from low to high addresses
 1951     __ mov(start_to, to);              // Save destination array start address
 1952     __ b(L_load_element);
 1953 
 1954     // ======== begin loop ========
 1955     // (Loop is rotated; its entry is L_load_element.)
 1956     // Loop control:
 1957     //   for (; count != 0; count--) {
 1958     //     copied_oop = load_heap_oop(from++);
 1959     //     ... generate_type_check ...;
 1960     //     store_heap_oop(to++, copied_oop);
 1961     //   }
 1962     __ align(OptoLoopAlignment);
 1963 
 1964     __ BIND(L_store_element);
 1965     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1966                       __ post(to, element_size), copied_oop, noreg,
 1967                       gct1, gct2, gct3);
 1968     __ sub(count, count, 1);
 1969     __ cbz(count, L_do_card_marks);
 1970 
 1971     // ======== loop entry is here ========
 1972     __ BIND(L_load_element);
 1973     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1974                      copied_oop, noreg, __ post(from, element_size),
 1975                      gct1);
 1976     __ cbz(copied_oop, L_store_element);
 1977 
 1978     __ load_klass(r19_klass, copied_oop); // query the object klass
 1979 
 1980     BLOCK_COMMENT("type_check:");
 1981     generate_type_check(/*sub_klass*/r19_klass,
 1982                         /*super_check_offset*/ckoff,
 1983                         /*super_klass*/ckval,
 1984                         /*r_array_base*/gct1,
 1985                         /*temp2*/gct2,
 1986                         /*result*/r10, L_store_element);
 1987 
 1988     // Fall through on failure!
 1989 
 1990     // ======== end loop ========
 1991 
 1992     // It was a real error; we must depend on the caller to finish the job.
 1993     // Register count = remaining oops, count_orig = total oops.
 1994     // Emit GC store barriers for the oops we have copied and report
 1995     // their number to the caller.
 1996 
 1997     __ subs(count, count_save, count);     // K = partially copied oop count
 1998     __ eon(count, count, zr);              // report (-1^K) to caller
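          // e.g. K == 2 oops copied before the failing element: r0 == ~2 == -3;
          // the caller recovers K as ~r0.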
 1999     __ br(Assembler::EQ, L_done_pop);
 2000 
 2001     __ BIND(L_do_card_marks);
 2002     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2003 
 2004     __ bind(L_done_pop);
 2005     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2006     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2007 
 2008     __ bind(L_done);
 2009     __ mov(r0, count);
 2010     __ leave();
 2011     __ ret(lr);
 2012 
 2013     return start;
 2014   }
 2015 
 2016   // Perform range checks on the proposed arraycopy.
 2017   // Kills temp and rscratch1, but nothing else.
 2018   // Also, clears the upper 32 bits of src_pos and dst_pos.
 2019   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2020                               Register src_pos, // source position (c_rarg1)
 2021                               Register dst,     // destination array oop (c_rarg2)
 2022                               Register dst_pos, // destination position (c_rarg3)
 2023                               Register length,
 2024                               Register temp,
 2025                               Label& L_failed) {
 2026     BLOCK_COMMENT("arraycopy_range_checks:");
 2027 
 2028     assert_different_registers(rscratch1, temp);
 2029 
 2030     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2031     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2032     __ addw(temp, length, src_pos);
 2033     __ cmpw(temp, rscratch1);
 2034     __ br(Assembler::HI, L_failed);
 2035 
 2036     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2037     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2038     __ addw(temp, length, dst_pos);
 2039     __ cmpw(temp, rscratch1);
 2040     __ br(Assembler::HI, L_failed);
 2041 
 2042     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
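          // (movw writes the 32-bit view of the register, which zero-extends
          // into the upper 32 bits.)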
 2043     __ movw(src_pos, src_pos);
 2044     __ movw(dst_pos, dst_pos);
 2045 
 2046     BLOCK_COMMENT("arraycopy_range_checks done");
 2047   }
 2048 
 2049   // These stubs get called from some dumb test routine.
 2050   // I'll write them properly when they're called from
 2051   // something that's actually doing something.
 2052   static void fake_arraycopy_stub(address src, address dst, int count) {
 2053     assert(count == 0, "huh?");
 2054   }
 2055 
 2056 
 2057   //
 2058   //  Generate 'unsafe' array copy stub
 2059   //  Though just as safe as the other stubs, it takes an unscaled
 2060   //  size_t argument instead of an element count.
 2061   //
 2062   //  Input:
 2063   //    c_rarg0   - source array address
 2064   //    c_rarg1   - destination array address
 2065   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2066   //
 2067   // Examines the alignment of the operands and dispatches
 2068   // to a long, int, short, or byte copy loop.
 2069   //
 2070   address generate_unsafe_copy(address byte_copy_entry,
 2071                                address short_copy_entry,
 2072                                address int_copy_entry,
 2073                                address long_copy_entry) {
 2074     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
 2075 
 2076     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2077     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2078 
 2079     __ align(CodeEntryAlignment);
 2080     StubCodeMark mark(this, stub_id);
 2081     address start = __ pc();
 2082     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2083 
 2084     // bump this on entry, not on exit:
 2085     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2086 
 2087     __ orr(rscratch1, s, d);
 2088     __ orr(rscratch1, rscratch1, count);
 2089 
 2090     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2091     __ cbz(rscratch1, L_long_aligned);
 2092     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2093     __ cbz(rscratch1, L_int_aligned);
 2094     __ tbz(rscratch1, 0, L_short_aligned);
 2095     __ b(RuntimeAddress(byte_copy_entry));
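          // e.g. if the low three bits of s|d|count are 0b100, the 8-byte
          // mask gives 4 (not long-aligned) but the 4-byte mask gives 0,
          // so we dispatch to the int copy with the byte count scaled to
          // an int count.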
 2096 
 2097     __ BIND(L_short_aligned);
 2098     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2099     __ b(RuntimeAddress(short_copy_entry));
 2100     __ BIND(L_int_aligned);
 2101     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2102     __ b(RuntimeAddress(int_copy_entry));
 2103     __ BIND(L_long_aligned);
 2104     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2105     __ b(RuntimeAddress(long_copy_entry));
 2106 
 2107     return start;
 2108   }
 2109 
 2110   //
 2111   //  Generate generic array copy stubs
 2112   //
 2113   //  Input:
 2114   //    c_rarg0    -  src oop
 2115   //    c_rarg1    -  src_pos (32-bits)
 2116   //    c_rarg2    -  dst oop
 2117   //    c_rarg3    -  dst_pos (32-bits)
 2118   //    c_rarg4    -  element count (32-bits)
 2119   //
 2120   //  Output:
 2121   //    r0 ==  0  -  success
 2122   //    r0 == -1^K - failure, where K is partial transfer count
 2123   //
 2124   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2125                                 address int_copy_entry, address oop_copy_entry,
 2126                                 address long_copy_entry, address checkcast_copy_entry) {
 2127     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
 2128 
 2129     Label L_failed, L_objArray;
 2130     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2131 
 2132     // Input registers
 2133     const Register src        = c_rarg0;  // source array oop
 2134     const Register src_pos    = c_rarg1;  // source position
 2135     const Register dst        = c_rarg2;  // destination array oop
 2136     const Register dst_pos    = c_rarg3;  // destination position
 2137     const Register length     = c_rarg4;
 2138 
 2139 
 2140     // Registers used as temps
 2141     const Register dst_klass  = c_rarg5;
 2142 
 2143     __ align(CodeEntryAlignment);
 2144 
 2145     StubCodeMark mark(this, stub_id);
 2146 
 2147     address start = __ pc();
 2148 
 2149     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2150 
 2151     // bump this on entry, not on exit:
 2152     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2153 
 2154     //-----------------------------------------------------------------------
 2155     // Assembler stub will be used for this call to arraycopy
 2156     // if the following conditions are met:
 2157     //
 2158     // (1) src and dst must not be null.
 2159     // (2) src_pos must not be negative.
 2160     // (3) dst_pos must not be negative.
 2161     // (4) length  must not be negative.
 2162     // (5) src klass and dst klass should be the same and not null.
 2163     // (6) src and dst should be arrays.
 2164     // (7) src_pos + length must not exceed length of src.
 2165     // (8) dst_pos + length must not exceed length of dst.
 2166     //
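          // If any of these checks fails, the stub returns -1 in r0 and the
          // caller is expected to fall back to a slower path; e.g. a call
          // with src_pos == -1 fails check (2) below.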
 2167 
 2168     //  if (src == nullptr) return -1;
 2169     __ cbz(src, L_failed);
 2170 
 2171     //  if (src_pos < 0) return -1;
 2172     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2173 
 2174     //  if (dst == nullptr) return -1;
 2175     __ cbz(dst, L_failed);
 2176 
 2177     //  if (dst_pos < 0) return -1;
 2178     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2179 
 2180     // registers used as temp
 2181     const Register scratch_length    = r16; // elements count to copy
 2182     const Register scratch_src_klass = r17; // array klass
 2183     const Register lh                = r15; // layout helper
 2184 
 2185     //  if (length < 0) return -1;
 2186     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2187     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2188 
 2189     __ load_klass(scratch_src_klass, src);
 2190 #ifdef ASSERT
 2191     //  assert(src->klass() != nullptr);
 2192     {
 2193       BLOCK_COMMENT("assert klasses not null {");
 2194       Label L1, L2;
 2195       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2196       __ bind(L1);
 2197       __ stop("broken null klass");
 2198       __ bind(L2);
 2199       __ load_klass(rscratch1, dst);
 2200       __ cbz(rscratch1, L1);     // this would be broken also
 2201       BLOCK_COMMENT("} assert klasses not null done");
 2202     }
 2203 #endif
 2204 
 2205     // Load layout helper (32-bits)
 2206     //
 2207     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2208     // 32        30    24            16              8     2                 0
 2209     //
 2210     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2211     //
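          // e.g. for a jint array the helper is roughly
          // (0x3 << 30) | (header_size << 16) | (T_INT << 8) | 2, where
          // header_size depends on the array header layout. Both array tags
          // set bit 31, which the sign-bit tests below rely on.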
 2212 
 2213     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2214 
 2215     // Handle objArrays completely differently...
 2216     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2217     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2218     __ movw(rscratch1, objArray_lh);
 2219     __ eorw(rscratch2, lh, rscratch1);
 2220     __ cbzw(rscratch2, L_objArray);
 2221 
 2222     //  if (src->klass() != dst->klass()) return -1;
 2223     __ load_klass(rscratch2, dst);
 2224     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2225     __ cbnz(rscratch2, L_failed);
 2226 
 2227     //  if (!src->is_Array()) return -1;
 2228     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2229 
 2230     // At this point, it is known to be a typeArray (array_tag 0x3).
 2231 #ifdef ASSERT
 2232     {
 2233       BLOCK_COMMENT("assert primitive array {");
 2234       Label L;
 2235       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2236       __ cmpw(lh, rscratch2);
 2237       __ br(Assembler::GE, L);
 2238       __ stop("must be a primitive array");
 2239       __ bind(L);
 2240       BLOCK_COMMENT("} assert primitive array done");
 2241     }
 2242 #endif
 2243 
 2244     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2245                            rscratch2, L_failed);
 2246 
 2247     // TypeArrayKlass
 2248     //
 2249     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2250     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2251     //
 2252 
 2253     const Register rscratch1_offset = rscratch1;    // array offset
 2254     const Register r15_elsize = lh; // element size
 2255 
 2256     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2257            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2258     __ add(src, src, rscratch1_offset);           // src array offset
 2259     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2260     BLOCK_COMMENT("choose copy loop based on element size");
 2261 
 2262     // The following registers must be set before the jump to the corresponding stub.
 2263     const Register from     = c_rarg0;  // source array address
 2264     const Register to       = c_rarg1;  // destination array address
 2265     const Register count    = c_rarg2;  // elements count
 2266 
 2267     // 'from', 'to' and 'count' must be set in this order: they alias 'src',
 2268     // 'src_pos' and 'dst', so each write clobbers an already-consumed input.
 2269 
 2270     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2271 
 2272     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2273     // size in bytes).  We do a simple bitwise binary search.
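          //   elsize 0b00 -> byte, 0b01 -> short, 0b10 -> int, 0b11 -> long:
          // bit 1 is tested first, then bit 0.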
 2274   __ BIND(L_copy_bytes);
 2275     __ tbnz(r15_elsize, 1, L_copy_ints);
 2276     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2277     __ lea(from, Address(src, src_pos));// src_addr
 2278     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2279     __ movw(count, scratch_length); // length
 2280     __ b(RuntimeAddress(byte_copy_entry));
 2281 
 2282   __ BIND(L_copy_shorts);
 2283     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2284     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2285     __ movw(count, scratch_length); // length
 2286     __ b(RuntimeAddress(short_copy_entry));
 2287 
 2288   __ BIND(L_copy_ints);
 2289     __ tbnz(r15_elsize, 0, L_copy_longs);
 2290     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2291     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2292     __ movw(count, scratch_length); // length
 2293     __ b(RuntimeAddress(int_copy_entry));
 2294 
 2295   __ BIND(L_copy_longs);
 2296 #ifdef ASSERT
 2297     {
 2298       BLOCK_COMMENT("assert long copy {");
 2299       Label L;
 2300       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2301       __ cmpw(r15_elsize, LogBytesPerLong);
 2302       __ br(Assembler::EQ, L);
 2303       __ stop("must be long copy, but elsize is wrong");
 2304       __ bind(L);
 2305       BLOCK_COMMENT("} assert long copy done");
 2306     }
 2307 #endif
 2308     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2309     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2310     __ movw(count, scratch_length); // length
 2311     __ b(RuntimeAddress(long_copy_entry));
 2312 
 2313     // ObjArrayKlass
 2314   __ BIND(L_objArray);
 2315     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2316 
 2317     Label L_plain_copy, L_checkcast_copy;
 2318     //  test array classes for subtyping
 2319     __ load_klass(r15, dst);
 2320     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2321     __ br(Assembler::NE, L_checkcast_copy);
 2322 
 2323     // Identically typed arrays can be copied without element-wise checks.
 2324     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2325                            rscratch2, L_failed);
 2326 
 2327     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2328     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2329     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2330     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2331     __ movw(count, scratch_length); // length
 2332   __ BIND(L_plain_copy);
 2333     __ b(RuntimeAddress(oop_copy_entry));
 2334 
 2335   __ BIND(L_checkcast_copy);
 2336     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2337     {
 2338       // Before looking at dst.length, make sure dst is also an objArray.
 2339       __ ldrw(rscratch1, Address(r15, lh_offset));
 2340       __ movw(rscratch2, objArray_lh);
 2341       __ eorw(rscratch1, rscratch1, rscratch2);
 2342       __ cbnzw(rscratch1, L_failed);
 2343 
 2344       // It is safe to examine both src.length and dst.length.
 2345       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2346                              r15, L_failed);
 2347 
 2348       __ load_klass(dst_klass, dst); // reload
 2349 
 2350       // Marshal the base address arguments now, freeing registers.
 2351       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2352       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2353       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2354       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2355       __ movw(count, length);           // length (reloaded)
 2356       Register sco_temp = c_rarg3;      // this register is free now
 2357       assert_different_registers(from, to, count, sco_temp,
 2358                                  dst_klass, scratch_src_klass);
 2359       // assert_clean_int(count, sco_temp);
 2360 
 2361       // Generate the type check.
 2362       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2363       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2364 
 2365       // Smashes rscratch1, rscratch2
 2366       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2367                           L_plain_copy);
 2368 
 2369       // Fetch destination element klass from the ObjArrayKlass header.
 2370       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2371       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2372       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2373 
 2374       // the checkcast_copy loop needs two extra arguments:
 2375       assert(c_rarg3 == sco_temp, "#3 already in place");
 2376       // Set up arguments for checkcast_copy_entry.
 2377       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2378       __ b(RuntimeAddress(checkcast_copy_entry));
 2379     }
 2380 
 2381   __ BIND(L_failed);
 2382     __ mov(r0, -1);
 2383     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2384     __ ret(lr);
 2385 
 2386     return start;
 2387   }
 2388 
 2389   //
 2390   // Generate stub for array fill. For the "arrayof" variants, the
 2391   // "to" address is assumed to be heapword aligned.
 2392   //
 2393   // Arguments for generated stub:
 2394   //   to:    c_rarg0
 2395   //   value: c_rarg1
 2396   //   count: c_rarg2 treated as signed
 2397   //
 2398   address generate_fill(StubGenStubId stub_id) {
 2399     BasicType t;
 2400     bool aligned;
 2401 
 2402     switch (stub_id) {
 2403     case jbyte_fill_id:
 2404       t = T_BYTE;
 2405       aligned = false;
 2406       break;
 2407     case jshort_fill_id:
 2408       t = T_SHORT;
 2409       aligned = false;
 2410       break;
 2411     case jint_fill_id:
 2412       t = T_INT;
 2413       aligned = false;
 2414       break;
 2415     case arrayof_jbyte_fill_id:
 2416       t = T_BYTE;
 2417       aligned = true;
 2418       break;
 2419     case arrayof_jshort_fill_id:
 2420       t = T_SHORT;
 2421       aligned = true;
 2422       break;
 2423     case arrayof_jint_fill_id:
 2424       t = T_INT;
 2425       aligned = true;
 2426       break;
 2427     default:
 2428       ShouldNotReachHere();
 2429     }
 2430 
 2431     __ align(CodeEntryAlignment);
 2432     StubCodeMark mark(this, stub_id);
 2433     address start = __ pc();
 2434 
 2435     BLOCK_COMMENT("Entry:");
 2436 
 2437     const Register to        = c_rarg0;  // destination array address
 2438     const Register value     = c_rarg1;  // value
 2439     const Register count     = c_rarg2;  // elements count
 2440 
 2441     const Register bz_base = r10;        // base for block_zero routine
 2442     const Register cnt_words = r11;      // temp register
 2443 
 2444     __ enter();
 2445 
 2446     Label L_fill_elements, L_exit1;
 2447 
 2448     int shift = -1;
 2449     switch (t) {
 2450       case T_BYTE:
 2451         shift = 0;
 2452         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2453         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2454         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2455         __ br(Assembler::LO, L_fill_elements);
 2456         break;
 2457       case T_SHORT:
 2458         shift = 1;
 2459         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2460         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2461         __ br(Assembler::LO, L_fill_elements);
 2462         break;
 2463       case T_INT:
 2464         shift = 2;
 2465         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2466         __ br(Assembler::LO, L_fill_elements);
 2467         break;
 2468       default: ShouldNotReachHere();
 2469     }
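          // At this point value holds the fill pattern replicated to 32 bits,
          // e.g. a T_BYTE value of 0xAB has become 0xABABABAB; it is widened
          // to 64 bits before the bulk loop below.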
 2470 
 2471     // Align the destination address to an 8-byte boundary.
 2472     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2473     if (!aligned) {
 2474       switch (t) {
 2475         case T_BYTE:
 2476           // One-byte misalignment happens only for byte arrays.
 2477           __ tbz(to, 0, L_skip_align1);
 2478           __ strb(value, Address(__ post(to, 1)));
 2479           __ subw(count, count, 1);
 2480           __ bind(L_skip_align1);
 2481           // Fallthrough
 2482         case T_SHORT:
 2483           // Two-byte misalignment happens only for byte and short (char) arrays.
 2484           __ tbz(to, 1, L_skip_align2);
 2485           __ strh(value, Address(__ post(to, 2)));
 2486           __ subw(count, count, 2 >> shift);
 2487           __ bind(L_skip_align2);
 2488           // Fallthrough
 2489         case T_INT:
 2490           // Align to 8 bytes; we know we are 4-byte aligned to start.
 2491           __ tbz(to, 2, L_skip_align4);
 2492           __ strw(value, Address(__ post(to, 4)));
 2493           __ subw(count, count, 4 >> shift);
 2494           __ bind(L_skip_align4);
 2495           break;
 2496         default: ShouldNotReachHere();
 2497       }
 2498     }
 2499 
 2500     //
 2501     //  Fill large chunks
 2502     //
 2503     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2504     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2505     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
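          // e.g. for T_SHORT (shift == 1): cnt_words == count >> 2, i.e. four
          // 16-bit elements per 8-byte word; the subw leaves the 0..3
          // trailing elements in count.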
 2506     if (UseBlockZeroing) {
 2507       Label non_block_zeroing, rest;
 2508       // If the fill value is zero we can use the fast zero_words().
 2509       __ cbnz(value, non_block_zeroing);
 2510       __ mov(bz_base, to);
 2511       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2512       address tpc = __ zero_words(bz_base, cnt_words);
 2513       if (tpc == nullptr) {
 2514         fatal("CodeCache is full at generate_fill");
 2515       }
 2516       __ b(rest);
 2517       __ bind(non_block_zeroing);
 2518       __ fill_words(to, cnt_words, value);
 2519       __ bind(rest);
 2520     } else {
 2521       __ fill_words(to, cnt_words, value);
 2522     }
 2523 
 2524     // Remaining count is less than 8 bytes. Fill it by a single store.
 2525     // Note that the total length is no less than 8 bytes.
 2526     if (t == T_BYTE || t == T_SHORT) {
 2527       Label L_exit1;
 2528       __ cbzw(count, L_exit1);
 2529       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2530       __ str(value, Address(to, -8));    // overwrite some elements
 2531       __ bind(L_exit1);
 2532       __ leave();
 2533       __ ret(lr);
 2534     }
 2535 
 2536     // Handle fills of less than 8 bytes, element by element.
 2537     Label L_fill_2, L_fill_4, L_exit2;
 2538     __ bind(L_fill_elements);
 2539     switch (t) {
 2540       case T_BYTE:
 2541         __ tbz(count, 0, L_fill_2);
 2542         __ strb(value, Address(__ post(to, 1)));
 2543         __ bind(L_fill_2);
 2544         __ tbz(count, 1, L_fill_4);
 2545         __ strh(value, Address(__ post(to, 2)));
 2546         __ bind(L_fill_4);
 2547         __ tbz(count, 2, L_exit2);
 2548         __ strw(value, Address(to));
 2549         break;
 2550       case T_SHORT:
 2551         __ tbz(count, 0, L_fill_4);
 2552         __ strh(value, Address(__ post(to, 2)));
 2553         __ bind(L_fill_4);
 2554         __ tbz(count, 1, L_exit2);
 2555         __ strw(value, Address(to));
 2556         break;
 2557       case T_INT:
 2558         __ cbzw(count, L_exit2);
 2559         __ strw(value, Address(to));
 2560         break;
 2561       default: ShouldNotReachHere();
 2562     }
 2563     __ bind(L_exit2);
 2564     __ leave();
 2565     __ ret(lr);
 2566     return start;
 2567   }
 2568 
 2569   address generate_data_cache_writeback() {
 2570     const Register line        = c_rarg0;  // address of line to write back
 2571 
 2572     __ align(CodeEntryAlignment);
 2573 
 2574     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id;
 2575     StubCodeMark mark(this, stub_id);
 2576 
 2577     address start = __ pc();
 2578     __ enter();
 2579     __ cache_wb(Address(line, 0));
 2580     __ leave();
 2581     __ ret(lr);
 2582 
 2583     return start;
 2584   }
 2585 
 2586   address generate_data_cache_writeback_sync() {
 2587     const Register is_pre     = c_rarg0;  // pre or post sync
 2588 
 2589     __ align(CodeEntryAlignment);
 2590 
 2591     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id;
 2592     StubCodeMark mark(this, stub_id);
 2593 
 2594     // pre wbsync is a no-op
 2595     // post wbsync translates to a store fence
 2596 
 2597     Label skip;
 2598     address start = __ pc();
 2599     __ enter();
 2600     __ cbnz(is_pre, skip);
 2601     __ cache_wbsync(false);
 2602     __ bind(skip);
 2603     __ leave();
 2604     __ ret(lr);
 2605 
 2606     return start;
 2607   }
 2608 
 2609   void generate_arraycopy_stubs() {
 2610     address entry;
 2611     address entry_jbyte_arraycopy;
 2612     address entry_jshort_arraycopy;
 2613     address entry_jint_arraycopy;
 2614     address entry_oop_arraycopy;
 2615     address entry_jlong_arraycopy;
 2616     address entry_checkcast_arraycopy;
 2617 
 2618     generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
 2619     generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
 2620 
 2621     generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
 2622     generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
 2623 
 2624     generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
 2625     generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
 2626 
 2627     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2628 
 2629     //*** jbyte
 2630     // Always need aligned and unaligned versions
 2631     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
 2632     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
 2633     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
 2634     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
 2635 
 2636     //*** jshort
 2637     // Always need aligned and unaligned versions
 2638     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
 2639     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
 2640     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
 2641     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
 2642 
 2643     //*** jint
 2644     // Aligned versions
 2645     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
 2646     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2647     // On 64-bit we need both aligned and unaligned versions of jint arraycopy;
 2648     // entry_jint_arraycopy always points to the unaligned version.
 2649     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
 2650     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2651 
 2652     //*** jlong
 2653     // It is always aligned
 2654     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
 2655     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
 2656     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2657     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2658 
 2659     //*** oops
 2660     {
 2661       // With compressed oops we need unaligned versions; notice that
 2662       // we overwrite entry_oop_arraycopy.
 2664 
 2665       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2666         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
 2667       StubRoutines::_arrayof_oop_arraycopy
 2668         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
 2669       // Aligned versions without pre-barriers
 2670       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2671         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
 2672       StubRoutines::_arrayof_oop_arraycopy_uninit
 2673         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
 2674     }
 2675 
 2676     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2677     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2678     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2679     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2680 
 2681     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 2682     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
 2683 
 2684     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 2685                                                               entry_jshort_arraycopy,
 2686                                                               entry_jint_arraycopy,
 2687                                                               entry_jlong_arraycopy);
 2688 
 2689     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 2690                                                                entry_jshort_arraycopy,
 2691                                                                entry_jint_arraycopy,
 2692                                                                entry_oop_arraycopy,
 2693                                                                entry_jlong_arraycopy,
 2694                                                                entry_checkcast_arraycopy);
 2695 
 2696     StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
 2697     StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
 2698     StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
 2699     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
 2700     StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
 2701     StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
 2702   }
 2703 
 2704   void generate_math_stubs() { Unimplemented(); }
 2705 
 2706   // Arguments:
 2707   //
 2708   // Inputs:
 2709   //   c_rarg0   - source byte array address
 2710   //   c_rarg1   - destination byte array address
 2711   //   c_rarg2   - K (key) in little endian int array
 2712   //
 2713   address generate_aescrypt_encryptBlock() {
          assert(UseAES, "need AES cryptographic extension support");
 2714     __ align(CodeEntryAlignment);
 2715     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
 2716     StubCodeMark mark(this, stub_id);
 2717 
 2718     const Register from        = c_rarg0;  // source array address
 2719     const Register to          = c_rarg1;  // destination array address
 2720     const Register key         = c_rarg2;  // key array address
 2721     const Register keylen      = rscratch1;
 2722 
 2723     address start = __ pc();
 2724     __ enter();
 2725 
 2726     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2727 
 2728     __ aesenc_loadkeys(key, keylen);
 2729     __ aesecb_encrypt(from, to, keylen);
 2730 
 2731     __ mov(r0, 0);
 2732 
 2733     __ leave();
 2734     __ ret(lr);
 2735 
 2736     return start;
 2737   }
 2738 
 2739   // Arguments:
 2740   //
 2741   // Inputs:
 2742   //   c_rarg0   - source byte array address
 2743   //   c_rarg1   - destination byte array address
 2744   //   c_rarg2   - K (key) in little endian int array
 2745   //
 2746   address generate_aescrypt_decryptBlock() {
 2747     assert(UseAES, "need AES cryptographic extension support");
 2748     __ align(CodeEntryAlignment);
 2749     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
 2750     StubCodeMark mark(this, stub_id);
 2751     Label L_doLast;
 2752 
 2753     const Register from        = c_rarg0;  // source array address
 2754     const Register to          = c_rarg1;  // destination array address
 2755     const Register key         = c_rarg2;  // key array address
 2756     const Register keylen      = rscratch1;
 2757 
 2758     address start = __ pc();
 2759     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2760 
 2761     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2762 
 2763     __ aesecb_decrypt(from, to, key, keylen);
 2764 
 2765     __ mov(r0, 0);
 2766 
 2767     __ leave();
 2768     __ ret(lr);
 2769 
 2770     return start;
 2771   }
 2772 
 2773   // Arguments:
 2774   //
 2775   // Inputs:
 2776   //   c_rarg0   - source byte array address
 2777   //   c_rarg1   - destination byte array address
 2778   //   c_rarg2   - K (key) in little endian int array
 2779   //   c_rarg3   - r vector byte array address
 2780   //   c_rarg4   - input length
 2781   //
 2782   // Output:
  //   r0        - input length
 2784   //
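  // Roughly, in C terms (a sketch only; the name and types are illustrative):
  //
  //   int cipherBlockChaining_encryptAESCrypt(const jbyte* from, jbyte* to,
  //                                           const jint* key, jbyte* rvec,
  //                                           int len);
  //
  // Each 16-byte block is XORed with the running r vector and then encrypted;
  // the result becomes the next r vector, and the original len is returned in r0.
  //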
 2785   address generate_cipherBlockChaining_encryptAESCrypt() {
 2786     assert(UseAES, "need AES cryptographic extension support");
 2787     __ align(CodeEntryAlignment);
 2788     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
 2789     StubCodeMark mark(this, stub_id);
 2790 
 2791     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2792 
 2793     const Register from        = c_rarg0;  // source array address
 2794     const Register to          = c_rarg1;  // destination array address
 2795     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init
                                           // vector, left holding the last encrypted block
 2798     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2799     const Register keylen      = rscratch1;
 2800 
 2801     address start = __ pc();
 2802 
 2803       __ enter();
 2804 
 2805       __ movw(rscratch2, len_reg);
 2806 
 2807       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2808 
 2809       __ ld1(v0, __ T16B, rvec);
 2810 
 2811       __ cmpw(keylen, 52);
 2812       __ br(Assembler::CC, L_loadkeys_44);
 2813       __ br(Assembler::EQ, L_loadkeys_52);
 2814 
 2815       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2816       __ rev32(v17, __ T16B, v17);
 2817       __ rev32(v18, __ T16B, v18);
 2818     __ BIND(L_loadkeys_52);
 2819       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2820       __ rev32(v19, __ T16B, v19);
 2821       __ rev32(v20, __ T16B, v20);
 2822     __ BIND(L_loadkeys_44);
 2823       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2824       __ rev32(v21, __ T16B, v21);
 2825       __ rev32(v22, __ T16B, v22);
 2826       __ rev32(v23, __ T16B, v23);
 2827       __ rev32(v24, __ T16B, v24);
 2828       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2829       __ rev32(v25, __ T16B, v25);
 2830       __ rev32(v26, __ T16B, v26);
 2831       __ rev32(v27, __ T16B, v27);
 2832       __ rev32(v28, __ T16B, v28);
 2833       __ ld1(v29, v30, v31, __ T16B, key);
 2834       __ rev32(v29, __ T16B, v29);
 2835       __ rev32(v30, __ T16B, v30);
 2836       __ rev32(v31, __ T16B, v31);
 2837 
 2838     __ BIND(L_aes_loop);
 2839       __ ld1(v1, __ T16B, __ post(from, 16));
 2840       __ eor(v0, __ T16B, v0, v1);
 2841 
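      // NB: the condition flags set by the keylen comparison above are
      // still live here; nothing in the loop body modifies them.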
 2842       __ br(Assembler::CC, L_rounds_44);
 2843       __ br(Assembler::EQ, L_rounds_52);
 2844 
 2845       __ aese(v0, v17); __ aesmc(v0, v0);
 2846       __ aese(v0, v18); __ aesmc(v0, v0);
 2847     __ BIND(L_rounds_52);
 2848       __ aese(v0, v19); __ aesmc(v0, v0);
 2849       __ aese(v0, v20); __ aesmc(v0, v0);
 2850     __ BIND(L_rounds_44);
 2851       __ aese(v0, v21); __ aesmc(v0, v0);
 2852       __ aese(v0, v22); __ aesmc(v0, v0);
 2853       __ aese(v0, v23); __ aesmc(v0, v0);
 2854       __ aese(v0, v24); __ aesmc(v0, v0);
 2855       __ aese(v0, v25); __ aesmc(v0, v0);
 2856       __ aese(v0, v26); __ aesmc(v0, v0);
 2857       __ aese(v0, v27); __ aesmc(v0, v0);
 2858       __ aese(v0, v28); __ aesmc(v0, v0);
 2859       __ aese(v0, v29); __ aesmc(v0, v0);
 2860       __ aese(v0, v30);
 2861       __ eor(v0, __ T16B, v0, v31);
 2862 
 2863       __ st1(v0, __ T16B, __ post(to, 16));
 2864 
 2865       __ subw(len_reg, len_reg, 16);
 2866       __ cbnzw(len_reg, L_aes_loop);
 2867 
 2868       __ st1(v0, __ T16B, rvec);
 2869 
 2870       __ mov(r0, rscratch2);
 2871 
 2872       __ leave();
 2873       __ ret(lr);
 2874 
    return start;
 2876   }
 2877 
 2878   // Arguments:
 2879   //
 2880   // Inputs:
 2881   //   c_rarg0   - source byte array address
 2882   //   c_rarg1   - destination byte array address
 2883   //   c_rarg2   - K (key) in little endian int array
 2884   //   c_rarg3   - r vector byte array address
 2885   //   c_rarg4   - input length
 2886   //
 2887   // Output:
 2888   //   r0        - input length
 2889   //
 2890   address generate_cipherBlockChaining_decryptAESCrypt() {
 2891     assert(UseAES, "need AES cryptographic extension support");
 2892     __ align(CodeEntryAlignment);
 2893     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
 2894     StubCodeMark mark(this, stub_id);
 2895 
 2896     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2897 
 2898     const Register from        = c_rarg0;  // source array address
 2899     const Register to          = c_rarg1;  // destination array address
 2900     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init
                                           // vector, left holding the last ciphertext block
 2903     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2904     const Register keylen      = rscratch1;
 2905 
 2906     address start = __ pc();
 2907 
 2908       __ enter();
 2909 
 2910       __ movw(rscratch2, len_reg);
 2911 
 2912       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2913 
 2914       __ ld1(v2, __ T16B, rvec);
 2915 
 2916       __ ld1(v31, __ T16B, __ post(key, 16));
 2917       __ rev32(v31, __ T16B, v31);
 2918 
 2919       __ cmpw(keylen, 52);
 2920       __ br(Assembler::CC, L_loadkeys_44);
 2921       __ br(Assembler::EQ, L_loadkeys_52);
 2922 
 2923       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2924       __ rev32(v17, __ T16B, v17);
 2925       __ rev32(v18, __ T16B, v18);
 2926     __ BIND(L_loadkeys_52);
 2927       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2928       __ rev32(v19, __ T16B, v19);
 2929       __ rev32(v20, __ T16B, v20);
 2930     __ BIND(L_loadkeys_44);
 2931       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2932       __ rev32(v21, __ T16B, v21);
 2933       __ rev32(v22, __ T16B, v22);
 2934       __ rev32(v23, __ T16B, v23);
 2935       __ rev32(v24, __ T16B, v24);
 2936       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2937       __ rev32(v25, __ T16B, v25);
 2938       __ rev32(v26, __ T16B, v26);
 2939       __ rev32(v27, __ T16B, v27);
 2940       __ rev32(v28, __ T16B, v28);
 2941       __ ld1(v29, v30, __ T16B, key);
 2942       __ rev32(v29, __ T16B, v29);
 2943       __ rev32(v30, __ T16B, v30);
 2944 
 2945     __ BIND(L_aes_loop);
 2946       __ ld1(v0, __ T16B, __ post(from, 16));
 2947       __ orr(v1, __ T16B, v0, v0);
 2948 
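      // NB: the condition flags set by the keylen comparison above are
      // still live here; nothing in the loop body modifies them.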
 2949       __ br(Assembler::CC, L_rounds_44);
 2950       __ br(Assembler::EQ, L_rounds_52);
 2951 
 2952       __ aesd(v0, v17); __ aesimc(v0, v0);
 2953       __ aesd(v0, v18); __ aesimc(v0, v0);
 2954     __ BIND(L_rounds_52);
 2955       __ aesd(v0, v19); __ aesimc(v0, v0);
 2956       __ aesd(v0, v20); __ aesimc(v0, v0);
 2957     __ BIND(L_rounds_44);
 2958       __ aesd(v0, v21); __ aesimc(v0, v0);
 2959       __ aesd(v0, v22); __ aesimc(v0, v0);
 2960       __ aesd(v0, v23); __ aesimc(v0, v0);
 2961       __ aesd(v0, v24); __ aesimc(v0, v0);
 2962       __ aesd(v0, v25); __ aesimc(v0, v0);
 2963       __ aesd(v0, v26); __ aesimc(v0, v0);
 2964       __ aesd(v0, v27); __ aesimc(v0, v0);
 2965       __ aesd(v0, v28); __ aesimc(v0, v0);
 2966       __ aesd(v0, v29); __ aesimc(v0, v0);
 2967       __ aesd(v0, v30);
 2968       __ eor(v0, __ T16B, v0, v31);
 2969       __ eor(v0, __ T16B, v0, v2);
 2970 
 2971       __ st1(v0, __ T16B, __ post(to, 16));
 2972       __ orr(v2, __ T16B, v1, v1);
 2973 
 2974       __ subw(len_reg, len_reg, 16);
 2975       __ cbnzw(len_reg, L_aes_loop);
 2976 
 2977       __ st1(v2, __ T16B, rvec);
 2978 
 2979       __ mov(r0, rscratch2);
 2980 
 2981       __ leave();
 2982       __ ret(lr);
 2983 
 2984     return start;
 2985   }
 2986 
  // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: 128-bit in (preserved); the least-significant 64-bit word
  // is in the upper dword of each vector.
  // inc (the 64-bit increment) is preserved; its lower dword must be zero.
  // Output: result
 2992   void be_add_128_64(FloatRegister result, FloatRegister in,
 2993                      FloatRegister inc, FloatRegister tmp) {
 2994     assert_different_registers(result, tmp, inc);
 2995 
 2996     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 2997                                            // input
    __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
 2999     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3000                                            // MSD == 0 (must be!) to LSD
 3001     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3002   }
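
  // Scalar equivalent of be_add_128_64, for clarity (a sketch; the vector code
  // performs this on a big-endian 128-bit value split across two dwords):
  //
  //   uint64_t lo = in_lo + inc;
  //   uint64_t hi = in_hi + (lo < inc);  // carry out of the low dword
  //
  // The cm(HI)/ext/subv sequence materializes the carry as an all-ones mask
  // (-1) in the overflowing lane, swaps it into the MSD lane, and subtracts it,
  // which adds 1 to the most-significant dword.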
 3003 
 3004   // CTR AES crypt.
 3005   // Arguments:
 3006   //
 3007   // Inputs:
 3008   //   c_rarg0   - source byte array address
 3009   //   c_rarg1   - destination byte array address
 3010   //   c_rarg2   - K (key) in little endian int array
 3011   //   c_rarg3   - counter vector byte array address
 3012   //   c_rarg4   - input length
 3013   //   c_rarg5   - saved encryptedCounter start
 3014   //   c_rarg6   - saved used length
 3015   //
 3016   // Output:
 3017   //   r0       - input length
 3018   //
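  // Roughly, in C terms (a sketch only; the name and types are illustrative):
  //
  //   int counterMode_AESCrypt(const jbyte* in, jbyte* out, const jint* key,
  //                            jbyte* counter, int len,
  //                            jbyte* saved_encrypted_ctr, int* used_ptr);
  //
  // The value returned in r0 is the original input length.
  //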
 3019   address generate_counterMode_AESCrypt() {
 3020     const Register in = c_rarg0;
 3021     const Register out = c_rarg1;
 3022     const Register key = c_rarg2;
 3023     const Register counter = c_rarg3;
 3024     const Register saved_len = c_rarg4, len = r10;
 3025     const Register saved_encrypted_ctr = c_rarg5;
 3026     const Register used_ptr = c_rarg6, used = r12;
 3027 
 3028     const Register offset = r7;
 3029     const Register keylen = r11;
 3030 
 3031     const unsigned char block_size = 16;
 3032     const int bulk_width = 4;
 3033     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3034     // performance with larger data sizes, but it also means that the
 3035     // fast path isn't used until you have at least 8 blocks, and up
    // to 127 bytes of data will be processed on the slow path. For
 3037     // that reason, and also so as not to blow away too much icache, 4
 3038     // blocks seems like a sensible compromise.
 3039 
 3040     // Algorithm:
 3041     //
 3042     //    if (len == 0) {
 3043     //        goto DONE;
 3044     //    }
 3045     //    int result = len;
 3046     //    do {
 3047     //        if (used >= blockSize) {
 3048     //            if (len >= bulk_width * blockSize) {
 3049     //                CTR_large_block();
 3050     //                if (len == 0)
 3051     //                    goto DONE;
 3052     //            }
 3053     //            for (;;) {
 3054     //                16ByteVector v0 = counter;
 3055     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3056     //                used = 0;
 3057     //                if (len < blockSize)
 3058     //                    break;    /* goto NEXT */
 3059     //                16ByteVector v1 = load16Bytes(in, offset);
 3060     //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(v1, out, offset);
 3062     //                used = blockSize;
 3063     //                offset += blockSize;
 3064     //                len -= blockSize;
 3065     //                if (len == 0)
 3066     //                    goto DONE;
 3067     //            }
 3068     //        }
 3069     //      NEXT:
 3070     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3071     //        len--;
 3072     //    } while (len != 0);
 3073     //  DONE:
 3074     //    return result;
 3075     //
 3076     // CTR_large_block()
 3077     //    Wide bulk encryption of whole blocks.
 3078 
 3079     __ align(CodeEntryAlignment);
 3080     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
 3081     StubCodeMark mark(this, stub_id);
 3082     const address start = __ pc();
 3083     __ enter();
 3084 
 3085     Label DONE, CTR_large_block, large_block_return;
 3086     __ ldrw(used, Address(used_ptr));
 3087     __ cbzw(saved_len, DONE);
 3088 
 3089     __ mov(len, saved_len);
 3090     __ mov(offset, 0);
 3091 
 3092     // Compute #rounds for AES based on the length of the key array
 3093     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3094 
 3095     __ aesenc_loadkeys(key, keylen);
 3096 
 3097     {
 3098       Label L_CTR_loop, NEXT;
 3099 
 3100       __ bind(L_CTR_loop);
 3101 
 3102       __ cmp(used, block_size);
 3103       __ br(__ LO, NEXT);
 3104 
 3105       // Maybe we have a lot of data
 3106       __ subsw(rscratch1, len, bulk_width * block_size);
 3107       __ br(__ HS, CTR_large_block);
 3108       __ BIND(large_block_return);
 3109       __ cbzw(len, DONE);
 3110 
 3111       // Setup the counter
 3112       __ movi(v4, __ T4S, 0);
 3113       __ movi(v5, __ T4S, 1);
 3114       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3115 
 3116       // 128-bit big-endian increment
 3117       __ ld1(v0, __ T16B, counter);
 3118       __ rev64(v16, __ T16B, v0);
 3119       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3120       __ rev64(v16, __ T16B, v16);
 3121       __ st1(v16, __ T16B, counter);
 3122       // Previous counter value is in v0
 3123       // v4 contains { 0, 1 }
 3124 
 3125       {
 3126         // We have fewer than bulk_width blocks of data left. Encrypt
 3127         // them one by one until there is less than a full block
 3128         // remaining, being careful to save both the encrypted counter
 3129         // and the counter.
 3130 
 3131         Label inner_loop;
 3132         __ bind(inner_loop);
 3133         // Counter to encrypt is in v0
 3134         __ aesecb_encrypt(noreg, noreg, keylen);
 3135         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3136 
 3137         // Do we have a remaining full block?
 3138 
 3139         __ mov(used, 0);
 3140         __ cmp(len, block_size);
 3141         __ br(__ LO, NEXT);
 3142 
 3143         // Yes, we have a full block
 3144         __ ldrq(v1, Address(in, offset));
 3145         __ eor(v1, __ T16B, v1, v0);
 3146         __ strq(v1, Address(out, offset));
 3147         __ mov(used, block_size);
 3148         __ add(offset, offset, block_size);
 3149 
 3150         __ subw(len, len, block_size);
 3151         __ cbzw(len, DONE);
 3152 
 3153         // Increment the counter, store it back
 3154         __ orr(v0, __ T16B, v16, v16);
 3155         __ rev64(v16, __ T16B, v16);
 3156         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3157         __ rev64(v16, __ T16B, v16);
 3158         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3159 
 3160         __ b(inner_loop);
 3161       }
 3162 
 3163       __ BIND(NEXT);
 3164 
 3165       // Encrypt a single byte, and loop.
 3166       // We expect this to be a rare event.
 3167       __ ldrb(rscratch1, Address(in, offset));
 3168       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3169       __ eor(rscratch1, rscratch1, rscratch2);
 3170       __ strb(rscratch1, Address(out, offset));
 3171       __ add(offset, offset, 1);
 3172       __ add(used, used, 1);
      __ subw(len, len, 1);
 3174       __ cbnzw(len, L_CTR_loop);
 3175     }
 3176 
 3177     __ bind(DONE);
 3178     __ strw(used, Address(used_ptr));
 3179     __ mov(r0, saved_len);
 3180 
 3181     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3182     __ ret(lr);
 3183 
 3184     // Bulk encryption
 3185 
    __ BIND(CTR_large_block);
 3187     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3188 
 3189     if (bulk_width == 8) {
 3190       __ sub(sp, sp, 4 * 16);
 3191       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3192     }
 3193     __ sub(sp, sp, 4 * 16);
 3194     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3195     RegSet saved_regs = (RegSet::of(in, out, offset)
 3196                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3197     __ push(saved_regs, sp);
 3198     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3199     __ add(in, in, offset);
 3200     __ add(out, out, offset);
 3201 
 3202     // Keys should already be loaded into the correct registers
 3203 
 3204     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3205     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3206 
 3207     // AES/CTR loop
 3208     {
 3209       Label L_CTR_loop;
 3210       __ BIND(L_CTR_loop);
 3211 
 3212       // Setup the counters
 3213       __ movi(v8, __ T4S, 0);
 3214       __ movi(v9, __ T4S, 1);
 3215       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3216 
 3217       for (int i = 0; i < bulk_width; i++) {
 3218         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3219         __ rev64(v0_ofs, __ T16B, v16);
 3220         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3221       }
 3222 
 3223       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3224 
 3225       // Encrypt the counters
 3226       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3227 
 3228       if (bulk_width == 8) {
 3229         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3230       }
 3231 
 3232       // XOR the encrypted counters with the inputs
 3233       for (int i = 0; i < bulk_width; i++) {
 3234         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3235         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3236         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3237       }
 3238 
 3239       // Write the encrypted data
 3240       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3241       if (bulk_width == 8) {
 3242         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3243       }
 3244 
 3245       __ subw(len, len, 16 * bulk_width);
 3246       __ cbnzw(len, L_CTR_loop);
 3247     }
 3248 
 3249     // Save the counter back where it goes
 3250     __ rev64(v16, __ T16B, v16);
 3251     __ st1(v16, __ T16B, counter);
 3252 
 3253     __ pop(saved_regs, sp);
 3254 
 3255     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3256     if (bulk_width == 8) {
 3257       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3258     }
 3259 
 3260     __ andr(rscratch1, len, -16 * bulk_width);
 3261     __ sub(len, len, rscratch1);
 3262     __ add(offset, offset, rscratch1);
 3263     __ mov(used, 16);
 3264     __ strw(used, Address(used_ptr));
 3265     __ b(large_block_return);
 3266 
 3267     return start;
 3268   }
 3269 
 3270   // Vector AES Galois Counter Mode implementation. Parameters:
 3271   //
 3272   // in = c_rarg0
 3273   // len = c_rarg1
 3274   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3275   // out = c_rarg3
 3276   // key = c_rarg4
 3277   // state = c_rarg5 - GHASH.state
 3278   // subkeyHtbl = c_rarg6 - powers of H
 3279   // counter = c_rarg7 - 16 bytes of CTR
 3280   // return - number of processed bytes
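  //
  // Roughly, in C terms (a sketch only; the name and types are illustrative):
  //
  //   int galoisCounterMode_AESCrypt(const jbyte* in, int len, const jbyte* ct,
  //                                  jbyte* out, const jint* key, jlong* state,
  //                                  jlong* subkeyHtbl, jbyte* counter);
  //
  // Only whole multiples of 8 blocks (128 bytes) are processed here; the
  // rounded-down byte count is returned and the caller handles any tail.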
 3281   address generate_galoisCounterMode_AESCrypt() {
 3282     address ghash_polynomial = __ pc();
 3283     __ emit_int64(0x87);  // The low-order bits of the field
 3284                           // polynomial (i.e. p = z^7+z^2+z+1)
 3285                           // repeated in the low and high parts of a
 3286                           // 128-bit vector
 3287     __ emit_int64(0x87);
 3288 
 3289     __ align(CodeEntryAlignment);
 3290     StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id;
 3291     StubCodeMark mark(this, stub_id);
 3292     address start = __ pc();
 3293     __ enter();
 3294 
 3295     const Register in = c_rarg0;
 3296     const Register len = c_rarg1;
 3297     const Register ct = c_rarg2;
    const Register out = c_rarg3;
 3300 
 3301     const Register key = c_rarg4;
 3302     const Register state = c_rarg5;
 3303 
 3304     const Register subkeyHtbl = c_rarg6;
 3305 
    const Register counter = c_rarg7;      // 16-byte CTR value; read here and
                                           // updated with the incremented counter
                                           // at the end
 3307 
 3308     const Register keylen = r10;
 3309     // Save state before entering routine
 3310     __ sub(sp, sp, 4 * 16);
 3311     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3312     __ sub(sp, sp, 4 * 16);
 3313     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3314 
 3316     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3317     __ str(len, __ pre(sp, -2 * wordSize));
 3318 
 3319     Label DONE;
 3320     __ cbz(len, DONE);
 3321 
 3322     // Compute #rounds for AES based on the length of the key array
 3323     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3324 
 3325     __ aesenc_loadkeys(key, keylen);
 3326     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3327     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3328 
 3329     // AES/CTR loop
 3330     {
 3331       Label L_CTR_loop;
 3332       __ BIND(L_CTR_loop);
 3333 
 3334       // Setup the counters
 3335       __ movi(v8, __ T4S, 0);
 3336       __ movi(v9, __ T4S, 1);
 3337       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3338 
 3339       assert(v0->encoding() < v8->encoding(), "");
 3340       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3341         FloatRegister f = as_FloatRegister(i);
 3342         __ rev32(f, __ T16B, v16);
 3343         __ addv(v16, __ T4S, v16, v8);
 3344       }
 3345 
 3346       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3347 
 3348       // Encrypt the counters
 3349       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3350 
 3351       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3352 
 3353       // XOR the encrypted counters with the inputs
 3354       for (int i = 0; i < 8; i++) {
 3355         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3356         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3357         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3358       }
 3359       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3360       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3361 
 3362       __ subw(len, len, 16 * 8);
 3363       __ cbnzw(len, L_CTR_loop);
 3364     }
 3365 
 3366     __ rev32(v16, __ T16B, v16);
 3367     __ st1(v16, __ T16B, counter);
 3368 
 3369     __ ldr(len, Address(sp));
 3370     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3371 
 3372     // GHASH/CTR loop
 3373     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3374                                 len, /*unrolls*/4);
 3375 
 3376 #ifdef ASSERT
    {
      Label L;
      __ cmp(len, (unsigned char)0);
      __ br(Assembler::EQ, L);
      __ stop("stubGenerator: abort");
      __ bind(L);
    }
 3383 #endif
 3384 
    __ bind(DONE);
 3386     // Return the number of bytes processed
 3387     __ ldr(r0, __ post(sp, 2 * wordSize));
 3388 
 3389     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3390     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3391 
 3392     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3393     __ ret(lr);
    return start;
 3395   }
 3396 
 3397   class Cached64Bytes {
 3398   private:
 3399     MacroAssembler *_masm;
 3400     Register _regs[8];
 3401 
 3402   public:
 3403     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "%u registers are used to cache 64 bytes (16 4-byte words)", rs.size());
 3405       auto it = rs.begin();
 3406       for (auto &r: _regs) {
 3407         r = *it;
 3408         ++it;
 3409       }
 3410     }
 3411 
 3412     void gen_loads(Register base) {
 3413       for (int i = 0; i < 8; i += 2) {
 3414         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3415       }
 3416     }
 3417 
 3418     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3419     void extract_u32(Register dest, int i) {
 3420       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3421     }
 3422   };
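
  // Usage sketch: a 64-byte message block is pulled into eight 64-bit registers
  // once per iteration, and each 32-bit word is then extracted with a single
  // ubfx instead of a memory load. For example, word 5 lives in bits [32, 64)
  // of _regs[2]:
  //
  //   Cached64Bytes cache(_masm, regs);  // regs: a RegSet of 8 registers
  //   cache.gen_loads(buf);              // 4 x ldp: 64 bytes -> 8 registers
  //   cache.extract_u32(dst, 5);         // ubfx dst, _regs[2], #32, #32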
 3423 
  // Utility routines for md5.
  // Clobbers rscratch1, rscratch2, r10 and r11.
 3426   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3427               int k, int s, int t) {
 3428     Register rscratch3 = r10;
 3429     Register rscratch4 = r11;
 3430 
 3431     __ eorw(rscratch3, r3, r4);
 3432     __ movw(rscratch2, t);
 3433     __ andw(rscratch3, rscratch3, r2);
 3434     __ addw(rscratch4, r1, rscratch2);
 3435     reg_cache.extract_u32(rscratch1, k);
 3436     __ eorw(rscratch3, rscratch3, r4);
 3437     __ addw(rscratch4, rscratch4, rscratch1);
 3438     __ addw(rscratch3, rscratch3, rscratch4);
 3439     __ rorw(rscratch2, rscratch3, 32 - s);
 3440     __ addw(r1, rscratch2, r2);
 3441   }
 3442 
 3443   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3444               int k, int s, int t) {
 3445     Register rscratch3 = r10;
 3446     Register rscratch4 = r11;
 3447 
 3448     reg_cache.extract_u32(rscratch1, k);
 3449     __ movw(rscratch2, t);
 3450     __ addw(rscratch4, r1, rscratch2);
 3451     __ addw(rscratch4, rscratch4, rscratch1);
 3452     __ bicw(rscratch2, r3, r4);
 3453     __ andw(rscratch3, r2, r4);
 3454     __ addw(rscratch2, rscratch2, rscratch4);
 3455     __ addw(rscratch2, rscratch2, rscratch3);
 3456     __ rorw(rscratch2, rscratch2, 32 - s);
 3457     __ addw(r1, rscratch2, r2);
 3458   }
 3459 
 3460   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3461               int k, int s, int t) {
 3462     Register rscratch3 = r10;
 3463     Register rscratch4 = r11;
 3464 
 3465     __ eorw(rscratch3, r3, r4);
 3466     __ movw(rscratch2, t);
 3467     __ addw(rscratch4, r1, rscratch2);
 3468     reg_cache.extract_u32(rscratch1, k);
 3469     __ eorw(rscratch3, rscratch3, r2);
 3470     __ addw(rscratch4, rscratch4, rscratch1);
 3471     __ addw(rscratch3, rscratch3, rscratch4);
 3472     __ rorw(rscratch2, rscratch3, 32 - s);
 3473     __ addw(r1, rscratch2, r2);
 3474   }
 3475 
 3476   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3477               int k, int s, int t) {
 3478     Register rscratch3 = r10;
 3479     Register rscratch4 = r11;
 3480 
 3481     __ movw(rscratch3, t);
 3482     __ ornw(rscratch2, r2, r4);
 3483     __ addw(rscratch4, r1, rscratch3);
 3484     reg_cache.extract_u32(rscratch1, k);
 3485     __ eorw(rscratch3, rscratch2, r3);
 3486     __ addw(rscratch4, rscratch4, rscratch1);
 3487     __ addw(rscratch3, rscratch3, rscratch4);
 3488     __ rorw(rscratch2, rscratch3, 32 - s);
 3489     __ addw(r1, rscratch2, r2);
 3490   }
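
  // For reference (RFC 1321), each helper above computes one MD5 step:
  //
  //   FF: a = b + rol(a + F(b,c,d) + X[k] + t, s),  F(x,y,z) = (x & y) | (~x & z)
  //   GG: a = b + rol(a + G(b,c,d) + X[k] + t, s),  G(x,y,z) = (x & z) | (y & ~z)
  //   HH: a = b + rol(a + H(b,c,d) + X[k] + t, s),  H(x,y,z) = x ^ y ^ z
  //   II: a = b + rol(a + I(b,c,d) + X[k] + t, s),  I(x,y,z) = y ^ (x | ~z)
  //
  // rol by s is expressed in the code as ror by (32 - s); X[k] comes from the
  // register cache and t is the per-step additive constant.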
 3491 
 3492   // Arguments:
 3493   //
 3494   // Inputs:
 3495   //   c_rarg0   - byte[]  source+offset
 3496   //   c_rarg1   - int[]   SHA.state
 3497   //   c_rarg2   - int     offset
 3498   //   c_rarg3   - int     limit
 3499   //
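  // Roughly, in C terms (a sketch only; the name and types are illustrative):
  //
  //   int md5_implCompress(jbyte* buf, jint* state, int ofs, int limit);
  //
  // The multi-block variant keeps consuming 64-byte blocks while ofs <= limit
  // and returns the updated ofs.
  //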
 3500   address generate_md5_implCompress(StubGenStubId stub_id) {
 3501     bool multi_block;
 3502     switch (stub_id) {
 3503     case md5_implCompress_id:
 3504       multi_block = false;
 3505       break;
 3506     case md5_implCompressMB_id:
 3507       multi_block = true;
 3508       break;
 3509     default:
 3510       ShouldNotReachHere();
 3511     }
 3512     __ align(CodeEntryAlignment);
 3513 
 3514     StubCodeMark mark(this, stub_id);
 3515     address start = __ pc();
 3516 
 3517     Register buf       = c_rarg0;
 3518     Register state     = c_rarg1;
 3519     Register ofs       = c_rarg2;
 3520     Register limit     = c_rarg3;
 3521     Register a         = r4;
 3522     Register b         = r5;
 3523     Register c         = r6;
 3524     Register d         = r7;
 3525     Register rscratch3 = r10;
 3526     Register rscratch4 = r11;
 3527 
 3528     Register state_regs[2] = { r12, r13 };
 3529     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3530     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3531 
 3532     __ push(saved_regs, sp);
 3533 
 3534     __ ldp(state_regs[0], state_regs[1], Address(state));
 3535     __ ubfx(a, state_regs[0],  0, 32);
 3536     __ ubfx(b, state_regs[0], 32, 32);
 3537     __ ubfx(c, state_regs[1],  0, 32);
 3538     __ ubfx(d, state_regs[1], 32, 32);
 3539 
 3540     Label md5_loop;
 3541     __ BIND(md5_loop);
 3542 
 3543     reg_cache.gen_loads(buf);
 3544 
 3545     // Round 1
 3546     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3547     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3548     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3549     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3550     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3551     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3552     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3553     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3554     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3555     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3556     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3557     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3558     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3559     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3560     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3561     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3562 
 3563     // Round 2
 3564     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3565     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3566     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3567     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3568     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3569     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3570     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3571     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3572     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3573     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3574     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3575     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3576     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3577     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3578     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3579     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3580 
 3581     // Round 3
 3582     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3583     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3584     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3585     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3586     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3587     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3588     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3589     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3590     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3591     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3592     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3593     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3594     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3595     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3596     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3597     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3598 
 3599     // Round 4
 3600     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3601     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3602     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3603     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3604     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3605     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3606     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3607     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3608     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3609     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3610     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3611     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3612     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3613     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3614     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3615     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3616 
 3617     __ addw(a, state_regs[0], a);
 3618     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3619     __ addw(b, rscratch2, b);
 3620     __ addw(c, state_regs[1], c);
 3621     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3622     __ addw(d, rscratch4, d);
 3623 
 3624     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3625     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3626 
 3627     if (multi_block) {
 3628       __ add(buf, buf, 64);
 3629       __ add(ofs, ofs, 64);
 3630       __ cmp(ofs, limit);
 3631       __ br(Assembler::LE, md5_loop);
 3632       __ mov(c_rarg0, ofs); // return ofs
 3633     }
 3634 
 3635     // write hash values back in the correct order
 3636     __ stp(state_regs[0], state_regs[1], Address(state));
 3637 
 3638     __ pop(saved_regs, sp);
 3639 
 3640     __ ret(lr);
 3641 
 3642     return start;
 3643   }
 3644 
 3645   // Arguments:
 3646   //
 3647   // Inputs:
 3648   //   c_rarg0   - byte[]  source+offset
 3649   //   c_rarg1   - int[]   SHA.state
 3650   //   c_rarg2   - int     offset
 3651   //   c_rarg3   - int     limit
 3652   //
 3653   address generate_sha1_implCompress(StubGenStubId stub_id) {
 3654     bool multi_block;
 3655     switch (stub_id) {
 3656     case sha1_implCompress_id:
 3657       multi_block = false;
 3658       break;
 3659     case sha1_implCompressMB_id:
 3660       multi_block = true;
 3661       break;
 3662     default:
 3663       ShouldNotReachHere();
 3664     }
 3665 
 3666     __ align(CodeEntryAlignment);
 3667 
 3668     StubCodeMark mark(this, stub_id);
 3669     address start = __ pc();
 3670 
 3671     Register buf   = c_rarg0;
 3672     Register state = c_rarg1;
 3673     Register ofs   = c_rarg2;
 3674     Register limit = c_rarg3;
 3675 
 3676     Label keys;
 3677     Label sha1_loop;
 3678 
 3679     // load the keys into v0..v3
 3680     __ adr(rscratch1, keys);
 3681     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3682     // load 5 words state into v6, v7
 3683     __ ldrq(v6, Address(state, 0));
 3684     __ ldrs(v7, Address(state, 16));
 3685 
 3687     __ BIND(sha1_loop);
 3688     // load 64 bytes of data into v16..v19
 3689     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3690     __ rev32(v16, __ T16B, v16);
 3691     __ rev32(v17, __ T16B, v17);
 3692     __ rev32(v18, __ T16B, v18);
 3693     __ rev32(v19, __ T16B, v19);
 3694 
 3695     // do the sha1
 3696     __ addv(v4, __ T4S, v16, v0);
 3697     __ orr(v20, __ T16B, v6, v6);
 3698 
 3699     FloatRegister d0 = v16;
 3700     FloatRegister d1 = v17;
 3701     FloatRegister d2 = v18;
 3702     FloatRegister d3 = v19;
 3703 
 3704     for (int round = 0; round < 20; round++) {
 3705       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3706       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3707       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3708       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3709       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3710 
 3711       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3712       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3713       __ sha1h(tmp2, __ T4S, v20);
 3714       if (round < 5)
 3715         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3716       else if (round < 10 || round >= 15)
 3717         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3718       else
 3719         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3720       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3721 
 3722       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3723     }
 3724 
 3725     __ addv(v7, __ T2S, v7, v21);
 3726     __ addv(v6, __ T4S, v6, v20);
 3727 
 3728     if (multi_block) {
 3729       __ add(ofs, ofs, 64);
 3730       __ cmp(ofs, limit);
 3731       __ br(Assembler::LE, sha1_loop);
 3732       __ mov(c_rarg0, ofs); // return ofs
 3733     }
 3734 
 3735     __ strq(v6, Address(state, 0));
 3736     __ strs(v7, Address(state, 16));
 3737 
 3738     __ ret(lr);
 3739 
 3740     __ bind(keys);
 3741     __ emit_int32(0x5a827999);
 3742     __ emit_int32(0x6ed9eba1);
 3743     __ emit_int32(0x8f1bbcdc);
 3744     __ emit_int32(0xca62c1d6);
 3745 
 3746     return start;
 3747   }
 3748 
 3750   // Arguments:
 3751   //
 3752   // Inputs:
 3753   //   c_rarg0   - byte[]  source+offset
 3754   //   c_rarg1   - int[]   SHA.state
 3755   //   c_rarg2   - int     offset
 3756   //   c_rarg3   - int     limit
 3757   //
 3758   address generate_sha256_implCompress(StubGenStubId stub_id) {
 3759     bool multi_block;
 3760     switch (stub_id) {
 3761     case sha256_implCompress_id:
 3762       multi_block = false;
 3763       break;
 3764     case sha256_implCompressMB_id:
 3765       multi_block = true;
 3766       break;
 3767     default:
 3768       ShouldNotReachHere();
 3769     }
 3770 
 3771     static const uint32_t round_consts[64] = {
 3772       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3773       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3774       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3775       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3776       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3777       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3778       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3779       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3780       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3781       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3782       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3783       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3784       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3785       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3786       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3787       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3788     };
 3789 
 3790     __ align(CodeEntryAlignment);
 3791 
 3792     StubCodeMark mark(this, stub_id);
 3793     address start = __ pc();
 3794 
 3795     Register buf   = c_rarg0;
 3796     Register state = c_rarg1;
 3797     Register ofs   = c_rarg2;
 3798     Register limit = c_rarg3;
 3799 
    Label sha256_loop;
 3801 
 3802     __ stpd(v8, v9, __ pre(sp, -32));
 3803     __ stpd(v10, v11, Address(sp, 16));
 3804 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7
 3812 
    // load the 64 round constants into v16..v31 (16 vectors of 4 words)
 3814     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3815     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3816     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3817     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3818     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3819 
 3820     // load 8 words (256 bits) state
 3821     __ ldpq(v0, v1, state);
 3822 
    __ BIND(sha256_loop);
 3824     // load 64 bytes of data into v8..v11
 3825     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3826     __ rev32(v8, __ T16B, v8);
 3827     __ rev32(v9, __ T16B, v9);
 3828     __ rev32(v10, __ T16B, v10);
 3829     __ rev32(v11, __ T16B, v11);
 3830 
 3831     __ addv(v6, __ T4S, v8, v16);
 3832     __ orr(v2, __ T16B, v0, v0);
 3833     __ orr(v3, __ T16B, v1, v1);
 3834 
 3835     FloatRegister d0 = v8;
 3836     FloatRegister d1 = v9;
 3837     FloatRegister d2 = v10;
 3838     FloatRegister d3 = v11;
 3839 
 3840 
 3841     for (int round = 0; round < 16; round++) {
 3842       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 3843       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 3844       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 3845       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 3846 
 3847       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 3848        __ orr(v4, __ T16B, v2, v2);
 3849       if (round < 15)
 3850         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 3851       __ sha256h(v2, __ T4S, v3, tmp2);
 3852       __ sha256h2(v3, __ T4S, v4, tmp2);
 3853       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 3854 
 3855       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3856     }
 3857 
 3858     __ addv(v0, __ T4S, v0, v2);
 3859     __ addv(v1, __ T4S, v1, v3);
 3860 
 3861     if (multi_block) {
 3862       __ add(ofs, ofs, 64);
 3863       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
 3865       __ mov(c_rarg0, ofs); // return ofs
 3866     }
 3867 
 3868     __ ldpd(v10, v11, Address(sp, 16));
 3869     __ ldpd(v8, v9, __ post(sp, 32));
 3870 
 3871     __ stpq(v0, v1, state);
 3872 
 3873     __ ret(lr);
 3874 
 3875     return start;
 3876   }
 3877 
  // Double rounds for sha512. Each call performs two of the 80 rounds;
  // dr is the double-round index (0..39), and the round constants are
  // streamed in via rscratch2.
 3879   void sha512_dround(int dr,
 3880                      FloatRegister vi0, FloatRegister vi1,
 3881                      FloatRegister vi2, FloatRegister vi3,
 3882                      FloatRegister vi4, FloatRegister vrc0,
 3883                      FloatRegister vrc1, FloatRegister vin0,
 3884                      FloatRegister vin1, FloatRegister vin2,
 3885                      FloatRegister vin3, FloatRegister vin4) {
 3886       if (dr < 36) {
 3887         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 3888       }
 3889       __ addv(v5, __ T2D, vrc0, vin0);
 3890       __ ext(v6, __ T16B, vi2, vi3, 8);
 3891       __ ext(v5, __ T16B, v5, v5, 8);
 3892       __ ext(v7, __ T16B, vi1, vi2, 8);
 3893       __ addv(vi3, __ T2D, vi3, v5);
 3894       if (dr < 32) {
 3895         __ ext(v5, __ T16B, vin3, vin4, 8);
 3896         __ sha512su0(vin0, __ T2D, vin1);
 3897       }
 3898       __ sha512h(vi3, __ T2D, v6, v7);
 3899       if (dr < 32) {
 3900         __ sha512su1(vin0, __ T2D, vin2, v5);
 3901       }
 3902       __ addv(vi4, __ T2D, vi1, vi3);
 3903       __ sha512h2(vi3, __ T2D, vi1, vi0);
 3904   }
 3905 
 3906   // Arguments:
 3907   //
 3908   // Inputs:
 3909   //   c_rarg0   - byte[]  source+offset
 3910   //   c_rarg1   - int[]   SHA.state
 3911   //   c_rarg2   - int     offset
 3912   //   c_rarg3   - int     limit
 3913   //
 3914   address generate_sha512_implCompress(StubGenStubId stub_id) {
 3915     bool multi_block;
 3916     switch (stub_id) {
 3917     case sha512_implCompress_id:
 3918       multi_block = false;
 3919       break;
 3920     case sha512_implCompressMB_id:
 3921       multi_block = true;
 3922       break;
 3923     default:
 3924       ShouldNotReachHere();
 3925     }
 3926 
 3927     static const uint64_t round_consts[80] = {
 3928       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 3929       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 3930       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 3931       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 3932       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 3933       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 3934       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 3935       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 3936       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 3937       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 3938       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 3939       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 3940       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 3941       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 3942       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 3943       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 3944       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 3945       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 3946       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 3947       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 3948       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 3949       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 3950       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 3951       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 3952       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 3953       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 3954       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 3955     };
 3956 
 3957     __ align(CodeEntryAlignment);
 3958 
 3959     StubCodeMark mark(this, stub_id);
 3960     address start = __ pc();
 3961 
 3962     Register buf   = c_rarg0;
 3963     Register state = c_rarg1;
 3964     Register ofs   = c_rarg2;
 3965     Register limit = c_rarg3;
 3966 
 3967     __ stpd(v8, v9, __ pre(sp, -64));
 3968     __ stpd(v10, v11, Address(sp, 16));
 3969     __ stpd(v12, v13, Address(sp, 32));
 3970     __ stpd(v14, v15, Address(sp, 48));
 3971 
 3972     Label sha512_loop;
 3973 
 3974     // load state
 3975     __ ld1(v8, v9, v10, v11, __ T2D, state);
 3976 
    // load the first 8 round constants (4 vectors)
 3978     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3979     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 3980 
 3981     __ BIND(sha512_loop);
 3982     // load 128B of data into v12..v19
 3983     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 3984     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 3985     __ rev64(v12, __ T16B, v12);
 3986     __ rev64(v13, __ T16B, v13);
 3987     __ rev64(v14, __ T16B, v14);
 3988     __ rev64(v15, __ T16B, v15);
 3989     __ rev64(v16, __ T16B, v16);
 3990     __ rev64(v17, __ T16B, v17);
 3991     __ rev64(v18, __ T16B, v18);
 3992     __ rev64(v19, __ T16B, v19);
 3993 
 3994     __ mov(rscratch2, rscratch1);
 3995 
 3996     __ mov(v0, __ T16B, v8);
 3997     __ mov(v1, __ T16B, v9);
 3998     __ mov(v2, __ T16B, v10);
 3999     __ mov(v3, __ T16B, v11);
 4000 
 4001     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4002     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4003     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4004     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4005     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4006     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4007     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4008     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4009     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4010     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4011     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4012     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4013     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4014     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4015     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4016     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4017     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4018     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4019     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4020     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4021     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4022     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4023     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4024     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4025     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4026     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4027     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4028     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4029     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4030     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4031     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4032     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4033     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4034     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4035     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4036     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4037     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4038     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4039     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4040     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4041 
 4042     __ addv(v8, __ T2D, v8, v0);
 4043     __ addv(v9, __ T2D, v9, v1);
 4044     __ addv(v10, __ T2D, v10, v2);
 4045     __ addv(v11, __ T2D, v11, v3);
 4046 
 4047     if (multi_block) {
 4048       __ add(ofs, ofs, 128);
 4049       __ cmp(ofs, limit);
 4050       __ br(Assembler::LE, sha512_loop);
 4051       __ mov(c_rarg0, ofs); // return ofs
 4052     }
 4053 
 4054     __ st1(v8, v9, v10, v11, __ T2D, state);
 4055 
 4056     __ ldpd(v14, v15, Address(sp, 48));
 4057     __ ldpd(v12, v13, Address(sp, 32));
 4058     __ ldpd(v10, v11, Address(sp, 16));
 4059     __ ldpd(v8, v9, __ post(sp, 64));
 4060 
 4061     __ ret(lr);
 4062 
 4063     return start;
 4064   }
 4065 
 4066   // Execute one round of keccak of two computations in parallel.
 4067   // One of the states should be loaded into the lower halves of
 4068   // the vector registers v0-v24, the other should be loaded into
 4069   // the upper halves of those registers. The ld1r instruction loads
 4070   // the round constant into both halves of register v31.
 4071   // Intermediate results c0...c5 and d0...d5 are computed
 4072   // in registers v25...v30.
 4073   // All vector instructions that are used operate on both register
 4074   // halves in parallel.
  // If only a single computation is needed, it suffices to load just the
  // lower halves.
 4076   void keccak_round(Register rscratch1) {
 4077   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4080   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4081   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4082   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4083   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4084   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4085   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4086   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4087 
 4088   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4089   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4090   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4091   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4092   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4093 
 4094   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4095   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4097   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4098   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4099   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4100   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4101   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4102   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4103   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4104   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4105   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4106   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4107   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4108   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4109   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4110   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4111   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4112   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4113   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4114   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4115   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4116   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4117   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4118   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4119 
 4120   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4121   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4122   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4123   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4124   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4125 
 4126   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4127 
 4128   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4129   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4130   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4131   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4132   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4133 
 4134   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4135   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4136   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4137   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4138   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4139 
 4140   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4141   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4142   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4143   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4144   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4145 
 4146   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4147   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4148   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4149   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4150   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4151 
 4152   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4153   }
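
         // For reference, one round of Keccak-f[1600] on the 5x5 lane state
         // a[x][y] looks as follows in scalar form (a minimal sketch of the
         // theta/rho/pi/chi/iota steps that the vector code above fuses;
         // rol is a 64-bit left rotation, r[x][y] the rotation offsets):
         //
         //   for (x = 0; x < 5; x++)                       // theta
         //     c[x] = a[x][0] ^ a[x][1] ^ a[x][2] ^ a[x][3] ^ a[x][4];
         //   for (x = 0; x < 5; x++)
         //     d[x] = c[(x + 4) % 5] ^ rol(c[(x + 1) % 5], 1);
         //   for (x = 0; x < 5; x++)
         //     for (y = 0; y < 5; y++)                     // rho and pi
         //       b[y][(2 * x + 3 * y) % 5] = rol(a[x][y] ^ d[x], r[x][y]);
         //   for (x = 0; x < 5; x++)
         //     for (y = 0; y < 5; y++)                     // chi
         //       a[x][y] = b[x][y] ^ (~b[(x + 1) % 5][y] & b[(x + 2) % 5][y]);
         //   a[0][0] ^= round_constants[i];                // iota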
 4154 
 4155   // Arguments:
 4156   //
 4157   // Inputs:
 4158   //   c_rarg0   - byte[]  source+offset
 4159   //   c_rarg1   - byte[]  SHA.state
 4160   //   c_rarg2   - int     block_size
 4161   //   c_rarg3   - int     offset
 4162   //   c_rarg4   - int     limit
 4163   //
 4164   address generate_sha3_implCompress(StubGenStubId stub_id) {
 4165     bool multi_block;
 4166     switch (stub_id) {
 4167     case sha3_implCompress_id:
 4168       multi_block = false;
 4169       break;
 4170     case sha3_implCompressMB_id:
 4171       multi_block = true;
 4172       break;
 4173     default:
 4174       ShouldNotReachHere();
 4175     }
 4176 
 4177     static const uint64_t round_consts[24] = {
 4178       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4179       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4180       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4181       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4182       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4183       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4184       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4185       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4186     };
 4187 
 4188     __ align(CodeEntryAlignment);
 4189 
 4190     StubCodeMark mark(this, stub_id);
 4191     address start = __ pc();
 4192 
 4193     Register buf           = c_rarg0;
 4194     Register state         = c_rarg1;
 4195     Register block_size    = c_rarg2;
 4196     Register ofs           = c_rarg3;
 4197     Register limit         = c_rarg4;
 4198 
 4199     Label sha3_loop, rounds24_loop;
 4200     Label sha3_512_or_sha3_384, shake128;
 4201 
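           // save callee-saved registers (the lower halves of v8-v15)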
 4202     __ stpd(v8, v9, __ pre(sp, -64));
 4203     __ stpd(v10, v11, Address(sp, 16));
 4204     __ stpd(v12, v13, Address(sp, 32));
 4205     __ stpd(v14, v15, Address(sp, 48));
 4206 
 4207     // load state
 4208     __ add(rscratch1, state, 32);
 4209     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4210     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4211     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4212     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4213     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4214     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4215     __ ld1(v24, __ T1D, rscratch1);
 4216 
 4217     __ BIND(sha3_loop);
 4218 
 4219     // 24 keccak rounds
 4220     __ movw(rscratch2, 24);
 4221 
 4222     // load round_constants base
 4223     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4224 
 4225     // load input
 4226     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4227     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4228     __ eor(v0, __ T8B, v0, v25);
 4229     __ eor(v1, __ T8B, v1, v26);
 4230     __ eor(v2, __ T8B, v2, v27);
 4231     __ eor(v3, __ T8B, v3, v28);
 4232     __ eor(v4, __ T8B, v4, v29);
 4233     __ eor(v5, __ T8B, v5, v30);
 4234     __ eor(v6, __ T8B, v6, v31);
 4235 
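           // For reference, the byte rates (block_size) distinguished below,
           // and the bits of block_size the dispatch tests (a summary; note
           // that SHA3-256 and SHAKE256 share the same rate):
           //   SHA3-512   72 = 0b01001000   bit 7 == 0, bit 5 == 0
           //   SHA3-384  104 = 0b01101000   bit 7 == 0, bit 5 == 1
           //   SHA3-256/
           //   SHAKE256  136 = 0b10001000   bits 4 and 5 == 0
           //   SHA3-224  144 = 0b10010000   bit 4 == 1, bit 5 == 0
           //   SHAKE128  168 = 0b10101000   bit 5 == 1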
 4236     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4237     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4238 
 4239     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4240     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4241     __ eor(v7, __ T8B, v7, v25);
 4242     __ eor(v8, __ T8B, v8, v26);
 4243     __ eor(v9, __ T8B, v9, v27);
 4244     __ eor(v10, __ T8B, v10, v28);
 4245     __ eor(v11, __ T8B, v11, v29);
 4246     __ eor(v12, __ T8B, v12, v30);
 4247     __ eor(v13, __ T8B, v13, v31);
 4248 
 4249     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4250     __ eor(v14, __ T8B, v14, v25);
 4251     __ eor(v15, __ T8B, v15, v26);
 4252     __ eor(v16, __ T8B, v16, v27);
 4253 
 4254     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4255     __ andw(c_rarg5, block_size, 48);
 4256     __ cbzw(c_rarg5, rounds24_loop);
 4257 
 4258     __ tbnz(block_size, 5, shake128);
 4259     // block_size == 144, bit5 == 0, SHA3-224
 4260     __ ldrd(v28, __ post(buf, 8));
 4261     __ eor(v17, __ T8B, v17, v28);
 4262     __ b(rounds24_loop);
 4263 
 4264     __ BIND(shake128);
 4265     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4266     __ eor(v17, __ T8B, v17, v28);
 4267     __ eor(v18, __ T8B, v18, v29);
 4268     __ eor(v19, __ T8B, v19, v30);
 4269     __ eor(v20, __ T8B, v20, v31);
 4270     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4271 
 4272     __ BIND(sha3_512_or_sha3_384);
 4273     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4274     __ eor(v7, __ T8B, v7, v25);
 4275     __ eor(v8, __ T8B, v8, v26);
 4276     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4277 
 4278     // SHA3-384
 4279     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4280     __ eor(v9,  __ T8B, v9,  v27);
 4281     __ eor(v10, __ T8B, v10, v28);
 4282     __ eor(v11, __ T8B, v11, v29);
 4283     __ eor(v12, __ T8B, v12, v30);
 4284 
 4285     __ BIND(rounds24_loop);
 4286     __ subw(rscratch2, rscratch2, 1);
 4287 
 4288     keccak_round(rscratch1);
 4289 
 4290     __ cbnzw(rscratch2, rounds24_loop);
 4291 
 4292     if (multi_block) {
 4293       __ add(ofs, ofs, block_size);
 4294       __ cmp(ofs, limit);
 4295       __ br(Assembler::LE, sha3_loop);
 4296       __ mov(c_rarg0, ofs); // return ofs
 4297     }
 4298 
 4299     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4300     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4301     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4302     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4303     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4304     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4305     __ st1(v24, __ T1D, state);
 4306 
 4307     // restore callee-saved registers
 4308     __ ldpd(v14, v15, Address(sp, 48));
 4309     __ ldpd(v12, v13, Address(sp, 32));
 4310     __ ldpd(v10, v11, Address(sp, 16));
 4311     __ ldpd(v8, v9, __ post(sp, 64));
 4312 
 4313     __ ret(lr);
 4314 
 4315     return start;
 4316   }
 4317 
 4318   // Inputs:
 4319   //   c_rarg0   - long[]  state0
 4320   //   c_rarg1   - long[]  state1
 4321   address generate_double_keccak() {
 4322     static const uint64_t round_consts[24] = {
 4323       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4324       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4325       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4326       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4327       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4328       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4329       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4330       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4331     };
 4332 
 4333     // Implements the double_keccak() method of the
  4334     // sun.security.provider.SHA3Parallel class
 4335     __ align(CodeEntryAlignment);
 4336     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4337     address start = __ pc();
 4338     __ enter();
 4339 
 4340     Register state0        = c_rarg0;
 4341     Register state1        = c_rarg1;
 4342 
 4343     Label rounds24_loop;
 4344 
 4345     // save callee-saved registers
 4346     __ stpd(v8, v9, __ pre(sp, -64));
 4347     __ stpd(v10, v11, Address(sp, 16));
 4348     __ stpd(v12, v13, Address(sp, 32));
 4349     __ stpd(v14, v15, Address(sp, 48));
 4350 
 4351     // load states
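           // Lane-interleaved layout: the D-lane-indexed ld4/st4 below place
           // state0 in the lower and state1 in the upper 64-bit halves of
           // v0-v24, so the vector operations in keccak_round() advance both
           // permutations at once.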
 4352     __ add(rscratch1, state0, 32);
 4353     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4354     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4355     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4356     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4357     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4358     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4359     __ ld1(v24, __ D, 0, rscratch1);
 4360     __ add(rscratch1, state1, 32);
 4361     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4362     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4363     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4364     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4365     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4366     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4367     __ ld1(v24, __ D, 1, rscratch1);
 4368 
 4369     // 24 keccak rounds
 4370     __ movw(rscratch2, 24);
 4371 
 4372     // load round_constants base
 4373     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4374 
 4375     __ BIND(rounds24_loop);
 4376     __ subw(rscratch2, rscratch2, 1);
 4377     keccak_round(rscratch1);
 4378     __ cbnzw(rscratch2, rounds24_loop);
 4379 
 4380     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4381     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4382     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4383     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4384     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4385     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4386     __ st1(v24, __ D, 0, state0);
 4387     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4388     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4389     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4390     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4391     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4392     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4393     __ st1(v24, __ D, 1, state1);
 4394 
 4395     // restore callee-saved vector registers
 4396     __ ldpd(v14, v15, Address(sp, 48));
 4397     __ ldpd(v12, v13, Address(sp, 32));
 4398     __ ldpd(v10, v11, Address(sp, 16));
 4399     __ ldpd(v8, v9, __ post(sp, 64));
 4400 
 4401     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4402     __ mov(r0, zr); // return 0
 4403     __ ret(lr);
 4404 
 4405     return start;
 4406   }
 4407 
 4408   /**
 4409    *  Arguments:
 4410    *
 4411    * Inputs:
 4412    *   c_rarg0   - int crc
 4413    *   c_rarg1   - byte* buf
 4414    *   c_rarg2   - int length
 4415    *
 4416    * Output:
  4417    *       r0    - int crc result
 4418    */
 4419   address generate_updateBytesCRC32() {
 4420     assert(UseCRC32Intrinsics, "what are we doing here?");
 4421 
 4422     __ align(CodeEntryAlignment);
 4423     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
 4424     StubCodeMark mark(this, stub_id);
 4425 
 4426     address start = __ pc();
 4427 
 4428     const Register crc   = c_rarg0;  // crc
 4429     const Register buf   = c_rarg1;  // source java byte array address
 4430     const Register len   = c_rarg2;  // length
 4431     const Register table0 = c_rarg3; // crc_table address
 4432     const Register table1 = c_rarg4;
 4433     const Register table2 = c_rarg5;
 4434     const Register table3 = c_rarg6;
 4435     const Register tmp3 = c_rarg7;
 4436 
 4437     BLOCK_COMMENT("Entry:");
 4438     __ enter(); // required for proper stackwalking of RuntimeStub frame
 4439 
 4440     __ kernel_crc32(crc, buf, len,
 4441               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 4442 
 4443     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4444     __ ret(lr);
 4445 
 4446     return start;
 4447   }
 4448 
 4449   // ChaCha20 block function.  This version parallelizes 4 quarter
 4450   // round operations at a time.  It uses 16 SIMD registers to
 4451   // produce 4 blocks of key stream.
 4452   //
 4453   // state (int[16]) = c_rarg0
 4454   // keystream (byte[256]) = c_rarg1
 4455   // return - number of bytes of keystream (always 256)
 4456   //
 4457   // In this approach, we load the 512-bit start state sequentially into
 4458   // 4 128-bit vectors.  We then make 4 4-vector copies of that starting
 4459   // state, with each successive set of 4 vectors having a +1 added into
 4460   // the first 32-bit lane of the 4th vector in that group (the counter).
 4461   // By doing this, we can perform the block function on 4 512-bit blocks
 4462   // within one run of this intrinsic.
 4463   // The alignment of the data across the 4-vector group is such that at
 4464   // the start it is already aligned for the first round of each two-round
 4465   // loop iteration.  In other words, the corresponding lanes of each vector
 4466   // will contain the values needed for that quarter round operation (e.g.
 4467   // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.).
 4468   // In between each full round, a lane shift must occur.  Within a loop
 4469   // iteration, between the first and second rounds, the 2nd, 3rd, and 4th
 4470   // vectors are rotated left 32, 64 and 96 bits, respectively.  The result
 4471   // is effectively a diagonal orientation in columnar form.  After the
 4472   // second full round, those registers are left-rotated again, this time
 4473   // 96, 64, and 32 bits - returning the vectors to their columnar organization.
 4474   // After all 10 iterations, the original state is added to each 4-vector
 4475   // working state along with the add mask, and the 4 vector groups are
 4476   // sequentially written to the memory dedicated for the output key stream.
 4477   //
 4478   // For a more detailed explanation, see Goll and Gueron, "Vectorization of
 4479   // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology:
 4480   // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33
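         //
         // For reference, the scalar quarter round that each cc20_quarter_round()
         // call below applies lane-wise to four columns at once (a minimal sketch;
         // a/b/c/d are 32-bit state words, rol a 32-bit left rotation):
         //
         //   a += b; d ^= a; d = rol(d, 16);
         //   c += d; b ^= c; b = rol(b, 12);
         //   a += b; d ^= a; d = rol(d,  8);
         //   c += d; b ^= c; b = rol(b,  7);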
 4481   address generate_chacha20Block_qrpar() {
 4482     Label L_Q_twoRounds, L_Q_cc20_const;
 4483     // The constant data is broken into two 128-bit segments to be loaded
 4484     // onto SIMD registers.  The first 128 bits are a counter add overlay
 4485     // that adds +1/+0/+0/+0 to the vectors holding replicated state[12].
  4486     // The second 128 bits are a table constant used for 8-bit left rotations
  4487     // on 32-bit lanes within a SIMD register.
 4488     __ BIND(L_Q_cc20_const);
 4489     __ emit_int64(0x0000000000000001UL);
 4490     __ emit_int64(0x0000000000000000UL);
 4491     __ emit_int64(0x0605040702010003UL);
 4492     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4493 
 4494     __ align(CodeEntryAlignment);
 4495     StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
 4496     StubCodeMark mark(this, stub_id);
 4497     address start = __ pc();
 4498     __ enter();
 4499 
 4500     const Register state = c_rarg0;
 4501     const Register keystream = c_rarg1;
 4502     const Register loopCtr = r10;
 4503     const Register tmpAddr = r11;
 4504 
 4505     const FloatRegister aState = v0;
 4506     const FloatRegister bState = v1;
 4507     const FloatRegister cState = v2;
 4508     const FloatRegister dState = v3;
 4509     const FloatRegister a1Vec = v4;
 4510     const FloatRegister b1Vec = v5;
 4511     const FloatRegister c1Vec = v6;
 4512     const FloatRegister d1Vec = v7;
 4513     // Skip the callee-saved registers v8 - v15
 4514     const FloatRegister a2Vec = v16;
 4515     const FloatRegister b2Vec = v17;
 4516     const FloatRegister c2Vec = v18;
 4517     const FloatRegister d2Vec = v19;
 4518     const FloatRegister a3Vec = v20;
 4519     const FloatRegister b3Vec = v21;
 4520     const FloatRegister c3Vec = v22;
 4521     const FloatRegister d3Vec = v23;
 4522     const FloatRegister a4Vec = v24;
 4523     const FloatRegister b4Vec = v25;
 4524     const FloatRegister c4Vec = v26;
 4525     const FloatRegister d4Vec = v27;
 4526     const FloatRegister scratch = v28;
 4527     const FloatRegister addMask = v29;
 4528     const FloatRegister lrot8Tbl = v30;
 4529 
 4530     // Load the initial state in the first 4 quadword registers,
 4531     // then copy the initial state into the next 4 quadword registers
 4532     // that will be used for the working state.
 4533     __ ld1(aState, bState, cState, dState, __ T16B, Address(state));
 4534 
 4535     // Load the index register for 2 constant 128-bit data fields.
 4536     // The first represents the +1/+0/+0/+0 add mask.  The second is
 4537     // the 8-bit left rotation.
 4538     __ adr(tmpAddr, L_Q_cc20_const);
 4539     __ ldpq(addMask, lrot8Tbl, Address(tmpAddr));
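           // (Rotating a 32-bit lane left by 8 is a pure byte permutation,
           // so a single tbl shuffle through lrot8Tbl can stand in for the
           // usual shift/shift/orr sequence.)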
 4540 
 4541     __ mov(a1Vec, __ T16B, aState);
 4542     __ mov(b1Vec, __ T16B, bState);
 4543     __ mov(c1Vec, __ T16B, cState);
 4544     __ mov(d1Vec, __ T16B, dState);
 4545 
 4546     __ mov(a2Vec, __ T16B, aState);
 4547     __ mov(b2Vec, __ T16B, bState);
 4548     __ mov(c2Vec, __ T16B, cState);
 4549     __ addv(d2Vec, __ T4S, d1Vec, addMask);
 4550 
 4551     __ mov(a3Vec, __ T16B, aState);
 4552     __ mov(b3Vec, __ T16B, bState);
 4553     __ mov(c3Vec, __ T16B, cState);
 4554     __ addv(d3Vec, __ T4S, d2Vec, addMask);
 4555 
 4556     __ mov(a4Vec, __ T16B, aState);
 4557     __ mov(b4Vec, __ T16B, bState);
 4558     __ mov(c4Vec, __ T16B, cState);
 4559     __ addv(d4Vec, __ T4S, d3Vec, addMask);
 4560 
 4561     // Set up the 10 iteration loop
 4562     __ mov(loopCtr, 10);
 4563     __ BIND(L_Q_twoRounds);
 4564 
 4565     // The first set of operations on the vectors covers the first 4 quarter
 4566     // round operations:
 4567     //  Qround(state, 0, 4, 8,12)
 4568     //  Qround(state, 1, 5, 9,13)
 4569     //  Qround(state, 2, 6,10,14)
 4570     //  Qround(state, 3, 7,11,15)
 4571     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
 4572     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
 4573     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
 4574     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
 4575 
 4576     // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to
 4577     // diagonals. The a1Vec does not need to change orientation.
 4578     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true);
 4579     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true);
 4580     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true);
 4581     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true);
 4582 
 4583     // The second set of operations on the vectors covers the second 4 quarter
 4584     // round operations, now acting on the diagonals:
 4585     //  Qround(state, 0, 5,10,15)
 4586     //  Qround(state, 1, 6,11,12)
 4587     //  Qround(state, 2, 7, 8,13)
 4588     //  Qround(state, 3, 4, 9,14)
 4589     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
 4590     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
 4591     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
 4592     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
 4593 
 4594     // Before we start the next iteration, we need to perform shuffles
 4595     // on the b/c/d vectors to move them back to columnar organizations
 4596     // from their current diagonal orientation.
 4597     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false);
 4598     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false);
 4599     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false);
 4600     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false);
 4601 
 4602     // Decrement and iterate
 4603     __ sub(loopCtr, loopCtr, 1);
 4604     __ cbnz(loopCtr, L_Q_twoRounds);
 4605 
 4606     // Once the counter reaches zero, we fall out of the loop
 4607     // and need to add the initial state back into the working state
 4608     // represented by the a/b/c/d1Vec registers.  This is destructive
  4609     // on the dState register but we will no longer need it.
 4610     __ addv(a1Vec, __ T4S, a1Vec, aState);
 4611     __ addv(b1Vec, __ T4S, b1Vec, bState);
 4612     __ addv(c1Vec, __ T4S, c1Vec, cState);
 4613     __ addv(d1Vec, __ T4S, d1Vec, dState);
 4614 
 4615     __ addv(a2Vec, __ T4S, a2Vec, aState);
 4616     __ addv(b2Vec, __ T4S, b2Vec, bState);
 4617     __ addv(c2Vec, __ T4S, c2Vec, cState);
 4618     __ addv(dState, __ T4S, dState, addMask);
 4619     __ addv(d2Vec, __ T4S, d2Vec, dState);
 4620 
 4621     __ addv(a3Vec, __ T4S, a3Vec, aState);
 4622     __ addv(b3Vec, __ T4S, b3Vec, bState);
 4623     __ addv(c3Vec, __ T4S, c3Vec, cState);
 4624     __ addv(dState, __ T4S, dState, addMask);
 4625     __ addv(d3Vec, __ T4S, d3Vec, dState);
 4626 
 4627     __ addv(a4Vec, __ T4S, a4Vec, aState);
 4628     __ addv(b4Vec, __ T4S, b4Vec, bState);
 4629     __ addv(c4Vec, __ T4S, c4Vec, cState);
 4630     __ addv(dState, __ T4S, dState, addMask);
 4631     __ addv(d4Vec, __ T4S, d4Vec, dState);
 4632 
 4633     // Write the final state back to the result buffer
 4634     __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64));
 4635     __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64));
 4636     __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64));
 4637     __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64));
 4638 
 4639     __ mov(r0, 256);             // Return length of output keystream
 4640     __ leave();
 4641     __ ret(lr);
 4642 
 4643     return start;
 4644   }
 4645 
 4646   void dilithium_load16zetas(int o0, Register zetas) {
 4647     __ ldpq(as_FloatRegister(o0), as_FloatRegister(o0 + 1), __ post (zetas, 32));
 4648     __ ldpq(as_FloatRegister(o0 + 2), as_FloatRegister(o0 + 3), __ post (zetas, 32));
 4650   }
 4651 
 4652   void dilithium_load32zetas(Register zetas) {
 4653     dilithium_load16zetas(16, zetas);
 4654     dilithium_load16zetas(20, zetas);
 4655   }
 4656 
 4657   // 2x16 32-bit Montgomery multiplications in parallel
 4658   // See the montMul() method of the sun.security.provider.ML_DSA class.
 4659   // Here MONT_R_BITS is 32, so the right shift by it is implicit.
 4660   // The constants qInv = MONT_Q_INV_MOD_R and q = MONT_Q are loaded in
 4661   // (all 32-bit chunks of) vector registers v30 and v31, resp.
  4662   // The inputs are the b[i]s in v0-v7 and the c[i]s in v16-v23, and
  4663   // the results are the a[i]s in v16-v23, four 32-bit values in each
  4664   // register: a_i = b_i * c_i * 2^-32 mod MONT_Q for all i.
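         //
         // For reference, the scalar equivalent of one lane (a minimal sketch,
         // with q = MONT_Q, qinv = MONT_Q_INV_MOD_R and R = 2^32):
         //
         //   int montMul(int b, int c) {
         //     long a = (long) b * c;                    // 64-bit product
         //     int m = (int) a * qinv;                   // m = aLow * qinv mod R
         //     return (int) ((a - (long) m * q) >> 32);  // (a - m * q) / R
         //   }
         //
         // The vector code below computes the same value as
         // (hi32(2*b*c) - hi32(2*m*q)) / 2, using sqdmulh for the doubled
         // high halves and shsubv for the halving subtraction.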
 4665   void dilithium_montmul32(bool by_constant) {
 4666     FloatRegister vr0 = by_constant ? v29 : v0;
 4667     FloatRegister vr1 = by_constant ? v29 : v1;
 4668     FloatRegister vr2 = by_constant ? v29 : v2;
 4669     FloatRegister vr3 = by_constant ? v29 : v3;
 4670     FloatRegister vr4 = by_constant ? v29 : v4;
 4671     FloatRegister vr5 = by_constant ? v29 : v5;
 4672     FloatRegister vr6 = by_constant ? v29 : v6;
 4673     FloatRegister vr7 = by_constant ? v29 : v7;
 4674 
 4675     __ sqdmulh(v24, __ T4S, vr0, v16); // aHigh = hi32(2 * b * c)
 4676     __ mulv(v16, __ T4S, vr0, v16);    // aLow = lo32(b * c)
 4677     __ sqdmulh(v25, __ T4S, vr1, v17);
 4678     __ mulv(v17, __ T4S, vr1, v17);
 4679     __ sqdmulh(v26, __ T4S, vr2, v18);
 4680     __ mulv(v18, __ T4S, vr2, v18);
 4681     __ sqdmulh(v27, __ T4S, vr3, v19);
 4682     __ mulv(v19, __ T4S, vr3, v19);
 4683 
 4684     __ mulv(v16, __ T4S, v16, v30);     // m = aLow * qinv
 4685     __ mulv(v17, __ T4S, v17, v30);
 4686     __ mulv(v18, __ T4S, v18, v30);
 4687     __ mulv(v19, __ T4S, v19, v30);
 4688 
 4689     __ sqdmulh(v16, __ T4S, v16, v31);  // n = hi32(2 * m * q)
 4690     __ sqdmulh(v17, __ T4S, v17, v31);
 4691     __ sqdmulh(v18, __ T4S, v18, v31);
 4692     __ sqdmulh(v19, __ T4S, v19, v31);
 4693 
 4694     __ shsubv(v16, __ T4S, v24, v16);   // a = (aHigh - n) / 2
 4695     __ shsubv(v17, __ T4S, v25, v17);
 4696     __ shsubv(v18, __ T4S, v26, v18);
 4697     __ shsubv(v19, __ T4S, v27, v19);
 4698 
 4699     __ sqdmulh(v24, __ T4S, vr4, v20);
 4700     __ mulv(v20, __ T4S, vr4, v20);
 4701     __ sqdmulh(v25, __ T4S, vr5, v21);
 4702     __ mulv(v21, __ T4S, vr5, v21);
 4703     __ sqdmulh(v26, __ T4S, vr6, v22);
 4704     __ mulv(v22, __ T4S, vr6, v22);
 4705     __ sqdmulh(v27, __ T4S, vr7, v23);
 4706     __ mulv(v23, __ T4S, vr7, v23);
 4707 
 4708     __ mulv(v20, __ T4S, v20, v30);
 4709     __ mulv(v21, __ T4S, v21, v30);
 4710     __ mulv(v22, __ T4S, v22, v30);
 4711     __ mulv(v23, __ T4S, v23, v30);
 4712 
 4713     __ sqdmulh(v20, __ T4S, v20, v31);
 4714     __ sqdmulh(v21, __ T4S, v21, v31);
 4715     __ sqdmulh(v22, __ T4S, v22, v31);
 4716     __ sqdmulh(v23, __ T4S, v23, v31);
 4717 
 4718     __ shsubv(v20, __ T4S, v24, v20);
 4719     __ shsubv(v21, __ T4S, v25, v21);
 4720     __ shsubv(v22, __ T4S, v26, v22);
 4721     __ shsubv(v23, __ T4S, v27, v23);
 4722   }
 4723 
  4724   // Do the additions and subtractions found in the NTT algorithm.
  4725   // See sun.security.provider.ML_DSA.implDilithiumAlmostNttJava()
 4726   void dilithium_add_sub32() {
 4727     __ addv(v24, __ T4S, v0, v16); // coeffs[j] = coeffs[j] + tmp;
 4728     __ addv(v25, __ T4S, v1, v17);
 4729     __ addv(v26, __ T4S, v2, v18);
 4730     __ addv(v27, __ T4S, v3, v19);
 4731     __ addv(v28, __ T4S, v4, v20);
 4732     __ addv(v29, __ T4S, v5, v21);
 4733     __ addv(v30, __ T4S, v6, v22);
 4734     __ addv(v31, __ T4S, v7, v23);
 4735 
 4736     __ subv(v0, __ T4S, v0, v16);  // coeffs[j + l] = coeffs[j] - tmp;
 4737     __ subv(v1, __ T4S, v1, v17);
 4738     __ subv(v2, __ T4S, v2, v18);
 4739     __ subv(v3, __ T4S, v3, v19);
 4740     __ subv(v4, __ T4S, v4, v20);
 4741     __ subv(v5, __ T4S, v5, v21);
 4742     __ subv(v6, __ T4S, v6, v22);
 4743     __ subv(v7, __ T4S, v7, v23);
 4744   }
 4745 
  4746   // Do the same computation that
  4747   // dilithium_montmul32() and dilithium_add_sub32() do,
  4748   // except on only 4x4 32-bit vector elements and with
  4749   // different register usage.
 4750   void dilithium_montmul_sub_add16() {
 4751     __ sqdmulh(v24, __ T4S, v1, v16);
 4752     __ mulv(v16, __ T4S, v1, v16);
 4753     __ sqdmulh(v25, __ T4S, v3, v17);
 4754     __ mulv(v17, __ T4S, v3, v17);
 4755     __ sqdmulh(v26, __ T4S, v5, v18);
 4756     __ mulv(v18, __ T4S, v5, v18);
 4757     __ sqdmulh(v27, __ T4S, v7, v19);
 4758     __ mulv(v19, __ T4S, v7, v19);
 4759 
 4760     __ mulv(v16, __ T4S, v16, v30);
 4761     __ mulv(v17, __ T4S, v17, v30);
 4762     __ mulv(v18, __ T4S, v18, v30);
 4763     __ mulv(v19, __ T4S, v19, v30);
 4764 
 4765     __ sqdmulh(v16, __ T4S, v16, v31);
 4766     __ sqdmulh(v17, __ T4S, v17, v31);
 4767     __ sqdmulh(v18, __ T4S, v18, v31);
 4768     __ sqdmulh(v19, __ T4S, v19, v31);
 4769 
 4770     __ shsubv(v16, __ T4S, v24, v16);
 4771     __ shsubv(v17, __ T4S, v25, v17);
 4772     __ shsubv(v18, __ T4S, v26, v18);
 4773     __ shsubv(v19, __ T4S, v27, v19);
 4774 
 4775     __ subv(v1, __ T4S, v0, v16);
 4776     __ subv(v3, __ T4S, v2, v17);
 4777     __ subv(v5, __ T4S, v4, v18);
 4778     __ subv(v7, __ T4S, v6, v19);
 4779 
 4780     __ addv(v0, __ T4S, v0, v16);
 4781     __ addv(v2, __ T4S, v2, v17);
 4782     __ addv(v4, __ T4S, v4, v18);
 4783     __ addv(v6, __ T4S, v6, v19);
 4784   }
 4785 
 4786   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 4787   // in the Java implementation come in sequences of at least 8, so we
 4788   // can use ldpq to collect the corresponding data into pairs of vector
 4789   // registers.
 4790   // We collect the coefficients corresponding to the 'j+l' indexes into
  4791   // the vector registers v0-v7, the zetas into the vector registers v16-v23,
 4792   // then we do the (Montgomery) multiplications by the zetas in parallel
 4793   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 4794   // v0-v7, then do the additions into v24-v31 and the subtractions into
 4795   // v0-v7 and finally save the results back to the coeffs array.
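         //
         // For orientation, the Java loop being vectorized has roughly this
         // shape (a sketch; the names follow the comments in this file, see
         // implDilithiumAlmostNttJava() for the authoritative version):
         //
         //   for (int l = 128; l > 0; l /= 2)           // one level per l
         //     for (int s = 0; s < 256; s += 2 * l)
         //       for (int j = s; j < s + l; j++) {
         //         int tmp = montMul(MONT_ZETAS_FOR_NTT[m++], coeffs[j + l]);
         //         coeffs[j + l] = coeffs[j] - tmp;
         //         coeffs[j] = coeffs[j] + tmp;
         //       }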
 4796   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 4797     const Register coeffs, const Register zetas) {
 4798     int c1 = 0;
 4799     int c2 = 512;
 4800     int startIncr;
 4801     int incr1 = 32;
 4802     int incr2 = 64;
 4803     int incr3 = 96;
 4804 
 4805     for (int level = 0; level < 5; level++) {
 4806       int c1Start = c1;
 4807       int c2Start = c2;
 4808       if (level == 3) {
 4809         incr1 = 32;
 4810         incr2 = 128;
 4811         incr3 = 160;
 4812       } else if (level == 4) {
 4813         incr1 = 64;
 4814         incr2 = 128;
 4815         incr3 = 192;
 4816       }
 4817 
 4818       for (int i = 0; i < 4; i++) {
 4819         __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q
 4820         __ ldpq(v0, v1, Address(coeffs, c2Start));
 4821         __ ldpq(v2, v3, Address(coeffs, c2Start + incr1));
 4822         __ ldpq(v4, v5, Address(coeffs, c2Start + incr2));
 4823         __ ldpq(v6, v7, Address(coeffs, c2Start + incr3));
 4824         dilithium_load32zetas(zetas);
 4825         dilithium_montmul32(false);
 4826         __ ldpq(v0, v1, Address(coeffs, c1Start));
 4827         __ ldpq(v2, v3, Address(coeffs, c1Start + incr1));
 4828         __ ldpq(v4, v5, Address(coeffs, c1Start + incr2));
 4829         __ ldpq(v6, v7, Address(coeffs, c1Start + incr3));
 4830         dilithium_add_sub32();
 4831         __ stpq(v24, v25, Address(coeffs, c1Start));
 4832         __ stpq(v26, v27, Address(coeffs, c1Start + incr1));
 4833         __ stpq(v28, v29, Address(coeffs, c1Start + incr2));
 4834         __ stpq(v30, v31, Address(coeffs, c1Start + incr3));
 4835         __ stpq(v0, v1, Address(coeffs, c2Start));
 4836         __ stpq(v2, v3, Address(coeffs, c2Start + incr1));
 4837         __ stpq(v4, v5, Address(coeffs, c2Start + incr2));
 4838         __ stpq(v6, v7, Address(coeffs, c2Start + incr3));
 4839 
 4840         int k = 4 * level + i;
 4841 
 4842         if (k > 7) {
 4843           startIncr = 256;
 4844         } else if (k == 5) {
 4845           startIncr = 384;
 4846         } else {
 4847           startIncr = 128;
 4848         }
 4849 
 4850         c1Start += startIncr;
 4851         c2Start += startIncr;
 4852       }
 4853 
 4854       c2 /= 2;
 4855     }
 4856   }
 4857 
 4858   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 4859   // Implements the method
  4860   // static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
  4861   // of the Java class sun.security.provider.ML_DSA
 4862   //
 4863   // coeffs (int[256]) = c_rarg0
 4864   // zetas (int[256]) = c_rarg1
 4865   address generate_dilithiumAlmostNtt() {
 4866 
 4867     __ align(CodeEntryAlignment);
 4868     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
 4869     StubCodeMark mark(this, stub_id);
 4870     address start = __ pc();
 4871     __ enter();
 4872 
 4873     const Register coeffs = c_rarg0;
 4874     const Register zetas = c_rarg1;
 4875 
 4876     const Register tmpAddr = r9;
 4877     const Register dilithiumConsts = r10;
 4878     const Register result = r11;
 4879 
 4880     __ add(result, coeffs, 0);
 4881     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 4882 
 4883     // Each level represents one iteration of the outer for loop of the Java version
 4884 
 4885     // level 0-4
 4886     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 4887 
 4888     // level 5
 4889     for (int i = 0; i < 1024; i += 256) {
 4890       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 4891       __ ldr(v0, __ Q, Address(coeffs, i + 16));
 4892       __ ldr(v1, __ Q, Address(coeffs, i + 48));
 4893       __ ldr(v2, __ Q, Address(coeffs, i + 80));
 4894       __ ldr(v3, __ Q, Address(coeffs, i + 112));
 4895       __ ldr(v4, __ Q, Address(coeffs, i + 144));
 4896       __ ldr(v5, __ Q, Address(coeffs, i + 176));
 4897       __ ldr(v6, __ Q, Address(coeffs, i + 208));
 4898       __ ldr(v7, __ Q, Address(coeffs, i + 240));
 4899       dilithium_load32zetas(zetas);
 4900       dilithium_montmul32(false);
 4901       __ ldr(v0, __ Q, Address(coeffs, i));
 4902       __ ldr(v1, __ Q, Address(coeffs, i + 32));
 4903       __ ldr(v2, __ Q, Address(coeffs, i + 64));
 4904       __ ldr(v3, __ Q, Address(coeffs, i + 96));
 4905       __ ldr(v4, __ Q, Address(coeffs, i + 128));
 4906       __ ldr(v5, __ Q, Address(coeffs, i + 160));
 4907       __ ldr(v6, __ Q, Address(coeffs, i + 192));
 4908       __ ldr(v7, __ Q, Address(coeffs, i + 224));
 4909       dilithium_add_sub32();
 4910       __ str(v24, __ Q, Address(coeffs, i));
 4911       __ str(v25, __ Q, Address(coeffs, i + 32));
 4912       __ str(v26, __ Q, Address(coeffs, i + 64));
 4913       __ str(v27, __ Q, Address(coeffs, i + 96));
 4914       __ str(v28, __ Q, Address(coeffs, i + 128));
 4915       __ str(v29, __ Q, Address(coeffs, i + 160));
 4916       __ str(v30, __ Q, Address(coeffs, i + 192));
 4917       __ str(v31, __ Q, Address(coeffs, i + 224));
 4918       __ str(v0, __ Q, Address(coeffs, i + 16));
 4919       __ str(v1, __ Q, Address(coeffs, i + 48));
 4920       __ str(v2, __ Q, Address(coeffs, i + 80));
 4921       __ str(v3, __ Q, Address(coeffs, i + 112));
 4922       __ str(v4, __ Q, Address(coeffs, i + 144));
 4923       __ str(v5, __ Q, Address(coeffs, i + 176));
 4924       __ str(v6, __ Q, Address(coeffs, i + 208));
 4925       __ str(v7, __ Q, Address(coeffs, i + 240));
 4926     }
 4927 
 4928     // level 6
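           // At this level l == 2: 'j' and 'j+l' coefficients alternate in
           // memory as 64-bit pairs, so ld2/st2 with T2D de-interleave them,
           // putting the 'j' pairs in v0/v2/v4/v6 and the 'j+l' pairs in
           // v1/v3/v5/v7. Level 7 below does the same with T4S for l == 1,
           // where single 32-bit coefficients alternate.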
 4929     for (int i = 0; i < 1024; i += 128) {
 4930       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 4931       __ add(tmpAddr, coeffs, i);
 4932       __ ld2(v0, v1, __ T2D, tmpAddr);
 4933       __ add(tmpAddr, coeffs, i + 32);
 4934       __ ld2(v2, v3, __ T2D, tmpAddr);
 4935       __ add(tmpAddr, coeffs, i + 64);
 4936       __ ld2(v4, v5, __ T2D, tmpAddr);
 4937       __ add(tmpAddr, coeffs, i + 96);
 4938       __ ld2(v6, v7, __ T2D, tmpAddr);
 4939       dilithium_load16zetas(16, zetas);
 4940       dilithium_montmul_sub_add16();
 4941       __ add(tmpAddr, coeffs, i);
 4942       __ st2(v0, v1, __ T2D, tmpAddr);
 4943       __ add(tmpAddr, coeffs, i + 32);
 4944       __ st2(v2, v3, __ T2D, tmpAddr);
 4945       __ add(tmpAddr, coeffs, i + 64);
 4946       __ st2(v4, v5, __ T2D, tmpAddr);
 4947       __ add(tmpAddr, coeffs, i + 96);
 4948       __ st2(v6, v7, __ T2D, tmpAddr);
 4949     }
 4950 
 4951     // level 7
 4952     for (int i = 0; i < 1024; i += 128) {
 4953       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 4954       __ add(tmpAddr, coeffs, i);
 4955       __ ld2(v0, v1, __ T4S, tmpAddr);
 4956       __ add(tmpAddr, coeffs, i + 32);
 4957       __ ld2(v2, v3, __ T4S, tmpAddr);
 4958       __ add(tmpAddr, coeffs, i + 64);
 4959       __ ld2(v4, v5, __ T4S, tmpAddr);
 4960       __ add(tmpAddr, coeffs, i + 96);
 4961       __ ld2(v6, v7, __ T4S, tmpAddr);
 4962       dilithium_load16zetas(16, zetas);
 4963       dilithium_montmul_sub_add16();
 4964       __ add(tmpAddr, coeffs, i);
 4965       __ st2(v0, v1, __ T4S, tmpAddr);
 4966       __ add(tmpAddr, coeffs, i + 32);
 4967       __ st2(v2, v3, __ T4S, tmpAddr);
 4968       __ add(tmpAddr, coeffs, i + 64);
 4969       __ st2(v4, v5, __ T4S, tmpAddr);
 4970       __ add(tmpAddr, coeffs, i + 96);
 4971       __ st2(v6, v7, __ T4S, tmpAddr);
 4972     }
 4973     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4974     __ mov(r0, zr); // return 0
 4975     __ ret(lr);
 4976 
 4977     return start;
 4978 
 4979   }
 4980 
 4981   // Do the computations that can be found in the body of the loop in
 4982   // sun.security.provider.ML_DSA.implDilithiumAlmostInverseNttJava()
 4983   // for 16 coefficients in parallel:
 4984   // tmp = coeffs[j];
 4985   // coeffs[j] = (tmp + coeffs[j + l]);
 4986   // coeffs[j + l] = montMul(tmp - coeffs[j + l], -MONT_ZETAS_FOR_NTT[m]);
  4987   // coeffs[j]s are loaded in v0, v2, v4 and v6,
 4988   // coeffs[j + l]s in v1, v3, v5 and v7,
 4989   // the corresponding zetas in v16, v17, v18 and v19.
 4990   void dilithium_sub_add_montmul16() {
 4991     __ subv(v20, __ T4S, v0, v1);
 4992     __ subv(v21, __ T4S, v2, v3);
 4993     __ subv(v22, __ T4S, v4, v5);
 4994     __ subv(v23, __ T4S, v6, v7);
 4995 
 4996     __ addv(v0, __ T4S, v0, v1);
 4997     __ addv(v2, __ T4S, v2, v3);
 4998     __ addv(v4, __ T4S, v4, v5);
 4999     __ addv(v6, __ T4S, v6, v7);
 5000 
 5001     __ sqdmulh(v24, __ T4S, v20, v16); // aHigh = hi32(2 * b * c)
 5002     __ mulv(v1, __ T4S, v20, v16);     // aLow = lo32(b * c)
 5003     __ sqdmulh(v25, __ T4S, v21, v17);
 5004     __ mulv(v3, __ T4S, v21, v17);
 5005     __ sqdmulh(v26, __ T4S, v22, v18);
 5006     __ mulv(v5, __ T4S, v22, v18);
 5007     __ sqdmulh(v27, __ T4S, v23, v19);
 5008     __ mulv(v7, __ T4S, v23, v19);
 5009 
  5010     __ mulv(v1, __ T4S, v1, v30);      // m = aLow * qinv
 5011     __ mulv(v3, __ T4S, v3, v30);
 5012     __ mulv(v5, __ T4S, v5, v30);
 5013     __ mulv(v7, __ T4S, v7, v30);
 5014 
 5015     __ sqdmulh(v1, __ T4S, v1, v31);  // n = hi32(2 * m * q)
 5016     __ sqdmulh(v3, __ T4S, v3, v31);
 5017     __ sqdmulh(v5, __ T4S, v5, v31);
 5018     __ sqdmulh(v7, __ T4S, v7, v31);
 5019 
  5020     __ shsubv(v1, __ T4S, v24, v1);  // a = (aHigh - n) / 2
 5021     __ shsubv(v3, __ T4S, v25, v3);
 5022     __ shsubv(v5, __ T4S, v26, v5);
 5023     __ shsubv(v7, __ T4S, v27, v7);
 5024   }
 5025 
 5026   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 5027   // in the Java implementation come in sequences of at least 8, so we
 5028   // can use ldpq to collect the corresponding data into pairs of vector
  5029   // registers.
  5030   // We collect the coefficients that correspond to the 'j's into v0-v7
  5031   // and the coefficients that correspond to the 'j+l's into v16-v23, then
  5032   // do the additions into v24-v31 and the subtractions into v0-v7, then
  5033   // save the results of the additions, load the zetas into v16-v23,
  5034   // do the (Montgomery) multiplications by zeta in parallel into v16-v23,
  5035   // and finally save the results back to the coeffs array.
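         //
         // For orientation, the Java loop being vectorized has roughly this
         // shape (a sketch; see implDilithiumAlmostInverseNttJava() for the
         // authoritative version):
         //
         //   for (int l = 1; l < 256; l *= 2)           // one level per l
         //     for (int s = 0; s < 256; s += 2 * l)
         //       for (int j = s; j < s + l; j++) {
         //         int tmp = coeffs[j];
         //         coeffs[j] = tmp + coeffs[j + l];
         //         coeffs[j + l] = montMul(tmp - coeffs[j + l],
         //                                 -MONT_ZETAS_FOR_NTT[m++]);
         //       }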
 5036   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 5037     const Register coeffs, const Register zetas) {
 5038     int c1 = 0;
 5039     int c2 = 32;
 5040     int startIncr;
 5041     int incr1;
 5042     int incr2;
 5043     int incr3;
 5044 
 5045     for (int level = 3; level < 8; level++) {
 5046       int c1Start = c1;
 5047       int c2Start = c2;
 5048       if (level == 3) {
 5049         incr1 = 64;
 5050         incr2 = 128;
 5051         incr3 = 192;
 5052       } else if (level == 4) {
 5053         incr1 = 32;
 5054         incr2 = 128;
 5055         incr3 = 160;
 5056       } else {
 5057         incr1 = 32;
 5058         incr2 = 64;
 5059         incr3 = 96;
 5060       }
 5061 
 5062       for (int i = 0; i < 4; i++) {
 5063         __ ldpq(v0, v1, Address(coeffs, c1Start));
 5064         __ ldpq(v2, v3, Address(coeffs, c1Start + incr1));
 5065         __ ldpq(v4, v5, Address(coeffs, c1Start + incr2));
 5066         __ ldpq(v6, v7, Address(coeffs, c1Start + incr3));
 5067         __ ldpq(v16, v17, Address(coeffs, c2Start));
 5068         __ ldpq(v18, v19, Address(coeffs, c2Start + incr1));
 5069         __ ldpq(v20, v21, Address(coeffs, c2Start + incr2));
 5070         __ ldpq(v22, v23, Address(coeffs, c2Start + incr3));
 5071         dilithium_add_sub32();
 5072         __ stpq(v24, v25, Address(coeffs, c1Start));
 5073         __ stpq(v26, v27, Address(coeffs, c1Start + incr1));
 5074         __ stpq(v28, v29, Address(coeffs, c1Start + incr2));
 5075         __ stpq(v30, v31, Address(coeffs, c1Start + incr3));
 5076         __ ldpq(v30, v31, Address(dilithiumConsts, 0));   // qInv, q
 5077         dilithium_load32zetas(zetas);
 5078         dilithium_montmul32(false);
 5079         __ stpq(v16, v17, Address(coeffs, c2Start));
 5080         __ stpq(v18, v19, Address(coeffs, c2Start + incr1));
 5081         __ stpq(v20, v21, Address(coeffs, c2Start + incr2));
 5082         __ stpq(v22, v23, Address(coeffs, c2Start + incr3));
 5083 
 5084         int k = 4 * level + i;
 5085 
 5086         if (k < 24) {
 5087           startIncr = 256;
 5088         } else if (k == 25) {
 5089           startIncr = 384;
 5090         } else {
 5091           startIncr = 128;
 5092         }
 5093 
 5094         c1Start += startIncr;
 5095         c2Start += startIncr;
 5096       }
 5097 
 5098       c2 *= 2;
 5099     }
 5100   }
 5101 
  5102   // Dilithium Inverse NTT function, except for the final mod Q division by 2^256.
 5103   // Implements the method
 5104   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 5105   // the sun.security.provider.ML_DSA class.
 5106   //
 5107   // coeffs (int[256]) = c_rarg0
 5108   // zetas (int[256]) = c_rarg1
 5109   address generate_dilithiumAlmostInverseNtt() {
 5110 
 5111     __ align(CodeEntryAlignment);
 5112     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id;
 5113     StubCodeMark mark(this, stub_id);
 5114     address start = __ pc();
 5115     __ enter();
 5116 
 5117     const Register coeffs = c_rarg0;
 5118     const Register zetas = c_rarg1;
 5119 
 5120     const Register tmpAddr = r9;
 5121     const Register dilithiumConsts = r10;
 5122     const Register result = r11;
 5123 
 5124     __ add(result, coeffs, 0);
 5125     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5126 
 5127     // Each level represents one iteration of the outer for loop of the Java version
 5128     // level0
 5129     for (int i = 0; i < 1024; i += 128) {
 5130       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 5131       __ add(tmpAddr, coeffs, i);
 5132       __ ld2(v0, v1, __ T4S, tmpAddr);
 5133       __ add(tmpAddr, coeffs, i + 32);
 5134       __ ld2(v2, v3, __ T4S, tmpAddr);
 5135       __ add(tmpAddr, coeffs, i + 64);
 5136       __ ld2(v4, v5, __ T4S, tmpAddr);
 5137       __ add(tmpAddr, coeffs, i + 96);
 5138       __ ld2(v6, v7, __ T4S, tmpAddr);
 5139       dilithium_load16zetas(16, zetas);
 5140       dilithium_sub_add_montmul16();
 5141       __ add(tmpAddr, coeffs, i);
 5142       __ st2(v0, v1, __ T4S, tmpAddr);
 5143       __ add(tmpAddr, coeffs, i + 32);
 5144       __ st2(v2, v3, __ T4S, tmpAddr);
 5145       __ add(tmpAddr, coeffs, i + 64);
 5146       __ st2(v4, v5, __ T4S, tmpAddr);
 5147       __ add(tmpAddr, coeffs, i + 96);
 5148       __ st2(v6, v7, __ T4S, tmpAddr);
 5149     }
 5150 
 5151     // level 1
 5152     for (int i = 0; i < 1024; i += 128) {
 5153       __ add(tmpAddr, coeffs, i);
 5154       __ ld2(v0, v1, __ T2D, tmpAddr);
 5155       __ add(tmpAddr, coeffs, i + 32);
 5156       __ ld2(v2, v3, __ T2D, tmpAddr);
 5157       __ add(tmpAddr, coeffs, i + 64);
 5158       __ ld2(v4, v5, __ T2D, tmpAddr);
 5159       __ add(tmpAddr, coeffs, i + 96);
 5160       __ ld2(v6, v7, __ T2D, tmpAddr);
 5161       dilithium_load16zetas(16, zetas);
 5162       dilithium_sub_add_montmul16();
 5163       __ add(tmpAddr, coeffs, i);
 5164       __ st2(v0, v1, __ T2D, tmpAddr);
 5165       __ add(tmpAddr, coeffs, i + 32);
 5166       __ st2(v2, v3, __ T2D, tmpAddr);
 5167       __ add(tmpAddr, coeffs, i + 64);
 5168       __ st2(v4, v5, __ T2D, tmpAddr);
 5169       __ add(tmpAddr, coeffs, i + 96);
 5170       __ st2(v6, v7, __ T2D, tmpAddr);
 5171     }
 5172 
 5173     //level 2
 5174     for (int i = 0; i < 1024; i += 256) {
 5175       __ ldr(v0, __ Q, Address(coeffs, i));
 5176       __ ldr(v1, __ Q, Address(coeffs, i + 32));
 5177       __ ldr(v2, __ Q, Address(coeffs, i + 64));
 5178       __ ldr(v3, __ Q, Address(coeffs, i + 96));
 5179       __ ldr(v4, __ Q, Address(coeffs, i + 128));
 5180       __ ldr(v5, __ Q, Address(coeffs, i + 160));
 5181       __ ldr(v6, __ Q, Address(coeffs, i + 192));
 5182       __ ldr(v7, __ Q, Address(coeffs, i + 224));
 5183       __ ldr(v16, __ Q, Address(coeffs, i + 16));
 5184       __ ldr(v17, __ Q, Address(coeffs, i + 48));
 5185       __ ldr(v18, __ Q, Address(coeffs, i + 80));
 5186       __ ldr(v19, __ Q, Address(coeffs, i + 112));
 5187       __ ldr(v20, __ Q, Address(coeffs, i + 144));
 5188       __ ldr(v21, __ Q, Address(coeffs, i + 176));
 5189       __ ldr(v22, __ Q, Address(coeffs, i + 208));
 5190       __ ldr(v23, __ Q, Address(coeffs, i + 240));
 5191       dilithium_add_sub32();
 5192       __ str(v24, __ Q, Address(coeffs, i));
 5193       __ str(v25, __ Q, Address(coeffs, i + 32));
 5194       __ str(v26, __ Q, Address(coeffs, i + 64));
 5195       __ str(v27, __ Q, Address(coeffs, i + 96));
 5196       __ str(v28, __ Q, Address(coeffs, i + 128));
 5197       __ str(v29, __ Q, Address(coeffs, i + 160));
 5198       __ str(v30, __ Q, Address(coeffs, i + 192));
 5199       __ str(v31, __ Q, Address(coeffs, i + 224));
 5200       dilithium_load32zetas(zetas);
 5201       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 5202       dilithium_montmul32(false);
 5203       __ str(v16, __ Q, Address(coeffs, i + 16));
 5204       __ str(v17, __ Q, Address(coeffs, i + 48));
 5205       __ str(v18, __ Q, Address(coeffs, i + 80));
 5206       __ str(v19, __ Q, Address(coeffs, i + 112));
 5207       __ str(v20, __ Q, Address(coeffs, i + 144));
 5208       __ str(v21, __ Q, Address(coeffs, i + 176));
 5209       __ str(v22, __ Q, Address(coeffs, i + 208));
 5210       __ str(v23, __ Q, Address(coeffs, i + 240));
 5211     }
 5212 
 5213     // level 3-7
 5214     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 5215 
 5216     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5217     __ mov(r0, zr); // return 0
 5218     __ ret(lr);
 5219 
 5220     return start;
 5221 
 5222   }
 5223 
 5224   // Dilithium multiply polynomials in the NTT domain.
 5225   // Straightforward implementation of the method
  5226   // static int implDilithiumNttMult(
  5227   //              int[] result, int[] ntta, int[] nttb) {} of
 5228   // the sun.security.provider.ML_DSA class.
 5229   //
 5230   // result (int[256]) = c_rarg0
 5231   // poly1 (int[256]) = c_rarg1
 5232   // poly2 (int[256]) = c_rarg2
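         //
         // Each loop iteration multiplies 32 coefficient pairs: the first
         // dilithium_montmul32(false) pass computes montMul(poly1[i], poly2[i]),
         // and the second, by-constant pass multiplies by rSquare (loaded into
         // v29), cancelling the 2^-32 factor left by the first pass, so that
         // roughly result[i] = montMul(montMul(poly1[i], poly2[i]), rSquare).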
 5233   address generate_dilithiumNttMult() {
 5234 
 5235     __ align(CodeEntryAlignment);
 5236     StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id;
 5237     StubCodeMark mark(this, stub_id);
 5238     address start = __ pc();
 5239     __ enter();
 5240 
 5241     Label L_loop;
 5242 
 5243     const Register result = c_rarg0;
 5244     const Register poly1 = c_rarg1;
 5245     const Register poly2 = c_rarg2;
 5246 
 5247     const Register dilithiumConsts = r10;
 5248     const Register len = r11;
 5249 
 5250     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5251 
 5252     __ ldpq(v30, v31, Address(dilithiumConsts, 0));   // qInv, q
 5253     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 5254 
 5255     __ mov(len, zr);
 5256     __ add(len, len, 1024);
 5257 
 5258     __ BIND(L_loop);
 5259 
 5260     __ ldpq(v0, v1, __ post(poly1, 32));
 5261     __ ldpq(v2, v3, __ post(poly1, 32));
 5262     __ ldpq(v4, v5, __ post(poly1, 32));
 5263     __ ldpq(v6, v7, __ post(poly1, 32));
 5264     __ ldpq(v16, v17, __ post(poly2, 32));
 5265     __ ldpq(v18, v19, __ post(poly2, 32));
 5266     __ ldpq(v20, v21, __ post(poly2, 32));
 5267     __ ldpq(v22, v23, __ post(poly2, 32));
 5268     dilithium_montmul32(false);
 5269     dilithium_montmul32(true);
 5270     __ stpq(v16, v17, __ post(result, 32));
 5271     __ stpq(v18, v19, __ post(result, 32));
 5272     __ stpq(v20, v21, __ post(result, 32));
 5273     __ stpq(v22, v23, __ post(result, 32));
 5274 
 5275     __ sub(len, len, 128);
 5276     __ cmp(len, (u1)128);
 5277     __ br(Assembler::GE, L_loop);
 5278 
 5279     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5280     __ mov(r0, zr); // return 0
 5281     __ ret(lr);
 5282 
 5283     return start;
 5284 
 5285   }
 5286 
  5287   // Dilithium Montgomery multiply an array by a constant.
 5288   // A straightforward implementation of the method
 5289   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  5290   // of the sun.security.provider.ML_DSA class
 5291   //
 5292   // coeffs (int[256]) = c_rarg0
 5293   // constant (int) = c_rarg1
 5294   address generate_dilithiumMontMulByConstant() {
 5295 
 5296     __ align(CodeEntryAlignment);
 5297     StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id;
 5298     StubCodeMark mark(this, stub_id);
 5299     address start = __ pc();
 5300     __ enter();
 5301 
 5302     Label L_loop;
 5303 
 5304     const Register coeffs = c_rarg0;
 5305     const Register constant = c_rarg1;
 5306 
 5307     const Register dilithiumConsts = r10;
 5308     const Register result = r11;
 5309     const Register len = r12;
 5310 
 5311     __ add(result, coeffs, 0);
 5312     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5313 
 5314     __ ldpq(v30, v31, Address(dilithiumConsts, 0));   // qInv, q
 5315     __ dup(v29, __ T4S, constant);
 5316     __ mov(len, zr);
 5317     __ add(len, len, 1024);
 5318 
 5319     __ BIND(L_loop);
 5320 
 5321     __ ldpq(v16, v17, __ post(coeffs, 32));
 5322     __ ldpq(v18, v19, __ post(coeffs, 32));
 5323     __ ldpq(v20, v21, __ post(coeffs, 32));
 5324     __ ldpq(v22, v23, __ post(coeffs, 32));
 5325     dilithium_montmul32(true);
 5326     __ stpq(v16, v17, __ post(result, 32));
 5327     __ stpq(v18, v19, __ post(result, 32));
 5328     __ stpq(v20, v21, __ post(result, 32));
 5329     __ stpq(v22, v23, __ post(result, 32));
 5330 
 5331     __ sub(len, len, 128);
 5332     __ cmp(len, (u1)128);
 5333     __ br(Assembler::GE, L_loop);
 5334 
 5335     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5336     __ mov(r0, zr); // return 0
 5337     __ ret(lr);
 5338 
 5339     return start;
 5340   }
 5341 
 5342   // Dilithium decompose poly.
 5343   // Implements the method
  5344   // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {}
 5345   // of the sun.security.provider.ML_DSA class
 5346   //
 5347   // input (int[256]) = c_rarg0
 5348   // lowPart (int[256]) = c_rarg1
 5349   // highPart (int[256]) = c_rarg2
 5350   // twoGamma2  (int) = c_rarg3
 5351   // multiplier (int) = c_rarg4
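         //
         // For reference, the per-coefficient computation, assembled from the
         // step comments in the loop below (a scalar sketch; q = dilithium_q):
         //
         //   int rplus = input[m];
         //   rplus -= ((rplus + 5373807) >> 23) * q;    // reduce mod q
         //   rplus += (rplus >> 31) & q;                // make non-negative
         //   int quotient = (rplus * multiplier) >> 22;
         //   int r0 = rplus - quotient * twoGamma2;
         //   int mask = (twoGamma2 - r0) >> 22;
         //   r0 -= mask & twoGamma2;
         //   quotient += mask & 1;
         //   mask = (twoGamma2 / 2 - r0) >> 31;
         //   r0 -= mask & twoGamma2;
         //   quotient += mask & 1;
         //   int r1 = rplus - r0 - (q - 1);
         //   r1 = (r1 | -r1) >> 31;  // 0 if rplus - r0 == q - 1, -1 otherwise
         //   r0 += ~r1;
         //   r1 &= quotient;
         //   lowPart[m] = r0;
         //   highPart[m] = r1;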
 5352   address generate_dilithiumDecomposePoly() {
 5353 
 5354     __ align(CodeEntryAlignment);
 5355     StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id;
 5356     StubCodeMark mark(this, stub_id);
 5357     address start = __ pc();
 5358     __ enter();
 5359 
 5360     Label L_loop;
 5361 
 5362     const Register input = c_rarg0;
 5363     const Register lowPart = c_rarg1;
 5364     const Register highPart = c_rarg2;
 5365     const Register twoGamma2 = c_rarg3;
 5366     const Register multiplier = c_rarg4;
 5367 
 5368     const Register len = r9;
 5369     const Register dilithiumConsts = r10;
 5370     const Register tmp = r11;
 5371 
 5372     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5373 
 5374     // save callee-saved registers
 5375     __ stpd(v8, v9, __ pre(sp, -64));
 5376     __ stpd(v10, v11, Address(sp, 16));
 5377     __ stpd(v12, v13, Address(sp, 32));
 5378     __ stpd(v14, v15, Address(sp, 48));
 5379 
 5381     __ mov(tmp, zr);
 5382     __ add(tmp, tmp, 1);
 5383     __ dup(v25, __ T4S, tmp); // 1
 5384     __ ldr(v30, __ Q, Address(dilithiumConsts, 16)); // q
 5385     __ ldr(v31, __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 5386     __ dup(v28, __ T4S, twoGamma2); // 2 * gamma2
 5387     __ dup(v29, __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 5388     __ subv(v26, __ T4S, v30, v25); // q - 1
 5389     __ sshr(v27, __ T4S, v28, 1); // gamma2
 5390 
 5391     __ mov(len, zr);
 5392     __ add(len, len, 1024);
 5393 
 5394     __ BIND(L_loop);
 5395 
 5396     __ ld4(v0, v1, v2, v3, __ T4S, __ post(input, 64));
 5397 
 5398     // rplus in v0
 5399     //  rplus = rplus - ((rplus + 5373807) >> 23) * dilithium_q;
 5400     __ addv(v4, __ T4S, v0, v31);
 5401     __ addv(v5, __ T4S, v1, v31);
 5402     __ addv(v6, __ T4S, v2, v31);
 5403     __ addv(v7, __ T4S, v3, v31);
 5404 
 5405     __ sshr(v4, __ T4S, v4, 23);
 5406     __ sshr(v5, __ T4S, v5, 23);
 5407     __ sshr(v6, __ T4S, v6, 23);
 5408     __ sshr(v7, __ T4S, v7, 23);
 5409 
 5410     __ mulv(v4, __ T4S, v4, v30);
 5411     __ mulv(v5, __ T4S, v5, v30);
 5412     __ mulv(v6, __ T4S, v6, v30);
 5413     __ mulv(v7, __ T4S, v7, v30);
 5414 
 5415     __ subv(v0, __ T4S, v0, v4);
 5416     __ subv(v1, __ T4S, v1, v5);
 5417     __ subv(v2, __ T4S, v2, v6);
 5418     __ subv(v3, __ T4S, v3, v7);
 5419 
 5420     // rplus in v0
 5421     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 5422     __ sshr(v4, __ T4S, v0, 31);
 5423     __ sshr(v5, __ T4S, v1, 31);
 5424     __ sshr(v6, __ T4S, v2, 31);
 5425     __ sshr(v7, __ T4S, v3, 31);
 5426 
 5427     __ andr(v4, __ T16B, v4, v30);
 5428     __ andr(v5, __ T16B, v5, v30);
 5429     __ andr(v6, __ T16B, v6, v30);
 5430     __ andr(v7, __ T16B, v7, v30);
 5431 
 5432     __ addv(v0, __ T4S, v0, v4);
 5433     __ addv(v1, __ T4S, v1, v5);
 5434     __ addv(v2, __ T4S, v2, v6);
 5435     __ addv(v3, __ T4S, v3, v7);
 5436 
 5437     // rplus in v0
 5438     // int quotient = (rplus * multiplier) >> 22;
 5439     __ mulv(v4, __ T4S, v0, v29);
 5440     __ mulv(v5, __ T4S, v1, v29);
 5441     __ mulv(v6, __ T4S, v2, v29);
 5442     __ mulv(v7, __ T4S, v3, v29);
 5443 
 5444     __ sshr(v4, __ T4S, v4, 22);
 5445     __ sshr(v5, __ T4S, v5, 22);
 5446     __ sshr(v6, __ T4S, v6, 22);
 5447     __ sshr(v7, __ T4S, v7, 22);
 5448 
 5449     // quotient in v4
 5450     // int r0 = rplus - quotient * twoGamma2;
 5451     __ mulv(v8, __ T4S, v4, v28);
 5452     __ mulv(v9, __ T4S, v5, v28);
 5453     __ mulv(v10, __ T4S, v6, v28);
 5454     __ mulv(v11, __ T4S, v7, v28);
 5455 
 5456     __ subv(v8, __ T4S, v0, v8);
 5457     __ subv(v9, __ T4S, v1, v9);
 5458     __ subv(v10, __ T4S, v2, v10);
 5459     __ subv(v11, __ T4S, v3, v11);
 5460 
 5461     // r0 in v8
 5462     // int mask = (twoGamma2 - r0) >> 22;
 5463     __ subv(v12, __ T4S, v28, v8);
 5464     __ subv(v13, __ T4S, v28, v9);
 5465     __ subv(v14, __ T4S, v28, v10);
 5466     __ subv(v15, __ T4S, v28, v11);
 5467 
 5468     __ sshr(v12, __ T4S, v12, 22);
 5469     __ sshr(v13, __ T4S, v13, 22);
 5470     __ sshr(v14, __ T4S, v14, 22);
 5471     __ sshr(v15, __ T4S, v15, 22);
 5472 
 5473     // mask in v12
 5474     // r0 -= (mask & twoGamma2);
 5475     __ andr(v16, __ T16B, v12, v28);
 5476     __ andr(v17, __ T16B, v13, v28);
 5477     __ andr(v18, __ T16B, v14, v28);
 5478     __ andr(v19, __ T16B, v15, v28);
 5479 
 5480     __ subv(v8, __ T4S, v8, v16);
 5481     __ subv(v9, __ T4S, v9, v17);
 5482     __ subv(v10, __ T4S, v10, v18);
 5483     __ subv(v11, __ T4S, v11, v19);
 5484 
 5485     // r0 in v8
 5486     //  quotient += (mask & 1);
 5487     __ andr(v16, __ T16B, v12, v25);
 5488     __ andr(v17, __ T16B, v13, v25);
 5489     __ andr(v18, __ T16B, v14, v25);
 5490     __ andr(v19, __ T16B, v15, v25);
 5491 
 5492     __ addv(v4, __ T4S, v4, v16);
 5493     __ addv(v5, __ T4S, v5, v17);
 5494     __ addv(v6, __ T4S, v6, v18);
 5495     __ addv(v7, __ T4S, v7, v19);
 5496 
 5497     // mask = (twoGamma2 / 2 - r0) >> 31;
 5498     __ subv(v12, __ T4S, v27, v8);
 5499     __ subv(v13, __ T4S, v27, v9);
 5500     __ subv(v14, __ T4S, v27, v10);
 5501     __ subv(v15, __ T4S, v27, v11);
 5502 
 5503     __ sshr(v12, __ T4S, v12, 31);
 5504     __ sshr(v13, __ T4S, v13, 31);
 5505     __ sshr(v14, __ T4S, v14, 31);
 5506     __ sshr(v15, __ T4S, v15, 31);
 5507 
 5508     // r0 -= (mask & twoGamma2);
 5509     __ andr(v16, __ T16B, v12, v28);
 5510     __ andr(v17, __ T16B, v13, v28);
 5511     __ andr(v18, __ T16B, v14, v28);
 5512     __ andr(v19, __ T16B, v15, v28);
 5513 
 5514     __ subv(v8, __ T4S, v8, v16);
 5515     __ subv(v9, __ T4S, v9, v17);
 5516     __ subv(v10, __ T4S, v10, v18);
 5517     __ subv(v11, __ T4S, v11, v19);
 5518 
 5519     // quotient += (mask & 1);
 5520     __ andr(v16, __ T16B, v12, v25);
 5521     __ andr(v17, __ T16B, v13, v25);
 5522     __ andr(v18, __ T16B, v14, v25);
 5523     __ andr(v19, __ T16B, v15, v25);
 5524 
 5525     __ addv(v4, __ T4S, v4, v16);
 5526     __ addv(v5, __ T4S, v5, v17);
 5527     __ addv(v6, __ T4S, v6, v18);
 5528     __ addv(v7, __ T4S, v7, v19);
 5529 
 5530     // int r1 = rplus - r0 - (dilithium_q - 1);
 5531     __ subv(v16, __ T4S, v0, v8);
 5532     __ subv(v17, __ T4S, v1, v9);
 5533     __ subv(v18, __ T4S, v2, v10);
 5534     __ subv(v19, __ T4S, v3, v11);
 5535 
 5536     __ subv(v16, __ T4S, v16, v26);
 5537     __ subv(v17, __ T4S, v17, v26);
 5538     __ subv(v18, __ T4S, v18, v26);
 5539     __ subv(v19, __ T4S, v19, v26);
 5540 
 5541     // r1 in v16
 5542     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 5543     __ negr(v20, __ T4S, v16);
 5544     __ negr(v21, __ T4S, v17);
 5545     __ negr(v22, __ T4S, v18);
 5546     __ negr(v23, __ T4S, v19);
 5547 
 5548     __ orr(v16, __ T16B, v16, v20);
 5549     __ orr(v17, __ T16B, v17, v21);
 5550     __ orr(v18, __ T16B, v18, v22);
 5551     __ orr(v19, __ T16B, v19, v23);
 5552 
 5553     __ sshr(v0, __ T4S, v16, 31);
 5554     __ sshr(v1, __ T4S, v17, 31);
 5555     __ sshr(v2, __ T4S, v18, 31);
 5556     __ sshr(v3, __ T4S, v19, 31);
 5557 
 5558     // r1 in v0
 5559     // r0 += ~r1;
 5560     __ notr(v20, __ T16B, v0);
 5561     __ notr(v21, __ T16B, v1);
 5562     __ notr(v22, __ T16B, v2);
 5563     __ notr(v23, __ T16B, v3);
 5564 
 5565     __ addv(v8, __ T4S, v8, v20);
 5566     __ addv(v9, __ T4S, v9, v21);
 5567     __ addv(v10, __ T4S, v10, v22);
 5568     __ addv(v11, __ T4S, v11, v23);
 5569 
 5570     // r0 in v8
 5571     // r1 = r1 & quotient;
 5572     __ andr(v0, __ T16B, v4, v0);
 5573     __ andr(v1, __ T16B, v5, v1);
 5574     __ andr(v2, __ T16B, v6, v2);
 5575     __ andr(v3, __ T16B, v7, v3);
 5576 
 5577     // r1 in v0
 5578     // lowPart[m] = r0;
 5579     // highPart[m] = r1;
 5580     __ st4(v8, v9, v10, v11, __ T4S, __ post(lowPart, 64));
 5581     __ st4(v0, v1, v2, v3, __ T4S, __ post(highPart, 64));
 5582 
 5584     __ sub(len, len, 64);
 5585     __ cmp(len, (u1)64);
 5586     __ br(Assembler::GE, L_loop);
 5587 
 5588     // restore callee-saved vector registers
 5589     __ ldpd(v14, v15, Address(sp, 48));
 5590     __ ldpd(v12, v13, Address(sp, 32));
 5591     __ ldpd(v10, v11, Address(sp, 16));
 5592     __ ldpd(v8, v9, __ post(sp, 64));
 5593 
 5594     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5595     __ mov(r0, zr); // return 0
 5596     __ ret(lr);
 5597 
 5598     return start;
 5599   }
 5600 
 5601   /**
 5602    *  Arguments:
 5603    *
 5604    * Inputs:
 5605    *   c_rarg0   - int crc
 5606    *   c_rarg1   - byte* buf
 5607    *   c_rarg2   - int length
 5608    *   c_rarg3   - int* table
 5609    *
 5610    * Output:
 5611    *       r0   - int crc result
 5612    */
 5613   address generate_updateBytesCRC32C() {
 5614     assert(UseCRC32CIntrinsics, "what are we doing here?");
 5615 
 5616     __ align(CodeEntryAlignment);
 5617     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id;
 5618     StubCodeMark mark(this, stub_id);
 5619 
 5620     address start = __ pc();
 5621 
 5622     const Register crc   = c_rarg0;  // crc
 5623     const Register buf   = c_rarg1;  // source java byte array address
 5624     const Register len   = c_rarg2;  // length
 5625     const Register table0 = c_rarg3; // crc_table address
 5626     const Register table1 = c_rarg4;
 5627     const Register table2 = c_rarg5;
 5628     const Register table3 = c_rarg6;
 5629     const Register tmp3 = c_rarg7;
 5630 
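          // CRC32C differs from CRC32 in using the Castagnoli polynomial
          // (0x1EDC6F41); the actual computation is delegated to
          // MacroAssembler::kernel_crc32c.
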
 5631     BLOCK_COMMENT("Entry:");
 5632     __ enter(); // required for proper stackwalking of RuntimeStub frame
 5633 
 5634     __ kernel_crc32c(crc, buf, len,
 5635               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 5636 
 5637     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5638     __ ret(lr);
 5639 
 5640     return start;
 5641   }
 5642 
 5643   /**
 5644    *  Arguments:
 5645    *
 5646    *  Inputs:
 5647    *   c_rarg0   - int   adler
 5648    *   c_rarg1   - byte* buff
 5649    *   c_rarg2   - int   len
 5650    *
 5651    * Output:
 5652    *   c_rarg0   - int adler result
 5653    */
 5654   address generate_updateBytesAdler32() {
 5655     __ align(CodeEntryAlignment);
 5656     StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
 5657     StubCodeMark mark(this, stub_id);
 5658     address start = __ pc();
 5659 
 5660     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 5661 
 5662     // Aliases
 5663     Register adler  = c_rarg0;
 5664     Register s1     = c_rarg0;
 5665     Register s2     = c_rarg3;
 5666     Register buff   = c_rarg1;
 5667     Register len    = c_rarg2;
 5668     Register nmax  = r4;
 5669     Register base  = r5;
 5670     Register count = r6;
 5671     Register temp0 = rscratch1;
 5672     Register temp1 = rscratch2;
 5673     FloatRegister vbytes = v0;
 5674     FloatRegister vs1acc = v1;
 5675     FloatRegister vs2acc = v2;
 5676     FloatRegister vtable = v3;
 5677 
 5678     // Max number of bytes we can process before having to take the mod
 5679     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 5680     uint64_t BASE = 0xfff1;
 5681     uint64_t NMAX = 0x15B0;
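
          // BASE = 65521 is prime and 2^16 mod BASE == 15, which the reduction
          // sequences below exploit: a sum s is folded as
          //   s = (s >> 16) * 15 + (s & 0xffff)
          // computed via ((s >> 16) << 4) - (s >> 16) + (s & 0xffff); a final
          // conditional subtract then brings the result below BASE.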
 5682 
 5683     __ mov(base, BASE);
 5684     __ mov(nmax, NMAX);
 5685 
 5686     // Load accumulation coefficients for the upper 16 bits
 5687     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 5688     __ ld1(vtable, __ T16B, Address(temp0));
 5689 
 5690     // s1 is initialized to the lower 16 bits of adler
 5691     // s2 is initialized to the upper 16 bits of adler
 5692     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 5693     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 5694 
 5695     // The pipelined loop needs at least 16 elements for 1 iteration
 5696     // It does check this, but it is more efficient to skip to the cleanup loop
 5697     __ cmp(len, (u1)16);
 5698     __ br(Assembler::HS, L_nmax);
 5699     __ cbz(len, L_combine);
 5700 
 5701     __ bind(L_simple_by1_loop);
 5702     __ ldrb(temp0, Address(__ post(buff, 1)));
 5703     __ add(s1, s1, temp0);
 5704     __ add(s2, s2, s1);
 5705     __ subs(len, len, 1);
 5706     __ br(Assembler::HI, L_simple_by1_loop);
 5707 
 5708     // s1 = s1 % BASE
 5709     __ subs(temp0, s1, base);
 5710     __ csel(s1, temp0, s1, Assembler::HS);
 5711 
 5712     // s2 = s2 % BASE
 5713     __ lsr(temp0, s2, 16);
 5714     __ lsl(temp1, temp0, 4);
 5715     __ sub(temp1, temp1, temp0);
 5716     __ add(s2, temp1, s2, ext::uxth);
 5717 
 5718     __ subs(temp0, s2, base);
 5719     __ csel(s2, temp0, s2, Assembler::HS);
 5720 
 5721     __ b(L_combine);
 5722 
 5723     __ bind(L_nmax);
 5724     __ subs(len, len, nmax);
 5725     __ sub(count, nmax, 16);
 5726     __ br(Assembler::LO, L_by16);
 5727 
 5728     __ bind(L_nmax_loop);
 5729 
 5730     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 5731                                       vbytes, vs1acc, vs2acc, vtable);
 5732 
 5733     __ subs(count, count, 16);
 5734     __ br(Assembler::HS, L_nmax_loop);
 5735 
 5736     // s1 = s1 % BASE
 5737     __ lsr(temp0, s1, 16);
 5738     __ lsl(temp1, temp0, 4);
 5739     __ sub(temp1, temp1, temp0);
 5740     __ add(temp1, temp1, s1, ext::uxth);
 5741 
 5742     __ lsr(temp0, temp1, 16);
 5743     __ lsl(s1, temp0, 4);
 5744     __ sub(s1, s1, temp0);
 5745     __ add(s1, s1, temp1, ext::uxth);
 5746 
 5747     __ subs(temp0, s1, base);
 5748     __ csel(s1, temp0, s1, Assembler::HS);
 5749 
 5750     // s2 = s2 % BASE
 5751     __ lsr(temp0, s2, 16);
 5752     __ lsl(temp1, temp0, 4);
 5753     __ sub(temp1, temp1, temp0);
 5754     __ add(temp1, temp1, s2, ext::uxth);
 5755 
 5756     __ lsr(temp0, temp1, 16);
 5757     __ lsl(s2, temp0, 4);
 5758     __ sub(s2, s2, temp0);
 5759     __ add(s2, s2, temp1, ext::uxth);
 5760 
 5761     __ subs(temp0, s2, base);
 5762     __ csel(s2, temp0, s2, Assembler::HS);
 5763 
 5764     __ subs(len, len, nmax);
 5765     __ sub(count, nmax, 16);
 5766     __ br(Assembler::HS, L_nmax_loop);
 5767 
 5768     __ bind(L_by16);
 5769     __ adds(len, len, count);
 5770     __ br(Assembler::LO, L_by1);
 5771 
 5772     __ bind(L_by16_loop);
 5773 
 5774     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 5775                                       vbytes, vs1acc, vs2acc, vtable);
 5776 
 5777     __ subs(len, len, 16);
 5778     __ br(Assembler::HS, L_by16_loop);
 5779 
 5780     __ bind(L_by1);
 5781     __ adds(len, len, 15);
 5782     __ br(Assembler::LO, L_do_mod);
 5783 
 5784     __ bind(L_by1_loop);
 5785     __ ldrb(temp0, Address(__ post(buff, 1)));
 5786     __ add(s1, temp0, s1);
 5787     __ add(s2, s2, s1);
 5788     __ subs(len, len, 1);
 5789     __ br(Assembler::HS, L_by1_loop);
 5790 
 5791     __ bind(L_do_mod);
 5792     // s1 = s1 % BASE
 5793     __ lsr(temp0, s1, 16);
 5794     __ lsl(temp1, temp0, 4);
 5795     __ sub(temp1, temp1, temp0);
 5796     __ add(temp1, temp1, s1, ext::uxth);
 5797 
 5798     __ lsr(temp0, temp1, 16);
 5799     __ lsl(s1, temp0, 4);
 5800     __ sub(s1, s1, temp0);
 5801     __ add(s1, s1, temp1, ext::uxth);
 5802 
 5803     __ subs(temp0, s1, base);
 5804     __ csel(s1, temp0, s1, Assembler::HS);
 5805 
 5806     // s2 = s2 % BASE
 5807     __ lsr(temp0, s2, 16);
 5808     __ lsl(temp1, temp0, 4);
 5809     __ sub(temp1, temp1, temp0);
 5810     __ add(temp1, temp1, s2, ext::uxth);
 5811 
 5812     __ lsr(temp0, temp1, 16);
 5813     __ lsl(s2, temp0, 4);
 5814     __ sub(s2, s2, temp0);
 5815     __ add(s2, s2, temp1, ext::uxth);
 5816 
 5817     __ subs(temp0, s2, base);
 5818     __ csel(s2, temp0, s2, Assembler::HS);
 5819 
 5820     // Combine lower bits and higher bits
 5821     __ bind(L_combine);
 5822     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 5823 
 5824     __ ret(lr);
 5825 
 5826     return start;
 5827   }
 5828 
 5829   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 5830           Register temp0, Register temp1, FloatRegister vbytes,
 5831           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 5832     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 5833     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 5834     // In non-vectorized code, we update s1 and s2 as:
 5835     //   s1 <- s1 + b1
 5836     //   s2 <- s2 + s1
 5837     //   s1 <- s1 + b2
 5838     //   s2 <- s2 + s1
 5839     //   ...
 5840     //   s1 <- s1 + b16
 5841     //   s2 <- s2 + s1
 5842     // Putting the above assignments together, we have:
 5843     //   s1_new = s1 + b1 + b2 + ... + b16
 5844     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 5845     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 5846     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
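          // A scalar sketch of this identity (illustrative only):
          //   uint32_t ds1 = 0, ds2 = 0;
          //   for (int i = 0; i < 16; i++) { ds1 += b[i]; ds2 += (16 - i) * b[i]; }
          //   s2 += s1 * 16 + ds2;
          //   s1 += ds1;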
 5847     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 5848 
 5849     // s2 = s2 + s1 * 16
 5850     __ add(s2, s2, s1, Assembler::LSL, 4);
 5851 
 5852     // vs1acc = b1 + b2 + b3 + ... + b16
 5853     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 5854     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 5855     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 5856     __ uaddlv(vs1acc, __ T16B, vbytes);
 5857     __ uaddlv(vs2acc, __ T8H, vs2acc);
 5858 
 5859     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 5860     __ fmovd(temp0, vs1acc);
 5861     __ fmovd(temp1, vs2acc);
 5862     __ add(s1, s1, temp0);
 5863     __ add(s2, s2, temp1);
 5864   }
 5865 
 5866   /**
 5867    *  Arguments:
 5868    *
 5869    *  Input:
 5870    *    c_rarg0   - x address
 5871    *    c_rarg1   - x length
 5872    *    c_rarg2   - y address
 5873    *    c_rarg3   - y length
 5874    *    c_rarg4   - z address
 5875    */
 5876   address generate_multiplyToLen() {
 5877     __ align(CodeEntryAlignment);
 5878     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
 5879     StubCodeMark mark(this, stub_id);
 5880 
 5881     address start = __ pc();
 5882     const Register x     = r0;
 5883     const Register xlen  = r1;
 5884     const Register y     = r2;
 5885     const Register ylen  = r3;
 5886     const Register z     = r4;
 5887 
 5888     const Register tmp0  = r5;
 5889     const Register tmp1  = r10;
 5890     const Register tmp2  = r11;
 5891     const Register tmp3  = r12;
 5892     const Register tmp4  = r13;
 5893     const Register tmp5  = r14;
 5894     const Register tmp6  = r15;
 5895     const Register tmp7  = r16;
 5896 
 5897     BLOCK_COMMENT("Entry:");
 5898     __ enter(); // required for proper stackwalking of RuntimeStub frame
 5899     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 5900     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5901     __ ret(lr);
 5902 
 5903     return start;
 5904   }
 5905 
 5906   address generate_squareToLen() {
 5907     // The squareToLen algorithm for sizes 1..127 described in the Java code
 5908     // works faster than multiply_to_len on some CPUs and slower on others, but
 5909     // multiply_to_len shows slightly better results overall
 5910     __ align(CodeEntryAlignment);
 5911     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
 5912     StubCodeMark mark(this, stub_id);
 5913     address start = __ pc();
 5914 
 5915     const Register x     = r0;
 5916     const Register xlen  = r1;
 5917     const Register z     = r2;
 5918     const Register y     = r4; // == x
 5919     const Register ylen  = r5; // == xlen
 5920 
 5921     const Register tmp0  = r3;
 5922     const Register tmp1  = r10;
 5923     const Register tmp2  = r11;
 5924     const Register tmp3  = r12;
 5925     const Register tmp4  = r13;
 5926     const Register tmp5  = r14;
 5927     const Register tmp6  = r15;
 5928     const Register tmp7  = r16;
 5929 
 5930     RegSet spilled_regs = RegSet::of(y, ylen);
 5931     BLOCK_COMMENT("Entry:");
 5932     __ enter();
 5933     __ push(spilled_regs, sp);
 5934     __ mov(y, x);
 5935     __ mov(ylen, xlen);
 5936     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 5937     __ pop(spilled_regs, sp);
 5938     __ leave();
 5939     __ ret(lr);
 5940     return start;
 5941   }
 5942 
 5943   address generate_mulAdd() {
 5944     __ align(CodeEntryAlignment);
 5945     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
 5946     StubCodeMark mark(this, stub_id);
 5947 
 5948     address start = __ pc();
 5949 
 5950     const Register out     = r0;
 5951     const Register in      = r1;
 5952     const Register offset  = r2;
 5953     const Register len     = r3;
 5954     const Register k       = r4;
 5955 
 5956     BLOCK_COMMENT("Entry:");
 5957     __ enter();
 5958     __ mul_add(out, in, offset, len, k);
 5959     __ leave();
 5960     __ ret(lr);
 5961 
 5962     return start;
 5963   }
 5964 
 5965   // Arguments:
 5966   //
 5967   // Input:
 5968   //   c_rarg0   - newArr address
 5969   //   c_rarg1   - oldArr address
 5970   //   c_rarg2   - newIdx
 5971   //   c_rarg3   - shiftCount
 5972   //   c_rarg4   - numIter
 5973   //
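        // For each i in [0, numIter) the stub computes, in effect (a scalar
        // sketch; names as in the Java caller, illustrative only):
        //   newArr[newIdx + i] = (oldArr[i] << (32 - shiftCount))
        //                      | (oldArr[i + 1] >>> shiftCount);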
 5974   address generate_bigIntegerRightShift() {
 5975     __ align(CodeEntryAlignment);
 5976     StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
 5977     StubCodeMark mark(this, stub_id);
 5978     address start = __ pc();
 5979 
 5980     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 5981 
 5982     Register newArr        = c_rarg0;
 5983     Register oldArr        = c_rarg1;
 5984     Register newIdx        = c_rarg2;
 5985     Register shiftCount    = c_rarg3;
 5986     Register numIter       = c_rarg4;
 5987     Register idx           = numIter;
 5988 
 5989     Register newArrCur     = rscratch1;
 5990     Register shiftRevCount = rscratch2;
 5991     Register oldArrCur     = r13;
 5992     Register oldArrNext    = r14;
 5993 
 5994     FloatRegister oldElem0        = v0;
 5995     FloatRegister oldElem1        = v1;
 5996     FloatRegister newElem         = v2;
 5997     FloatRegister shiftVCount     = v3;
 5998     FloatRegister shiftVRevCount  = v4;
 5999 
 6000     __ cbz(idx, Exit);
 6001 
 6002     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 6003 
 6004     // left shift count
 6005     __ movw(shiftRevCount, 32);
 6006     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 6007 
 6008     // numIter too small to allow a 4-word SIMD loop; fall back to scalar code
 6009     __ cmp(numIter, (u1)4);
 6010     __ br(Assembler::LT, ShiftThree);
 6011 
 6012     __ dup(shiftVCount,    __ T4S, shiftCount);
 6013     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 6014     __ negr(shiftVCount,   __ T4S, shiftVCount);
 6015 
 6016     __ BIND(ShiftSIMDLoop);
 6017 
 6018     // Calculate the load addresses
 6019     __ sub(idx, idx, 4);
 6020     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 6021     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 6022     __ add(oldArrCur,  oldArrNext, 4);
 6023 
 6024     // Load 4 words and process
 6025     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 6026     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 6027     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 6028     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 6029     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 6030     __ st1(newElem,   __ T4S,  Address(newArrCur));
 6031 
 6032     __ cmp(idx, (u1)4);
 6033     __ br(Assembler::LT, ShiftTwoLoop);
 6034     __ b(ShiftSIMDLoop);
 6035 
 6036     __ BIND(ShiftTwoLoop);
 6037     __ cbz(idx, Exit);
 6038     __ cmp(idx, (u1)1);
 6039     __ br(Assembler::EQ, ShiftOne);
 6040 
 6041     // Calculate the load addresses
 6042     __ sub(idx, idx, 2);
 6043     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 6044     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 6045     __ add(oldArrCur,  oldArrNext, 4);
 6046 
 6047     // Load 2 words and process
 6048     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 6049     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 6050     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 6051     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 6052     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 6053     __ st1(newElem,   __ T2S, Address(newArrCur));
 6054     __ b(ShiftTwoLoop);
 6055 
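          // Scalar tail for the remaining 1-3 words; the tbz tests dispatch on
          // the low two bits of idx, and each case falls through to the next.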
 6056     __ BIND(ShiftThree);
 6057     __ tbz(idx, 1, ShiftOne);
 6058     __ tbz(idx, 0, ShiftTwo);
 6059     __ ldrw(r10,  Address(oldArr, 12));
 6060     __ ldrw(r11,  Address(oldArr, 8));
 6061     __ lsrvw(r10, r10, shiftCount);
 6062     __ lslvw(r11, r11, shiftRevCount);
 6063     __ orrw(r12,  r10, r11);
 6064     __ strw(r12,  Address(newArr, 8));
 6065 
 6066     __ BIND(ShiftTwo);
 6067     __ ldrw(r10,  Address(oldArr, 8));
 6068     __ ldrw(r11,  Address(oldArr, 4));
 6069     __ lsrvw(r10, r10, shiftCount);
 6070     __ lslvw(r11, r11, shiftRevCount);
 6071     __ orrw(r12,  r10, r11);
 6072     __ strw(r12,  Address(newArr, 4));
 6073 
 6074     __ BIND(ShiftOne);
 6075     __ ldrw(r10,  Address(oldArr, 4));
 6076     __ ldrw(r11,  Address(oldArr));
 6077     __ lsrvw(r10, r10, shiftCount);
 6078     __ lslvw(r11, r11, shiftRevCount);
 6079     __ orrw(r12,  r10, r11);
 6080     __ strw(r12,  Address(newArr));
 6081 
 6082     __ BIND(Exit);
 6083     __ ret(lr);
 6084 
 6085     return start;
 6086   }
 6087 
 6088   // Arguments:
 6089   //
 6090   // Input:
 6091   //   c_rarg0   - newArr address
 6092   //   c_rarg1   - oldArr address
 6093   //   c_rarg2   - newIdx
 6094   //   c_rarg3   - shiftCount
 6095   //   c_rarg4   - numIter
 6096   //
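        // Mirror image of the right-shift stub above; a scalar sketch
        // (names as in the Java caller, illustrative only):
        //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
        //                      | (oldArr[i + 1] >>> (32 - shiftCount));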
 6097   address generate_bigIntegerLeftShift() {
 6098     __ align(CodeEntryAlignment);
 6099     StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
 6100     StubCodeMark mark(this, stub_id);
 6101     address start = __ pc();
 6102 
 6103     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 6104 
 6105     Register newArr        = c_rarg0;
 6106     Register oldArr        = c_rarg1;
 6107     Register newIdx        = c_rarg2;
 6108     Register shiftCount    = c_rarg3;
 6109     Register numIter       = c_rarg4;
 6110 
 6111     Register shiftRevCount = rscratch1;
 6112     Register oldArrNext    = rscratch2;
 6113 
 6114     FloatRegister oldElem0        = v0;
 6115     FloatRegister oldElem1        = v1;
 6116     FloatRegister newElem         = v2;
 6117     FloatRegister shiftVCount     = v3;
 6118     FloatRegister shiftVRevCount  = v4;
 6119 
 6120     __ cbz(numIter, Exit);
 6121 
 6122     __ add(oldArrNext, oldArr, 4);
 6123     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 6124 
 6125     // right shift count
 6126     __ movw(shiftRevCount, 32);
 6127     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 6128 
 6129     // numIter too small to allow a 4-word SIMD loop; fall back to scalar code
 6130     __ cmp(numIter, (u1)4);
 6131     __ br(Assembler::LT, ShiftThree);
 6132 
 6133     __ dup(shiftVCount,     __ T4S, shiftCount);
 6134     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 6135     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 6136 
 6137     __ BIND(ShiftSIMDLoop);
 6138 
 6139     // load 4 words and process
 6140     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 6141     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 6142     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 6143     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 6144     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 6145     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 6146     __ sub(numIter,   numIter, 4);
 6147 
 6148     __ cmp(numIter, (u1)4);
 6149     __ br(Assembler::LT, ShiftTwoLoop);
 6150     __ b(ShiftSIMDLoop);
 6151 
 6152     __ BIND(ShiftTwoLoop);
 6153     __ cbz(numIter, Exit);
 6154     __ cmp(numIter, (u1)1);
 6155     __ br(Assembler::EQ, ShiftOne);
 6156 
 6157     // load 2 words and process
 6158     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 6159     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 6160     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 6161     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 6162     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 6163     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 6164     __ sub(numIter,   numIter, 2);
 6165     __ b(ShiftTwoLoop);
 6166 
 6167     __ BIND(ShiftThree);
 6168     __ ldrw(r10,  __ post(oldArr, 4));
 6169     __ ldrw(r11,  __ post(oldArrNext, 4));
 6170     __ lslvw(r10, r10, shiftCount);
 6171     __ lsrvw(r11, r11, shiftRevCount);
 6172     __ orrw(r12,  r10, r11);
 6173     __ strw(r12,  __ post(newArr, 4));
 6174     __ tbz(numIter, 1, Exit);
 6175     __ tbz(numIter, 0, ShiftOne);
 6176 
 6177     __ BIND(ShiftTwo);
 6178     __ ldrw(r10,  __ post(oldArr, 4));
 6179     __ ldrw(r11,  __ post(oldArrNext, 4));
 6180     __ lslvw(r10, r10, shiftCount);
 6181     __ lsrvw(r11, r11, shiftRevCount);
 6182     __ orrw(r12,  r10, r11);
 6183     __ strw(r12,  __ post(newArr, 4));
 6184 
 6185     __ BIND(ShiftOne);
 6186     __ ldrw(r10,  Address(oldArr));
 6187     __ ldrw(r11,  Address(oldArrNext));
 6188     __ lslvw(r10, r10, shiftCount);
 6189     __ lsrvw(r11, r11, shiftRevCount);
 6190     __ orrw(r12,  r10, r11);
 6191     __ strw(r12,  Address(newArr));
 6192 
 6193     __ BIND(Exit);
 6194     __ ret(lr);
 6195 
 6196     return start;
 6197   }
 6198 
 6199   address generate_count_positives(address &count_positives_long) {
 6200     const u1 large_loop_size = 64;
 6201     const uint64_t UPPER_BIT_MASK = 0x8080808080808080; // sign bit of each byte
 6202     int dcache_line = VM_Version::dcache_line_size();
 6203 
 6204     Register ary1 = r1, len = r2, result = r0;
 6205 
 6206     __ align(CodeEntryAlignment);
 6207 
 6208     StubGenStubId stub_id = StubGenStubId::count_positives_id;
 6209     StubCodeMark mark(this, stub_id);
 6210 
 6211     address entry = __ pc();
 6212 
 6213     __ enter();
 6214     // precondition: a copy of len is already in result
 6215     // __ mov(result, len);
 6216 
 6217   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 6218         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 6219 
 6220   __ cmp(len, (u1)15);
 6221   __ br(Assembler::GT, LEN_OVER_15);
 6222   // The only case when execution falls into this code is when the pointer is
 6223   // near the end of a memory page and we have to avoid reading the next page
 6224   __ add(ary1, ary1, len);
 6225   __ subs(len, len, 8);
 6226   __ br(Assembler::GT, LEN_OVER_8);
 6227   __ ldr(rscratch2, Address(ary1, -8));
 6228   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 6229   __ lsrv(rscratch2, rscratch2, rscratch1);
 6230   __ tst(rscratch2, UPPER_BIT_MASK);
 6231   __ csel(result, zr, result, Assembler::NE);
 6232   __ leave();
 6233   __ ret(lr);
 6234   __ bind(LEN_OVER_8);
 6235   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 6236   __ sub(len, len, 8); // no data dependency, so the sub can execute during the load
 6237   __ tst(rscratch2, UPPER_BIT_MASK);
 6238   __ br(Assembler::NE, RET_NO_POP);
 6239   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 6240   __ lsrv(rscratch1, rscratch1, rscratch2);
 6241   __ tst(rscratch1, UPPER_BIT_MASK);
 6242   __ bind(RET_NO_POP);
 6243   __ csel(result, zr, result, Assembler::NE);
 6244   __ leave();
 6245   __ ret(lr);
 6246 
 6247   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 6248   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 6249 
 6250   count_positives_long = __ pc(); // 2nd entry point
 6251 
 6252   __ enter();
 6253 
 6254   __ bind(LEN_OVER_15);
 6255     __ push(spilled_regs, sp);
 6256     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 6257     __ cbz(rscratch2, ALIGNED);
 6258     __ ldp(tmp6, tmp1, Address(ary1));
 6259     __ mov(tmp5, 16);
 6260     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 6261     __ add(ary1, ary1, rscratch1);
 6262     __ orr(tmp6, tmp6, tmp1);
 6263     __ tst(tmp6, UPPER_BIT_MASK);
 6264     __ br(Assembler::NE, RET_ADJUST);
 6265     __ sub(len, len, rscratch1);
 6266 
 6267   __ bind(ALIGNED);
 6268     __ cmp(len, large_loop_size);
 6269     __ br(Assembler::LT, CHECK_16);
 6270     // Perform a 16-byte load as an early-return check in the pre-loop, to
 6271     // handle the situation when an initially aligned large array has negative
 6272     // values in its starting bytes, where LARGE_LOOP would otherwise do 4 reads
 6273     // instead of 1 (in the worst case), which is slower. Cases with negative
 6274     // bytes further ahead won't be affected that much; in fact they'll be faster
 6275     // due to the early loads, fewer instructions and fewer branches in LARGE_LOOP.
 6276     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 6277     __ sub(len, len, 16);
 6278     __ orr(tmp6, tmp6, tmp1);
 6279     __ tst(tmp6, UPPER_BIT_MASK);
 6280     __ br(Assembler::NE, RET_ADJUST_16);
 6281     __ cmp(len, large_loop_size);
 6282     __ br(Assembler::LT, CHECK_16);
 6283 
 6284     if (SoftwarePrefetchHintDistance >= 0
 6285         && SoftwarePrefetchHintDistance >= dcache_line) {
 6286       // initial prefetch
 6287       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 6288     }
 6289   __ bind(LARGE_LOOP);
 6290     if (SoftwarePrefetchHintDistance >= 0) {
 6291       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 6292     }
 6293     // Issue the load instructions first, since that can save a few CPU/MEM
 6294     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);"
 6295     // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) + 1
 6296     // cbnz(...), which saves 3 instructions per iteration and has fewer branches,
 6297     // but disables early return: all 64 bytes are loaded and checked every time.
 6298     __ ldp(tmp2, tmp3, Address(ary1));
 6299     __ ldp(tmp4, tmp5, Address(ary1, 16));
 6300     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 6301     __ ldp(tmp6, tmp1, Address(ary1, 48));
 6302     __ add(ary1, ary1, large_loop_size);
 6303     __ sub(len, len, large_loop_size);
 6304     __ orr(tmp2, tmp2, tmp3);
 6305     __ orr(tmp4, tmp4, tmp5);
 6306     __ orr(rscratch1, rscratch1, rscratch2);
 6307     __ orr(tmp6, tmp6, tmp1);
 6308     __ orr(tmp2, tmp2, tmp4);
 6309     __ orr(rscratch1, rscratch1, tmp6);
 6310     __ orr(tmp2, tmp2, rscratch1);
 6311     __ tst(tmp2, UPPER_BIT_MASK);
 6312     __ br(Assembler::NE, RET_ADJUST_LONG);
 6313     __ cmp(len, large_loop_size);
 6314     __ br(Assembler::GE, LARGE_LOOP);
 6315 
 6316   __ bind(CHECK_16); // small 16-byte load pre-loop
 6317     __ cmp(len, (u1)16);
 6318     __ br(Assembler::LT, POST_LOOP16);
 6319 
 6320   __ bind(LOOP16); // small 16-byte load loop
 6321     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 6322     __ sub(len, len, 16);
 6323     __ orr(tmp2, tmp2, tmp3);
 6324     __ tst(tmp2, UPPER_BIT_MASK);
 6325     __ br(Assembler::NE, RET_ADJUST_16);
 6326     __ cmp(len, (u1)16);
 6327     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 6328 
 6329   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 6330     __ cmp(len, (u1)8);
 6331     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 6332     __ ldr(tmp3, Address(__ post(ary1, 8)));
 6333     __ tst(tmp3, UPPER_BIT_MASK);
 6334     __ br(Assembler::NE, RET_ADJUST);
 6335     __ sub(len, len, 8);
 6336 
 6337   __ bind(POST_LOOP16_LOAD_TAIL);
 6338     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 6339     __ ldr(tmp1, Address(ary1));
 6340     __ mov(tmp2, 64);
 6341     __ sub(tmp4, tmp2, len, __ LSL, 3);
 6342     __ lslv(tmp1, tmp1, tmp4);
 6343     __ tst(tmp1, UPPER_BIT_MASK);
 6344     __ br(Assembler::NE, RET_ADJUST);
 6345     // Fallthrough
 6346 
 6347   __ bind(RET_LEN);
 6348     __ pop(spilled_regs, sp);
 6349     __ leave();
 6350     __ ret(lr);
 6351 
 6352     // The difference result - len is the count of bytes guaranteed to be
 6353     // positive
 6354 
 6355   __ bind(RET_ADJUST_LONG);
 6356     __ add(len, len, (u1)(large_loop_size - 16));
 6357   __ bind(RET_ADJUST_16);
 6358     __ add(len, len, 16);
 6359   __ bind(RET_ADJUST);
 6360     __ pop(spilled_regs, sp);
 6361     __ leave();
 6362     __ sub(result, result, len);
 6363     __ ret(lr);
 6364 
 6365     return entry;
 6366   }
 6367 
 6368   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 6369         bool usePrefetch, Label &NOT_EQUAL) {
 6370     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6371         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 6372         tmp7 = r12, tmp8 = r13;
 6373     Label LOOP;
 6374 
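          // The loop is software-pipelined: each ldp below loads data for a later
          // comparison while the eor/orr/cbnz chains check previously loaded values.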
 6375     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6376     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6377     __ bind(LOOP);
 6378     if (usePrefetch) {
 6379       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 6380       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 6381     }
 6382     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 6383     __ eor(tmp1, tmp1, tmp2);
 6384     __ eor(tmp3, tmp3, tmp4);
 6385     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 6386     __ orr(tmp1, tmp1, tmp3);
 6387     __ cbnz(tmp1, NOT_EQUAL);
 6388     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6389     __ eor(tmp5, tmp5, tmp6);
 6390     __ eor(tmp7, tmp7, tmp8);
 6391     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6392     __ orr(tmp5, tmp5, tmp7);
 6393     __ cbnz(tmp5, NOT_EQUAL);
 6394     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 6395     __ eor(tmp1, tmp1, tmp2);
 6396     __ eor(tmp3, tmp3, tmp4);
 6397     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 6398     __ orr(tmp1, tmp1, tmp3);
 6399     __ cbnz(tmp1, NOT_EQUAL);
 6400     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6401     __ eor(tmp5, tmp5, tmp6);
 6402     __ sub(cnt1, cnt1, 8 * wordSize);
 6403     __ eor(tmp7, tmp7, tmp8);
 6404     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6405     // tmp6 is not used. MacroAssembler::subs is used here (rather than
 6406     // cmp) because subs allows an unlimited range of immediate operands.
 6407     __ subs(tmp6, cnt1, loopThreshold);
 6408     __ orr(tmp5, tmp5, tmp7);
 6409     __ cbnz(tmp5, NOT_EQUAL);
 6410     __ br(__ GE, LOOP);
 6411     // post-loop
 6412     __ eor(tmp1, tmp1, tmp2);
 6413     __ eor(tmp3, tmp3, tmp4);
 6414     __ orr(tmp1, tmp1, tmp3);
 6415     __ sub(cnt1, cnt1, 2 * wordSize);
 6416     __ cbnz(tmp1, NOT_EQUAL);
 6417   }
 6418 
 6419   void generate_large_array_equals_loop_simd(int loopThreshold,
 6420         bool usePrefetch, Label &NOT_EQUAL) {
 6421     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6422         tmp2 = rscratch2;
 6423     Label LOOP;
 6424 
 6425     __ bind(LOOP);
 6426     if (usePrefetch) {
 6427       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 6428       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 6429     }
 6430     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 6431     __ sub(cnt1, cnt1, 8 * wordSize);
 6432     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 6433     __ subs(tmp1, cnt1, loopThreshold);
 6434     __ eor(v0, __ T16B, v0, v4);
 6435     __ eor(v1, __ T16B, v1, v5);
 6436     __ eor(v2, __ T16B, v2, v6);
 6437     __ eor(v3, __ T16B, v3, v7);
 6438     __ orr(v0, __ T16B, v0, v1);
 6439     __ orr(v1, __ T16B, v2, v3);
 6440     __ orr(v0, __ T16B, v0, v1);
 6441     __ umov(tmp1, v0, __ D, 0);
 6442     __ umov(tmp2, v0, __ D, 1);
 6443     __ orr(tmp1, tmp1, tmp2);
 6444     __ cbnz(tmp1, NOT_EQUAL);
 6445     __ br(__ GE, LOOP);
 6446   }
 6447 
 6448   // a1 = r1 - array1 address
 6449   // a2 = r2 - array2 address
 6450   // result = r0 - return value. Already contains "false"
 6451   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 6452   // r3-r5 are reserved temporary registers
 6453   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
 6454   address generate_large_array_equals() {
 6455     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6456         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 6457         tmp7 = r12, tmp8 = r13;
 6458     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 6459         SMALL_LOOP, POST_LOOP;
 6460     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 6461     // use the prefetch loop only if at least 32 of the prefetched bytes will be used
 6462     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 6463     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 6464     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 6465     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 6466         tmp5, tmp6, tmp7, tmp8);
 6467 
 6468     __ align(CodeEntryAlignment);
 6469 
 6470     StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
 6471     StubCodeMark mark(this, stub_id);
 6472 
 6473     address entry = __ pc();
 6474     __ enter();
 6475     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 6476     // also advance pointers to use post-increment instead of pre-increment
 6477     __ add(a1, a1, wordSize);
 6478     __ add(a2, a2, wordSize);
 6479     if (AvoidUnalignedAccesses) {
 6480       // Both implementations (SIMD/non-SIMD) use relatively large load
 6481       // instructions (ld1/ldp), which carry a huge penalty (up to 2x exec time)
 6482       // on some CPUs when the address is not at least 16-byte aligned.
 6483       // Arrays are currently 8-byte aligned, so, if needed, make an additional
 6484       // 8-byte load at least for the 1st address, to make it 16-byte aligned.
 6485       Label ALIGNED16;
 6486       __ tbz(a1, 3, ALIGNED16);
 6487       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 6488       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 6489       __ sub(cnt1, cnt1, wordSize);
 6490       __ eor(tmp1, tmp1, tmp2);
 6491       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 6492       __ bind(ALIGNED16);
 6493     }
 6494     if (UseSIMDForArrayEquals) {
 6495       if (SoftwarePrefetchHintDistance >= 0) {
 6496         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 6497         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 6498         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 6499             /* prfm = */ true, NOT_EQUAL);
 6500         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 6501         __ br(__ LT, TAIL);
 6502       }
 6503       __ bind(NO_PREFETCH_LARGE_LOOP);
 6504       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 6505           /* prfm = */ false, NOT_EQUAL);
 6506     } else {
 6507       __ push(spilled_regs, sp);
 6508       if (SoftwarePrefetchHintDistance >= 0) {
 6509         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 6510         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 6511         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 6512             /* prfm = */ true, NOT_EQUAL);
 6513         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 6514         __ br(__ LT, TAIL);
 6515       }
 6516       __ bind(NO_PREFETCH_LARGE_LOOP);
 6517       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 6518           /* prfm = */ false, NOT_EQUAL);
 6519     }
 6520     __ bind(TAIL);
 6521       __ cbz(cnt1, EQUAL);
 6522       __ subs(cnt1, cnt1, wordSize);
 6523       __ br(__ LE, POST_LOOP);
 6524     __ bind(SMALL_LOOP);
 6525       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 6526       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 6527       __ subs(cnt1, cnt1, wordSize);
 6528       __ eor(tmp1, tmp1, tmp2);
 6529       __ cbnz(tmp1, NOT_EQUAL);
 6530       __ br(__ GT, SMALL_LOOP);
 6531     __ bind(POST_LOOP);
 6532       __ ldr(tmp1, Address(a1, cnt1));
 6533       __ ldr(tmp2, Address(a2, cnt1));
 6534       __ eor(tmp1, tmp1, tmp2);
 6535       __ cbnz(tmp1, NOT_EQUAL);
 6536     __ bind(EQUAL);
 6537       __ mov(result, true);
 6538     __ bind(NOT_EQUAL);
 6539       if (!UseSIMDForArrayEquals) {
 6540         __ pop(spilled_regs, sp);
 6541       }
 6542     __ bind(NOT_EQUAL_NO_POP);
 6543     __ leave();
 6544     __ ret(lr);
 6545     return entry;
 6546   }
 6547 
 6548   // result = r0 - return value. Contains initial hashcode value on entry.
 6549   // ary = r1 - array address
 6550   // cnt = r2 - elements count
 6551   // Clobbers: v0-v13, rscratch1, rscratch2
 6552   address generate_large_arrays_hashcode(BasicType eltype) {
 6553     const Register result = r0, ary = r1, cnt = r2;
 6554     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 6555     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 6556     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 6557     const FloatRegister vpowm = v13;
 6558 
 6559     ARRAYS_HASHCODE_REGISTERS;
 6560 
 6561     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 6562 
 6563     unsigned int vf; // vectorization factor
 6564     bool multiply_by_halves;
 6565     Assembler::SIMD_Arrangement load_arrangement;
 6566     switch (eltype) {
 6567     case T_BOOLEAN:
 6568     case T_BYTE:
 6569       load_arrangement = Assembler::T8B;
 6570       multiply_by_halves = true;
 6571       vf = 8;
 6572       break;
 6573     case T_CHAR:
 6574     case T_SHORT:
 6575       load_arrangement = Assembler::T8H;
 6576       multiply_by_halves = true;
 6577       vf = 8;
 6578       break;
 6579     case T_INT:
 6580       load_arrangement = Assembler::T4S;
 6581       multiply_by_halves = false;
 6582       vf = 4;
 6583       break;
 6584     default:
 6585       ShouldNotReachHere();
 6586     }
 6587 
 6588     // Unroll factor
 6589     const unsigned uf = 4;
 6590 
 6591     // Effective vectorization factor
 6592     const unsigned evf = vf * uf;
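
          // Scalar loop being vectorized (result holds the incoming hashcode):
          //   for (int i = 0; i < cnt; i++)
          //     result = 31 * result + ary[i];
          // Four lane accumulators (vmul0..vmul3) are carried through the LARGE
          // LOOP and recombined with the powers of 31 in vpow at the end.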
 6593 
 6594     __ align(CodeEntryAlignment);
 6595 
 6596     StubGenStubId stub_id;
 6597     switch (eltype) {
 6598     case T_BOOLEAN:
 6599       stub_id = StubGenStubId::large_arrays_hashcode_boolean_id;
 6600       break;
 6601     case T_BYTE:
 6602       stub_id = StubGenStubId::large_arrays_hashcode_byte_id;
 6603       break;
 6604     case T_CHAR:
 6605       stub_id = StubGenStubId::large_arrays_hashcode_char_id;
 6606       break;
 6607     case T_SHORT:
 6608       stub_id = StubGenStubId::large_arrays_hashcode_short_id;
 6609       break;
 6610     case T_INT:
 6611       stub_id = StubGenStubId::large_arrays_hashcode_int_id;
 6612       break;
 6613     default:
 6614       stub_id = StubGenStubId::NO_STUBID;
 6615       ShouldNotReachHere();
 6616     };
 6617 
 6618     StubCodeMark mark(this, stub_id);
 6619 
 6620     address entry = __ pc();
 6621     __ enter();
 6622 
 6623     // Put the 0th..3rd powers of 31 together into a single SIMD register. The register will be
 6624     // used in the SMALL and LARGE LOOPs' epilogues. The initialization is hoisted here; the
 6625     // register's value shouldn't change throughout both loops.
 6626     __ movw(rscratch1, intpow(31U, 3));
 6627     __ mov(vpow, Assembler::S, 0, rscratch1);
 6628     __ movw(rscratch1, intpow(31U, 2));
 6629     __ mov(vpow, Assembler::S, 1, rscratch1);
 6630     __ movw(rscratch1, intpow(31U, 1));
 6631     __ mov(vpow, Assembler::S, 2, rscratch1);
 6632     __ movw(rscratch1, intpow(31U, 0));
 6633     __ mov(vpow, Assembler::S, 3, rscratch1);
 6634 
 6635     __ mov(vmul0, Assembler::T16B, 0);
 6636     __ mov(vmul0, Assembler::S, 3, result);
 6637 
 6638     __ andr(rscratch2, cnt, (uf - 1) * vf);
 6639     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 6640 
 6641     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 6642     __ mov(vpowm, Assembler::S, 0, rscratch1);
 6643 
 6644     // SMALL LOOP
 6645     __ bind(SMALL_LOOP);
 6646 
 6647     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 6648     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6649     __ subsw(rscratch2, rscratch2, vf);
 6650 
 6651     if (load_arrangement == Assembler::T8B) {
 6652       // Extend 8B to 8H to be able to use vector multiply
 6653       // instructions
 6654       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 6655       if (is_signed_subword_type(eltype)) {
 6656         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6657       } else {
 6658         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6659       }
 6660     }
 6661 
 6662     switch (load_arrangement) {
 6663     case Assembler::T4S:
 6664       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 6665       break;
 6666     case Assembler::T8B:
 6667     case Assembler::T8H:
 6668       assert(is_subword_type(eltype), "subword type expected");
 6669       if (is_signed_subword_type(eltype)) {
 6670         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6671       } else {
 6672         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6673       }
 6674       break;
 6675     default:
 6676       __ should_not_reach_here();
 6677     }
 6678 
 6679     // Process the upper half of a vector
 6680     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 6681       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6682       if (is_signed_subword_type(eltype)) {
 6683         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6684       } else {
 6685         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6686       }
 6687     }
 6688 
 6689     __ br(Assembler::HI, SMALL_LOOP);
 6690 
 6691     // SMALL LOOP'S EPILOGUE
 6692     __ lsr(rscratch2, cnt, exact_log2(evf));
 6693     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 6694 
 6695     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 6696     __ addv(vmul0, Assembler::T4S, vmul0);
 6697     __ umov(result, vmul0, Assembler::S, 0);
 6698 
 6699     // TAIL
 6700     __ bind(TAIL);
 6701 
 6702     // The andr computes cnt % vf. The subtract, shifted by 3, offsets the jump past
 6703     // vf - 1 - (cnt % vf) load + madd pairs (8 bytes each), so only cnt % vf pairs execute.
 6704     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 6705     __ andr(rscratch2, cnt, vf - 1);
 6706     __ bind(TAIL_SHORTCUT);
 6707     __ adr(rscratch1, BR_BASE);
 6708     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
 6709     __ movw(rscratch2, 0x1f); // 31, the hash multiplier for the madds below
 6710     __ br(rscratch1);
 6711 
 6712     for (size_t i = 0; i < vf - 1; ++i) {
 6713       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 6714                                    eltype);
 6715       __ maddw(result, result, rscratch2, rscratch1);
 6716     }
 6717     __ bind(BR_BASE);
 6718 
 6719     __ leave();
 6720     __ ret(lr);
 6721 
 6722     // LARGE LOOP
 6723     __ bind(LARGE_LOOP_PREHEADER);
 6724 
 6725     __ lsr(rscratch2, cnt, exact_log2(evf));
 6726 
 6727     if (multiply_by_halves) {
 6728       // 31^4 - multiplier between lower and upper parts of a register
 6729       __ movw(rscratch1, intpow(31U, vf / 2));
 6730       __ mov(vpowm, Assembler::S, 1, rscratch1);
 6731       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 6732       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 6733       __ mov(vpowm, Assembler::S, 0, rscratch1);
 6734     } else {
 6735       // 31^16
 6736       __ movw(rscratch1, intpow(31U, evf));
 6737       __ mov(vpowm, Assembler::S, 0, rscratch1);
 6738     }
 6739 
 6740     __ mov(vmul3, Assembler::T16B, 0);
 6741     __ mov(vmul2, Assembler::T16B, 0);
 6742     __ mov(vmul1, Assembler::T16B, 0);
 6743 
 6744     __ bind(LARGE_LOOP);
 6745 
 6746     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 6747     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 6748     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 6749     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6750 
 6751     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 6752            Address(__ post(ary, evf * type2aelembytes(eltype))));
 6753 
 6754     if (load_arrangement == Assembler::T8B) {
 6755       // Extend 8B to 8H to be able to use vector multiply
 6756       // instructions
 6757       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 6758       if (is_signed_subword_type(eltype)) {
 6759         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 6760         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 6761         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 6762         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6763       } else {
 6764         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 6765         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 6766         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 6767         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6768       }
 6769     }
 6770 
 6771     switch (load_arrangement) {
 6772     case Assembler::T4S:
 6773       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 6774       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 6775       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 6776       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 6777       break;
 6778     case Assembler::T8B:
 6779     case Assembler::T8H:
 6780       assert(is_subword_type(eltype), "subword type expected");
 6781       if (is_signed_subword_type(eltype)) {
 6782         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 6783         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 6784         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 6785         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6786       } else {
 6787         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 6788         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 6789         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 6790         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6791       }
 6792       break;
 6793     default:
 6794       __ should_not_reach_here();
 6795     }
 6796 
 6797     // Process the upper half of a vector
 6798     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 6799       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 6800       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 6801       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 6802       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 6803       if (is_signed_subword_type(eltype)) {
 6804         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 6805         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 6806         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 6807         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6808       } else {
 6809         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 6810         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 6811         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 6812         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6813       }
 6814     }
 6815 
 6816     __ subsw(rscratch2, rscratch2, 1);
 6817     __ br(Assembler::HI, LARGE_LOOP);
 6818 
 6819     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 6820     __ addv(vmul3, Assembler::T4S, vmul3);
 6821     __ umov(result, vmul3, Assembler::S, 0);
 6822 
 6823     __ mov(rscratch2, intpow(31U, vf));
 6824 
 6825     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 6826     __ addv(vmul2, Assembler::T4S, vmul2);
 6827     __ umov(rscratch1, vmul2, Assembler::S, 0);
 6828     __ maddw(result, result, rscratch2, rscratch1);
 6829 
 6830     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 6831     __ addv(vmul1, Assembler::T4S, vmul1);
 6832     __ umov(rscratch1, vmul1, Assembler::S, 0);
 6833     __ maddw(result, result, rscratch2, rscratch1);
 6834 
 6835     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 6836     __ addv(vmul0, Assembler::T4S, vmul0);
 6837     __ umov(rscratch1, vmul0, Assembler::S, 0);
 6838     __ maddw(result, result, rscratch2, rscratch1);
 6839 
 6840     __ andr(rscratch2, cnt, vf - 1);
 6841     __ cbnz(rscratch2, TAIL_SHORTCUT);
 6842 
 6843     __ leave();
 6844     __ ret(lr);
 6845 
 6846     return entry;
 6847   }
 6848 
 6849   address generate_dsin_dcos(bool isCos) {
 6850     __ align(CodeEntryAlignment);
 6851     StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id);
 6852     StubCodeMark mark(this, stub_id);
 6853     address start = __ pc();
 6854     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 6855         (address)StubRoutines::aarch64::_two_over_pi,
 6856         (address)StubRoutines::aarch64::_pio2,
 6857         (address)StubRoutines::aarch64::_dsin_coef,
 6858         (address)StubRoutines::aarch64::_dcos_coef);
 6859     return start;
 6860   }
 6861 
 6862   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
 6863   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 6864       Label &DIFF2) {
 6865     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 6866     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 6867 
 6868     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 6869     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 6870     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 6871     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 6872 
 6873     __ fmovd(tmpL, vtmp3);
 6874     __ eor(rscratch2, tmp3, tmpL);
 6875     __ cbnz(rscratch2, DIFF2);
 6876 
 6877     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6878     __ umov(tmpL, vtmp3, __ D, 1);
 6879     __ eor(rscratch2, tmpU, tmpL);
 6880     __ cbnz(rscratch2, DIFF1);
 6881 
 6882     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 6883     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 6884     __ fmovd(tmpL, vtmp);
 6885     __ eor(rscratch2, tmp3, tmpL);
 6886     __ cbnz(rscratch2, DIFF2);
 6887 
 6888     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6889     __ umov(tmpL, vtmp, __ D, 1);
 6890     __ eor(rscratch2, tmpU, tmpL);
 6891     __ cbnz(rscratch2, DIFF1);
 6892   }
 6893 
 6894   // r0  = result
 6895   // r1  = str1
 6896   // r2  = cnt1
 6897   // r3  = str2
 6898   // r4  = cnt2
 6899   // r10 = tmp1
 6900   // r11 = tmp2
 6901   address generate_compare_long_string_different_encoding(bool isLU) {
 6902     __ align(CodeEntryAlignment);
 6903     StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id);
 6904     StubCodeMark mark(this, stub_id);
 6905     address entry = __ pc();
 6906     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 6907         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 6908         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 6909     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 6910         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 6911     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 6912     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 6913 
 6914     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 6915 
 6916     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
 6917     // cnt2 == number of characters left to compare
 6918     // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
 6919     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 6920     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 6921     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 6922     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 6923     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
 6924     __ eor(rscratch2, tmp1, tmp2);
 6925     __ mov(rscratch1, tmp2);
 6926     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 6927     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 6928              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 6929     __ push(spilled_regs, sp);
 6930     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 6931     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 6932 
 6933     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6934 
 6935     if (SoftwarePrefetchHintDistance >= 0) {
 6936       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 6937       __ br(__ LT, NO_PREFETCH);
 6938       __ bind(LARGE_LOOP_PREFETCH);
 6939         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 6940         __ mov(tmp4, 2);
 6941         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 6942         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 6943           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6944           __ subs(tmp4, tmp4, 1);
 6945           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 6946           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 6947           __ mov(tmp4, 2);
 6948         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 6949           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6950           __ subs(tmp4, tmp4, 1);
 6951           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 6952           __ sub(cnt2, cnt2, 64);
 6953           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 6954           __ br(__ GE, LARGE_LOOP_PREFETCH);
 6955     }
    __ cbz(cnt2, LOAD_LAST); // no characters left except the last load
 6957     __ bind(NO_PREFETCH);
 6958     __ subs(cnt2, cnt2, 16);
 6959     __ br(__ LT, TAIL);
 6960     __ align(OptoLoopAlignment);
 6961     __ bind(SMALL_LOOP); // smaller loop
 6962       __ subs(cnt2, cnt2, 16);
 6963       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6964       __ br(__ GE, SMALL_LOOP);
 6965       __ cmn(cnt2, (u1)16);
 6966       __ br(__ EQ, LOAD_LAST);
 6967     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 6968       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 6969       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 6970       __ ldr(tmp3, Address(cnt1, -8));
 6971       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 6972       __ b(LOAD_LAST);
 6973     __ bind(DIFF2);
 6974       __ mov(tmpU, tmp3);
 6975     __ bind(DIFF1);
 6976       __ pop(spilled_regs, sp);
 6977       __ b(CALCULATE_DIFFERENCE);
 6978     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters have already been pre-loaded into tmp3
      // by compare_string_16_x_LU, so there is no need to load them again.
 6981       __ mov(tmpU, tmp3);
 6982       __ pop(spilled_regs, sp);
 6983 
      // tmp2 now points to the last 4 Latin1 characters
 6985       __ ldrs(vtmp, Address(tmp2));
 6986       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 6987       __ fmovd(tmpL, vtmp);
 6988 
 6989       __ eor(rscratch2, tmpU, tmpL);
 6990       __ cbz(rscratch2, DONE);
 6991 
 6992     // Find the first different characters in the longwords and
 6993     // compute their difference.
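    // On entry rscratch2 == tmp1 ^ rscratch1. Sketch of the computation:
    //   k = clz(rev(rscratch2)) & -16;  // bit offset of the first
    //                                   // differing UTF-16 char
    //   result = (uint16_t)(tmp1 >> k) - (uint16_t)(rscratch1 >> k);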
 6994     __ bind(CALCULATE_DIFFERENCE);
 6995       __ rev(rscratch2, rscratch2);
 6996       __ clz(rscratch2, rscratch2);
 6997       __ andr(rscratch2, rscratch2, -16);
 6998       __ lsrv(tmp1, tmp1, rscratch2);
 6999       __ uxthw(tmp1, tmp1);
 7000       __ lsrv(rscratch1, rscratch1, rscratch2);
 7001       __ uxthw(rscratch1, rscratch1);
 7002       __ subw(result, tmp1, rscratch1);
 7003     __ bind(DONE);
 7004       __ ret(lr);
 7005     return entry;
 7006   }
 7007 
 7008   // r0 = input (float16)
 7009   // v0 = result (float)
 7010   // v1 = temporary float register
 7011   address generate_float16ToFloat() {
 7012     __ align(CodeEntryAlignment);
 7013     StubGenStubId stub_id = StubGenStubId::hf2f_id;
 7014     StubCodeMark mark(this, stub_id);
 7015     address entry = __ pc();
 7016     BLOCK_COMMENT("Entry:");
 7017     __ flt16_to_flt(v0, r0, v1);
 7018     __ ret(lr);
 7019     return entry;
 7020   }
 7021 
 7022   // v0 = input (float)
 7023   // r0 = result (float16)
 7024   // v1 = temporary float register
 7025   address generate_floatToFloat16() {
 7026     __ align(CodeEntryAlignment);
 7027     StubGenStubId stub_id = StubGenStubId::f2hf_id;
 7028     StubCodeMark mark(this, stub_id);
 7029     address entry = __ pc();
 7030     BLOCK_COMMENT("Entry:");
 7031     __ flt_to_flt16(r0, v0, v1);
 7032     __ ret(lr);
 7033     return entry;
 7034   }
 7035 
 7036   address generate_method_entry_barrier() {
 7037     __ align(CodeEntryAlignment);
 7038     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
 7039     StubCodeMark mark(this, stub_id);
 7040 
 7041     Label deoptimize_label;
 7042 
 7043     address start = __ pc();
 7044 
 7045     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 7046 
 7047     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 7048       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
      // We can get here despite the nmethod being good, if we have not
      // yet applied our cross-modification fence (or data fence).
 7051       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 7052       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 7053       __ ldrw(rscratch2, rscratch2);
 7054       __ strw(rscratch2, thread_epoch_addr);
 7055       __ isb();
 7056       __ membar(__ LoadLoad);
 7057     }
 7058 
 7059     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 7060 
 7061     __ enter();
 7062     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 7063 
 7064     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 7065 
 7066     __ push_call_clobbered_registers();
 7067 
 7068     __ mov(c_rarg0, rscratch2);
 7069     __ call_VM_leaf
 7070          (CAST_FROM_FN_PTR
 7071           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 7072 
 7073     __ reset_last_Java_frame(true);
 7074 
 7075     __ mov(rscratch1, r0);
 7076 
 7077     __ pop_call_clobbered_registers();
 7078 
 7079     __ cbnz(rscratch1, deoptimize_label);
 7080 
 7081     __ leave();
 7082     __ ret(lr);
 7083 
 7084     __ BIND(deoptimize_label);
 7085 
 7086     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 7087     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 7088 
 7089     __ mov(sp, rscratch1);
 7090     __ br(rscratch2);
 7091 
 7092     return start;
 7093   }
 7094 
 7095   // r0  = result
 7096   // r1  = str1
 7097   // r2  = cnt1
 7098   // r3  = str2
 7099   // r4  = cnt2
 7100   // r10 = tmp1
 7101   // r11 = tmp2
 7102   address generate_compare_long_string_same_encoding(bool isLL) {
 7103     __ align(CodeEntryAlignment);
 7104     StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id);
 7105     StubCodeMark mark(this, stub_id);
 7106     address entry = __ pc();
 7107     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 7108         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 7109 
 7110     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 7111 
    // Exit the large loop when fewer than 64 bytes are left to read or when
    // the next prefetch would reach past the end of the array
 7114     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 7115 
    // 8 bytes from each string were pre-loaded before jumping to this stub,
    // so compare them directly
 7117     __ eor(rscratch2, tmp1, tmp2);
 7118     __ cbnz(rscratch2, CAL_DIFFERENCE);
 7119 
 7120     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
    // update the pointers past the bytes already read
 7122     __ add(str1, str1, wordSize);
 7123     __ add(str2, str2, wordSize);
 7124     if (SoftwarePrefetchHintDistance >= 0) {
 7125       __ align(OptoLoopAlignment);
 7126       __ bind(LARGE_LOOP_PREFETCH);
 7127         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 7128         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 7129 
 7130         for (int i = 0; i < 4; i++) {
 7131           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 7132           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 7133           __ cmp(tmp1, tmp2);
 7134           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7135           __ br(Assembler::NE, DIFF);
 7136         }
 7137         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 7138         __ add(str1, str1, 64);
 7139         __ add(str2, str2, 64);
 7140         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 7141         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 7142         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 7143     }
 7144 
 7145     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 7146     __ br(Assembler::LE, LESS16);
 7147     __ align(OptoLoopAlignment);
 7148     __ bind(LOOP_COMPARE16);
 7149       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 7150       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 7151       __ cmp(tmp1, tmp2);
 7152       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7153       __ br(Assembler::NE, DIFF);
 7154       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 7155       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 7156       __ br(Assembler::LT, LESS16);
 7157 
 7158       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 7159       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 7160       __ cmp(tmp1, tmp2);
 7161       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7162       __ br(Assembler::NE, DIFF);
 7163       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 7164       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 7165       __ br(Assembler::GE, LOOP_COMPARE16);
 7166       __ cbz(cnt2, LENGTH_DIFF);
 7167 
 7168     __ bind(LESS16);
      // compare 8 bytes at a time
 7170       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 7171       __ br(Assembler::LE, LESS8);
 7172       __ ldr(tmp1, Address(__ post(str1, 8)));
 7173       __ ldr(tmp2, Address(__ post(str2, 8)));
 7174       __ eor(rscratch2, tmp1, tmp2);
 7175       __ cbnz(rscratch2, CAL_DIFFERENCE);
 7176       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 7177 
 7178     __ bind(LESS8); // directly load last 8 bytes
 7179       if (!isLL) {
 7180         __ add(cnt2, cnt2, cnt2);
 7181       }
 7182       __ ldr(tmp1, Address(str1, cnt2));
 7183       __ ldr(tmp2, Address(str2, cnt2));
 7184       __ eor(rscratch2, tmp1, tmp2);
 7185       __ cbz(rscratch2, LENGTH_DIFF);
 7186       __ b(CAL_DIFFERENCE);
 7187 
 7188     __ bind(DIFF);
 7189       __ cmp(tmp1, tmp2);
 7190       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 7191       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 7192       // reuse rscratch2 register for the result of eor instruction
 7193       __ eor(rscratch2, tmp1, tmp2);
 7194 
 7195     __ bind(CAL_DIFFERENCE);
 7196       __ rev(rscratch2, rscratch2);
 7197       __ clz(rscratch2, rscratch2);
 7198       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 7199       __ lsrv(tmp1, tmp1, rscratch2);
 7200       __ lsrv(tmp2, tmp2, rscratch2);
 7201       if (isLL) {
 7202         __ uxtbw(tmp1, tmp1);
 7203         __ uxtbw(tmp2, tmp2);
 7204       } else {
 7205         __ uxthw(tmp1, tmp1);
 7206         __ uxthw(tmp2, tmp2);
 7207       }
 7208       __ subw(result, tmp1, tmp2);
 7209 
 7210     __ bind(LENGTH_DIFF);
 7211       __ ret(lr);
 7212     return entry;
 7213   }
 7214 
 7215   enum string_compare_mode {
 7216     LL,
 7217     LU,
 7218     UL,
 7219     UU,
 7220   };
 7221 
 7222   // The following registers are declared in aarch64.ad
 7223   // r0  = result
 7224   // r1  = str1
 7225   // r2  = cnt1
 7226   // r3  = str2
 7227   // r4  = cnt2
 7228   // r10 = tmp1
 7229   // r11 = tmp2
 7230   // z0  = ztmp1
 7231   // z1  = ztmp2
 7232   // p0  = pgtmp1
 7233   // p1  = pgtmp2
 7234   address generate_compare_long_string_sve(string_compare_mode mode) {
 7235     StubGenStubId stub_id;
 7236     switch (mode) {
 7237       case LL: stub_id = StubGenStubId::compare_long_string_LL_id;  break;
 7238       case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break;
 7239       case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break;
 7240       case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break;
 7241       default: ShouldNotReachHere();
 7242     }
 7243 
 7244     __ align(CodeEntryAlignment);
 7245     address entry = __ pc();
 7246     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 7247              tmp1 = r10, tmp2 = r11;
 7248 
 7249     Label LOOP, DONE, MISMATCH;
 7250     Register vec_len = tmp1;
 7251     Register idx = tmp2;
 7252     // The minimum of the string lengths has been stored in cnt2.
 7253     Register cnt = cnt2;
 7254     FloatRegister ztmp1 = z0, ztmp2 = z1;
 7255     PRegister pgtmp1 = p0, pgtmp2 = p1;
 7256 
 7257 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 7258     switch (mode) {                                                            \
 7259       case LL:                                                                 \
 7260         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 7261         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 7262         break;                                                                 \
 7263       case LU:                                                                 \
 7264         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 7265         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 7266         break;                                                                 \
 7267       case UL:                                                                 \
 7268         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 7269         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 7270         break;                                                                 \
 7271       case UU:                                                                 \
 7272         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 7273         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 7274         break;                                                                 \
 7275       default:                                                                 \
 7276         ShouldNotReachHere();                                                  \
 7277     }
 7278 
 7279     StubCodeMark mark(this, stub_id);
 7280 
 7281     __ mov(idx, 0);
 7282     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 7283 
 7284     if (mode == LL) {
 7285       __ sve_cntb(vec_len);
 7286     } else {
 7287       __ sve_cnth(vec_len);
 7288     }
 7289 
 7290     __ sub(rscratch1, cnt, vec_len);
 7291 
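    // Predicate-driven loop, roughly:
    //   pg = whilelt(0, cnt);
    //   do {
    //     if (any lane where str1 != str2 under pg) goto MISMATCH;
    //     idx += vec_len;
    //   } while (idx < cnt - vec_len);
    //   pg = whilelt(idx, cnt);  // final, possibly partial, iteration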
 7292     __ bind(LOOP);
 7293 
 7294       // main loop
 7295       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 7296       __ add(idx, idx, vec_len);
 7297       // Compare strings.
 7298       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 7299       __ br(__ NE, MISMATCH);
 7300       __ cmp(idx, rscratch1);
 7301       __ br(__ LT, LOOP);
 7302 
 7303     // post loop, last iteration
 7304     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 7305 
 7306     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 7307     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 7308     __ br(__ EQ, DONE);
 7309 
 7310     __ bind(MISMATCH);
 7311 
    // Use brkb to keep only the lanes before the first mismatch; lasta below
    // then extracts the first differing element from each vector.
 7313     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 7314     // Extract the first different characters of each string.
 7315     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 7316     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 7317 
 7318     // Compute the difference of the first different characters.
 7319     __ sub(result, rscratch1, rscratch2);
 7320 
 7321     __ bind(DONE);
 7322     __ ret(lr);
 7323 #undef LOAD_PAIR
 7324     return entry;
 7325   }
 7326 
 7327   void generate_compare_long_strings() {
 7328     if (UseSVE == 0) {
 7329       StubRoutines::aarch64::_compare_long_string_LL
 7330           = generate_compare_long_string_same_encoding(true);
 7331       StubRoutines::aarch64::_compare_long_string_UU
 7332           = generate_compare_long_string_same_encoding(false);
 7333       StubRoutines::aarch64::_compare_long_string_LU
 7334           = generate_compare_long_string_different_encoding(true);
 7335       StubRoutines::aarch64::_compare_long_string_UL
 7336           = generate_compare_long_string_different_encoding(false);
 7337     } else {
 7338       StubRoutines::aarch64::_compare_long_string_LL
 7339           = generate_compare_long_string_sve(LL);
 7340       StubRoutines::aarch64::_compare_long_string_UU
 7341           = generate_compare_long_string_sve(UU);
 7342       StubRoutines::aarch64::_compare_long_string_LU
 7343           = generate_compare_long_string_sve(LU);
 7344       StubRoutines::aarch64::_compare_long_string_UL
 7345           = generate_compare_long_string_sve(UL);
 7346     }
 7347   }
 7348 
 7349   // R0 = result
 7350   // R1 = str2
 7351   // R2 = cnt1
 7352   // R3 = str1
 7353   // R4 = cnt2
 7354   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 7355   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) since the pattern length is >= 8, we can safely keep the first
  // register-full of the pattern resident and skip reloading it (this helps
  // on systems with a single load pipeline)
  // 2) we use the "fast" algorithm for finding a single character, locating
  // the first symbol with fewer branches (one branch per loaded register
  // instead of one per symbol); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after the first register of the source string has been loaded and
  // analyzed, it can be reused to search for every occurrence of the first
  // pattern character, saving a few loads compared with a
  // simpler-but-slower implementation
  // 4) to avoid lots of push/pop operations, the code below heavily reuses,
  // re-initializes and compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations
  // are issued during loads or branches, so the penalty is minimal
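  //
  // A rough scalar sketch of the zero-byte trick from (2), Latin1 case (the
  // UTF-16 case uses 0x0001...0001 and 0x7fff...7fff analogously):
  //   uint64_t x    = loaded8 ^ (first_char * 0x0101010101010101ULL);
  //   uint64_t mask = (x - 0x0101010101010101ULL)
  //                   & ~(x | 0x7f7f7f7f7f7f7f7fULL);
  // mask is non-zero iff some byte of loaded8 equals first_char; candidate
  // positions flagged this way are then verified char by char.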
 7370   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 7371     StubGenStubId stub_id;
 7372     if (str1_isL) {
 7373       if (str2_isL) {
 7374         stub_id = StubGenStubId::string_indexof_linear_ll_id;
 7375       } else {
 7376         stub_id = StubGenStubId::string_indexof_linear_ul_id;
 7377       }
 7378     } else {
 7379       if (str2_isL) {
 7380         ShouldNotReachHere();
 7381       } else {
 7382         stub_id = StubGenStubId::string_indexof_linear_uu_id;
 7383       }
 7384     }
 7385     __ align(CodeEntryAlignment);
 7386     StubCodeMark mark(this, stub_id);
 7387     address entry = __ pc();
 7388 
 7389     int str1_chr_size = str1_isL ? 1 : 2;
 7390     int str2_chr_size = str2_isL ? 1 : 2;
 7391     int str1_chr_shift = str1_isL ? 0 : 1;
 7392     int str2_chr_shift = str2_isL ? 0 : 1;
 7393     bool isL = str1_isL && str2_isL;
    // parameters
 7395     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 7396     // temporary registers
 7397     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 7398     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 7399     // redefinitions
 7400     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 7401 
 7402     __ push(spilled_regs, sp);
 7403     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 7404         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 7405         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 7406         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 7407         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 7408         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1; safe because length >= 8 here
    __ ldr(ch1, Address(str1));
    // Read a whole register from str2; safe because length >= 8 here
    __ ldr(ch2, Address(str2));
 7413     __ sub(cnt2, cnt2, cnt1);
 7414     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 7415     if (str1_isL != str2_isL) {
 7416       __ eor(v0, __ T16B, v0, v0);
 7417     }
 7418     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 7419     __ mul(first, first, tmp1);
    // check whether less than one register's worth of characters remains
 7421     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 7422     if (str1_isL != str2_isL) {
 7423       __ fmovd(v1, ch1);
 7424     }
 7425     __ br(__ LE, L_SMALL);
 7426     __ eor(ch2, first, ch2);
 7427     if (str1_isL != str2_isL) {
 7428       __ zip1(v1, __ T16B, v1, v0);
 7429     }
 7430     __ sub(tmp2, ch2, tmp1);
 7431     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7432     __ bics(tmp2, tmp2, ch2);
 7433     if (str1_isL != str2_isL) {
 7434       __ fmovd(ch1, v1);
 7435     }
 7436     __ br(__ NE, L_HAS_ZERO);
 7437     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 7438     __ add(result, result, wordSize/str2_chr_size);
 7439     __ add(str2, str2, wordSize);
 7440     __ br(__ LT, L_POST_LOOP);
 7441     __ BIND(L_LOOP);
 7442       __ ldr(ch2, Address(str2));
 7443       __ eor(ch2, first, ch2);
 7444       __ sub(tmp2, ch2, tmp1);
 7445       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7446       __ bics(tmp2, tmp2, ch2);
 7447       __ br(__ NE, L_HAS_ZERO);
 7448     __ BIND(L_LOOP_PROCEED);
 7449       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 7450       __ add(str2, str2, wordSize);
 7451       __ add(result, result, wordSize/str2_chr_size);
 7452       __ br(__ GE, L_LOOP);
 7453     __ BIND(L_POST_LOOP);
 7454       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 7455       __ br(__ LE, NOMATCH);
 7456       __ ldr(ch2, Address(str2));
 7457       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 7458       __ eor(ch2, first, ch2);
 7459       __ sub(tmp2, ch2, tmp1);
 7460       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7461       __ mov(tmp4, -1); // all bits set
 7462       __ b(L_SMALL_PROCEED);
 7463     __ align(OptoLoopAlignment);
 7464     __ BIND(L_SMALL);
 7465       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 7466       __ eor(ch2, first, ch2);
 7467       if (str1_isL != str2_isL) {
 7468         __ zip1(v1, __ T16B, v1, v0);
 7469       }
 7470       __ sub(tmp2, ch2, tmp1);
 7471       __ mov(tmp4, -1); // all bits set
 7472       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7473       if (str1_isL != str2_isL) {
        __ fmovd(ch1, v1); // move the 4 converted characters
 7475       }
 7476     __ BIND(L_SMALL_PROCEED);
      __ lsrv(tmp4, tmp4, cnt2); // mask with zeroes in the unused bits
 7478       __ bic(tmp2, tmp2, ch2);
 7479       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 7480       __ rbit(tmp2, tmp2);
 7481       __ br(__ EQ, NOMATCH);
 7482     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 7484       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 7485       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 7486       if (str2_isL) { // LL
 7487         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 7488         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 7489         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 7490         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 7491         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7492       } else {
 7493         __ mov(ch2, 0xE); // all bits in byte set except last one
 7494         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7495         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7496         __ lslv(tmp2, tmp2, tmp4);
 7497         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7498         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7499         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7500         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7501       }
 7502       __ cmp(ch1, ch2);
 7503       __ mov(tmp4, wordSize/str2_chr_size);
 7504       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7505     __ BIND(L_SMALL_CMP_LOOP);
 7506       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 7507                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 7508       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 7509                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 7510       __ add(tmp4, tmp4, 1);
 7511       __ cmp(tmp4, cnt1);
 7512       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 7513       __ cmp(first, ch2);
 7514       __ br(__ EQ, L_SMALL_CMP_LOOP);
 7515     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 7516       __ cbz(tmp2, NOMATCH); // no more matches. exit
 7517       __ clz(tmp4, tmp2);
 7518       __ add(result, result, 1); // advance index
 7519       __ add(str2, str2, str2_chr_size); // advance pointer
 7520       __ b(L_SMALL_HAS_ZERO_LOOP);
 7521     __ align(OptoLoopAlignment);
 7522     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 7523       __ cmp(first, ch2);
 7524       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7525       __ b(DONE);
 7526     __ align(OptoLoopAlignment);
 7527     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 7528       if (str2_isL) { // LL
 7529         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 7530         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 7531         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 7532         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 7533         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7534       } else {
 7535         __ mov(ch2, 0xE); // all bits in byte set except last one
 7536         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7537         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7538         __ lslv(tmp2, tmp2, tmp4);
 7539         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7540         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7541         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7542         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7543       }
 7544       __ cmp(ch1, ch2);
 7545       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7546       __ b(DONE);
 7547     __ align(OptoLoopAlignment);
 7548     __ BIND(L_HAS_ZERO);
 7549       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now compress the counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and are not changed in this
      // loop; they are restored on exit, so cnt1 can be reused in the loop.
 7554       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 7555       __ sub(result, result, 1);
 7556     __ BIND(L_HAS_ZERO_LOOP);
 7557       __ mov(cnt1, wordSize/str2_chr_size);
 7558       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 7559       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 7560       if (str2_isL) {
 7561         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 7562         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7563         __ lslv(tmp2, tmp2, tmp4);
 7564         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7565         __ add(tmp4, tmp4, 1);
 7566         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7567         __ lsl(tmp2, tmp2, 1);
 7568         __ mov(tmp4, wordSize/str2_chr_size);
 7569       } else {
 7570         __ mov(ch2, 0xE);
 7571         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7572         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7573         __ lslv(tmp2, tmp2, tmp4);
 7574         __ add(tmp4, tmp4, 1);
 7575         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7576         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 7577         __ lsl(tmp2, tmp2, 1);
 7578         __ mov(tmp4, wordSize/str2_chr_size);
 7579         __ sub(str2, str2, str2_chr_size);
 7580       }
 7581       __ cmp(ch1, ch2);
 7582       __ mov(tmp4, wordSize/str2_chr_size);
 7583       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7584     __ BIND(L_CMP_LOOP);
 7585       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 7586                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 7587       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 7588                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 7589       __ add(tmp4, tmp4, 1);
 7590       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 7591       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 7592       __ cmp(cnt1, ch2);
 7593       __ br(__ EQ, L_CMP_LOOP);
 7594     __ BIND(L_CMP_LOOP_NOMATCH);
      // reached when the current candidate did not match
 7596       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 7597       __ clz(tmp4, tmp2);
 7598       __ add(str2, str2, str2_chr_size); // advance pointer
 7599       __ b(L_HAS_ZERO_LOOP);
 7600     __ align(OptoLoopAlignment);
 7601     __ BIND(L_CMP_LOOP_LAST_CMP);
 7602       __ cmp(cnt1, ch2);
 7603       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7604       __ b(DONE);
 7605     __ align(OptoLoopAlignment);
 7606     __ BIND(L_CMP_LOOP_LAST_CMP2);
 7607       if (str2_isL) {
 7608         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 7609         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7610         __ lslv(tmp2, tmp2, tmp4);
 7611         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7612         __ add(tmp4, tmp4, 1);
 7613         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7614         __ lsl(tmp2, tmp2, 1);
 7615       } else {
 7616         __ mov(ch2, 0xE);
 7617         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7618         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7619         __ lslv(tmp2, tmp2, tmp4);
 7620         __ add(tmp4, tmp4, 1);
 7621         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7622         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 7623         __ lsl(tmp2, tmp2, 1);
 7624         __ sub(str2, str2, str2_chr_size);
 7625       }
 7626       __ cmp(ch1, ch2);
 7627       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7628       __ b(DONE);
 7629     __ align(OptoLoopAlignment);
 7630     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. It was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. One byte octet was
      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the corresponding high bits were not
      // changed. L_LOOP_PROCEED will increase result by the number of
      // analyzed characters, so we can simply reset the lower bits of result
      // here: clear the 2 lower bits for UU/UL and 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next octet. result & (7 or 3) is the index of
      // the last analyzed substring within the current octet, so str2 is at
      // the corresponding start address and must be advanced to the next octet.
 7641       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 7642       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 7643       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 7644       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 7645       __ movw(cnt2, cnt2);
 7646       __ b(L_LOOP_PROCEED);
 7647     __ align(OptoLoopAlignment);
 7648     __ BIND(NOMATCH);
 7649       __ mov(result, -1);
 7650     __ BIND(DONE);
 7651       __ pop(spilled_regs, sp);
 7652       __ ret(lr);
 7653     return entry;
 7654   }
 7655 
 7656   void generate_string_indexof_stubs() {
 7657     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 7658     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 7659     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 7660   }
 7661 
 7662   void inflate_and_store_2_fp_registers(bool generatePrfm,
 7663       FloatRegister src1, FloatRegister src2) {
 7664     Register dst = r1;
 7665     __ zip1(v1, __ T16B, src1, v0);
 7666     __ zip2(v2, __ T16B, src1, v0);
 7667     if (generatePrfm) {
 7668       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 7669     }
 7670     __ zip1(v3, __ T16B, src2, v0);
 7671     __ zip2(v4, __ T16B, src2, v0);
 7672     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 7673   }
 7674 
 7675   // R0 = src
 7676   // R1 = dst
 7677   // R2 = len
 7678   // R3 = len >> 3
 7679   // V0 = 0
 7680   // v1 = loaded 8 bytes
 7681   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
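  // Conceptually performs
  //   for (i = 0; i < len; i++) dst[i] = (jchar)(src[i] & 0xff);
  // widening each Latin-1 byte to a UTF-16 char by zipping it with the zero
  // vector v0.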
 7682   address generate_large_byte_array_inflate() {
 7683     __ align(CodeEntryAlignment);
 7684     StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id;
 7685     StubCodeMark mark(this, stub_id);
 7686     address entry = __ pc();
 7687     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 7688     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 7689     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 7690 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction
 7693     __ ldrd(v2, __ post(src, 8));
 7694     __ sub(octetCounter, octetCounter, 2);
 7695     __ zip1(v1, __ T16B, v1, v0);
 7696     __ zip1(v2, __ T16B, v2, v0);
 7697     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 7698     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7699     __ subs(rscratch1, octetCounter, large_loop_threshold);
 7700     __ br(__ LE, LOOP_START);
 7701     __ b(LOOP_PRFM_START);
 7702     __ bind(LOOP_PRFM);
 7703       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7704     __ bind(LOOP_PRFM_START);
 7705       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 7706       __ sub(octetCounter, octetCounter, 8);
 7707       __ subs(rscratch1, octetCounter, large_loop_threshold);
 7708       inflate_and_store_2_fp_registers(true, v3, v4);
 7709       inflate_and_store_2_fp_registers(true, v5, v6);
 7710       __ br(__ GT, LOOP_PRFM);
 7711       __ cmp(octetCounter, (u1)8);
 7712       __ br(__ LT, DONE);
 7713     __ bind(LOOP);
 7714       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7715       __ bind(LOOP_START);
 7716       __ sub(octetCounter, octetCounter, 8);
 7717       __ cmp(octetCounter, (u1)8);
 7718       inflate_and_store_2_fp_registers(false, v3, v4);
 7719       inflate_and_store_2_fp_registers(false, v5, v6);
 7720       __ br(__ GE, LOOP);
 7721     __ bind(DONE);
 7722       __ ret(lr);
 7723     return entry;
 7724   }
 7725 
 7726   /**
 7727    *  Arguments:
 7728    *
 7729    *  Input:
 7730    *  c_rarg0   - current state address
 7731    *  c_rarg1   - H key address
 7732    *  c_rarg2   - data address
 7733    *  c_rarg3   - number of blocks
 7734    *
 7735    *  Output:
 7736    *  Updated state at c_rarg0
 7737    */
 7738   address generate_ghash_processBlocks() {
 7739     // Bafflingly, GCM uses little-endian for the byte order, but
 7740     // big-endian for the bit order.  For example, the polynomial 1 is
 7741     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 7742     //
 7743     // So, we must either reverse the bytes in each word and do
 7744     // everything big-endian or reverse the bits in each byte and do
 7745     // it little-endian.  On AArch64 it's more idiomatic to reverse
 7746     // the bits in each byte (we have an instruction, RBIT, to do
 7747     // that) and keep the data in little-endian bit order through the
 7748     // calculation, bit-reversing the inputs and outputs.
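    //
    // In field terms, with H the hash subkey and
    // p(z) = z^128 + z^7 + z^2 + z + 1 the GCM field polynomial, each
    // iteration of the loop below computes
    //   state <- (state ^ data_block) * H  mod  p(z)
    // over GF(2^128).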
 7749 
 7750     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
 7751     StubCodeMark mark(this, stub_id);
 7752     __ align(wordSize * 2);
 7753     address p = __ pc();
 7754     __ emit_int64(0x87);  // The low-order bits of the field
 7755                           // polynomial (i.e. p = z^7+z^2+z+1)
 7756                           // repeated in the low and high parts of a
 7757                           // 128-bit vector
 7758     __ emit_int64(0x87);
 7759 
 7760     __ align(CodeEntryAlignment);
 7761     address start = __ pc();
 7762 
 7763     Register state   = c_rarg0;
 7764     Register subkeyH = c_rarg1;
 7765     Register data    = c_rarg2;
 7766     Register blocks  = c_rarg3;
 7767 
 7768     FloatRegister vzr = v30;
 7769     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 7770 
 7771     __ ldrq(v24, p);    // The field polynomial
 7772 
 7773     __ ldrq(v0, Address(state));
 7774     __ ldrq(v1, Address(subkeyH));
 7775 
 7776     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 7777     __ rbit(v0, __ T16B, v0);
 7778     __ rev64(v1, __ T16B, v1);
 7779     __ rbit(v1, __ T16B, v1);
 7780 
 7781     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 7782     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 7783 
 7784     {
 7785       Label L_ghash_loop;
 7786       __ bind(L_ghash_loop);
 7787 
 7788       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 7789                                                  // reversing each byte
 7790       __ rbit(v2, __ T16B, v2);
 7791       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 7792 
 7793       // Multiply state in v2 by subkey in v1
 7794       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 7795                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 7796                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 7797       // Reduce v7:v5 by the field polynomial
 7798       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 7799 
 7800       __ sub(blocks, blocks, 1);
 7801       __ cbnz(blocks, L_ghash_loop);
 7802     }
 7803 
 7804     // The bit-reversed result is at this point in v0
 7805     __ rev64(v0, __ T16B, v0);
 7806     __ rbit(v0, __ T16B, v0);
 7807 
 7808     __ st1(v0, __ T16B, state);
 7809     __ ret(lr);
 7810 
 7811     return start;
 7812   }
 7813 
 7814   address generate_ghash_processBlocks_wide() {
 7815     address small = generate_ghash_processBlocks();
 7816 
 7817     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id;
 7818     StubCodeMark mark(this, stub_id);
 7819     __ align(wordSize * 2);
 7820     address p = __ pc();
 7821     __ emit_int64(0x87);  // The low-order bits of the field
 7822                           // polynomial (i.e. p = z^7+z^2+z+1)
 7823                           // repeated in the low and high parts of a
 7824                           // 128-bit vector
 7825     __ emit_int64(0x87);
 7826 
 7827     __ align(CodeEntryAlignment);
 7828     address start = __ pc();
 7829 
 7830     Register state   = c_rarg0;
 7831     Register subkeyH = c_rarg1;
 7832     Register data    = c_rarg2;
 7833     Register blocks  = c_rarg3;
 7834 
 7835     const int unroll = 4;
 7836 
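    // Inputs shorter than unroll * 2 blocks are not worth the wide path;
    // delegate them to the single-block stub generated above.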
 7837     __ cmp(blocks, (unsigned char)(unroll * 2));
 7838     __ br(__ LT, small);
 7839 
 7840     if (unroll > 1) {
      // Save the callee-saved SIMD registers (v8-v15) before entering the routine
 7842       __ sub(sp, sp, 4 * 16);
 7843       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 7844       __ sub(sp, sp, 4 * 16);
 7845       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 7846     }
 7847 
 7848     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
 7849 
 7850     if (unroll > 1) {
 7851       // And restore state
 7852       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 7853       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 7854     }
 7855 
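    // Any blocks the wide loop could not consume are finished by branching
    // back to the single-block stub.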
 7856     __ cmp(blocks, (unsigned char)0);
 7857     __ br(__ GT, small);
 7858 
 7859     __ ret(lr);
 7860 
 7861     return start;
 7862   }
 7863 
 7864   void generate_base64_encode_simdround(Register src, Register dst,
 7865         FloatRegister codec, u8 size) {
 7866 
 7867     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 7868     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 7869     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 7870 
 7871     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 7872 
 7873     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 7874 
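    // Each 3-byte group {b0, b1, b2} (deinterleaved by ld3 into in0..in2)
    // is split into four 6-bit table indices, roughly:
    //   ind0 = b0 >> 2;
    //   ind1 = ((b0 & 0x3) << 4) | (b1 >> 4);
    //   ind2 = ((b1 & 0xf) << 2) | (b2 >> 6);
    //   ind3 = b2 & 0x3f;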
 7875     __ ushr(ind0, arrangement, in0,  2);
 7876 
 7877     __ ushr(ind1, arrangement, in1,  2);
 7878     __ shl(in0,   arrangement, in0,  6);
 7879     __ orr(ind1,  arrangement, ind1, in0);
 7880     __ ushr(ind1, arrangement, ind1, 2);
 7881 
 7882     __ ushr(ind2, arrangement, in2,  4);
 7883     __ shl(in1,   arrangement, in1,  4);
 7884     __ orr(ind2,  arrangement, in1,  ind2);
 7885     __ ushr(ind2, arrangement, ind2, 2);
 7886 
 7887     __ shl(ind3,  arrangement, in2,  2);
 7888     __ ushr(ind3, arrangement, ind3, 2);
 7889 
 7890     __ tbl(out0,  arrangement, codec,  4, ind0);
 7891     __ tbl(out1,  arrangement, codec,  4, ind1);
 7892     __ tbl(out2,  arrangement, codec,  4, ind2);
 7893     __ tbl(out3,  arrangement, codec,  4, ind3);
 7894 
 7895     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 7896   }
 7897 
 7898    /**
 7899    *  Arguments:
 7900    *
 7901    *  Input:
 7902    *  c_rarg0   - src_start
 7903    *  c_rarg1   - src_offset
 7904    *  c_rarg2   - src_length
 7905    *  c_rarg3   - dest_start
 7906    *  c_rarg4   - dest_offset
 7907    *  c_rarg5   - isURL
 7908    *
 7909    */
 7910   address generate_base64_encodeBlock() {
 7911 
 7912     static const char toBase64[64] = {
 7913       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 7914       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 7915       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 7916       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 7917       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 7918     };
 7919 
 7920     static const char toBase64URL[64] = {
 7921       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 7922       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 7923       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 7924       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 7925       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 7926     };
 7927 
 7928     __ align(CodeEntryAlignment);
 7929     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
 7930     StubCodeMark mark(this, stub_id);
 7931     address start = __ pc();
 7932 
 7933     Register src   = c_rarg0;  // source array
 7934     Register soff  = c_rarg1;  // source start offset
 7935     Register send  = c_rarg2;  // source end offset
 7936     Register dst   = c_rarg3;  // dest array
 7937     Register doff  = c_rarg4;  // position for writing to dest array
 7938     Register isURL = c_rarg5;  // Base64 or URL character set
 7939 
 7940     // c_rarg6 and c_rarg7 are free to use as temps
 7941     Register codec  = c_rarg6;
 7942     Register length = c_rarg7;
 7943 
 7944     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 7945 
 7946     __ add(src, src, soff);
 7947     __ add(dst, dst, doff);
 7948     __ sub(length, send, soff);
 7949 
 7950     // load the codec base address
 7951     __ lea(codec, ExternalAddress((address) toBase64));
 7952     __ cbz(isURL, ProcessData);
 7953     __ lea(codec, ExternalAddress((address) toBase64URL));
 7954 
 7955     __ BIND(ProcessData);
 7956 
    // too short to form a SIMD loop, fall back to byte-wise processing
 7958     __ cmp(length, (u1)24);
 7959     __ br(Assembler::LT, Process3B);
 7960 
 7961     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 7962 
 7963     __ BIND(Process48B);
 7964     __ cmp(length, (u1)48);
 7965     __ br(Assembler::LT, Process24B);
 7966     generate_base64_encode_simdround(src, dst, v0, 16);
 7967     __ sub(length, length, 48);
 7968     __ b(Process48B);
 7969 
 7970     __ BIND(Process24B);
 7971     __ cmp(length, (u1)24);
 7972     __ br(Assembler::LT, SIMDExit);
 7973     generate_base64_encode_simdround(src, dst, v0, 8);
 7974     __ sub(length, length, 24);
 7975 
 7976     __ BIND(SIMDExit);
 7977     __ cbz(length, Exit);
 7978 
 7979     __ BIND(Process3B);
 7980     //  3 src bytes, 24 bits
 7981     __ ldrb(r10, __ post(src, 1));
 7982     __ ldrb(r11, __ post(src, 1));
 7983     __ ldrb(r12, __ post(src, 1));
 7984     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 7985     __ orrw(r12, r12, r11, Assembler::LSL, 8);
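    // r12 now holds b0 << 16 | b1 << 8 | b2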
 7986     // codec index
 7987     __ ubfmw(r15, r12, 18, 23);
 7988     __ ubfmw(r14, r12, 12, 17);
 7989     __ ubfmw(r13, r12, 6,  11);
 7990     __ andw(r12,  r12, 63);
 7991     // get the code based on the codec
 7992     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 7993     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 7994     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 7995     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 7996     __ strb(r15, __ post(dst, 1));
 7997     __ strb(r14, __ post(dst, 1));
 7998     __ strb(r13, __ post(dst, 1));
 7999     __ strb(r12, __ post(dst, 1));
 8000     __ sub(length, length, 3);
 8001     __ cbnz(length, Process3B);
 8002 
 8003     __ BIND(Exit);
 8004     __ ret(lr);
 8005 
 8006     return start;
 8007   }
 8008 
 8009   void generate_base64_decode_simdround(Register src, Register dst,
 8010         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 8011 
 8012     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 8013     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 8014 
 8015     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 8016     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 8017 
 8018     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 8019 
 8020     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 8021 
 8022     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 8023 
    // We need an unsigned saturating subtract to make sure all input values
    // in the range [0, 63] produce index 0 in the higher-half lookup
 8026     __ uqsubv(decH0, __ T16B, in0, v27);
 8027     __ uqsubv(decH1, __ T16B, in1, v27);
 8028     __ uqsubv(decH2, __ T16B, in2, v27);
 8029     __ uqsubv(decH3, __ T16B, in3, v27);
 8030 
 8031     // lower half lookup
 8032     __ tbl(decL0, arrangement, codecL, 4, in0);
 8033     __ tbl(decL1, arrangement, codecL, 4, in1);
 8034     __ tbl(decL2, arrangement, codecL, 4, in2);
 8035     __ tbl(decL3, arrangement, codecL, 4, in3);
 8036 
 8037     // higher half lookup
 8038     __ tbx(decH0, arrangement, codecH, 4, decH0);
 8039     __ tbx(decH1, arrangement, codecH, 4, decH1);
 8040     __ tbx(decH2, arrangement, codecH, 4, decH2);
 8041     __ tbx(decH3, arrangement, codecH, 4, decH3);
 8042 
 8043     // combine lower and higher
 8044     __ orr(decL0, arrangement, decL0, decH0);
 8045     __ orr(decL1, arrangement, decL1, decH1);
 8046     __ orr(decL2, arrangement, decL2, decH2);
 8047     __ orr(decL3, arrangement, decL3, decH3);
 8048 
    // check for illegal inputs: any value larger than 63 (the 6-bit maximum)
 8050     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 8051     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 8052     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 8053     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 8054     __ orr(in0, arrangement, decH0, decH1);
 8055     __ orr(in1, arrangement, decH2, decH3);
 8056     __ orr(in2, arrangement, in0,   in1);
 8057     __ umaxv(in3, arrangement, in2);
 8058     __ umov(rscratch2, in3, __ B, 0);
 8059 
 8060     // get the data to output
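    // Four 6-bit values d0..d3 are recombined into three output bytes,
    // roughly:
    //   out0 = (d0 << 2) | (d1 >> 4);
    //   out1 = (d1 << 4) | (d2 >> 2);
    //   out2 = (d2 << 6) | d3;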
 8061     __ shl(out0,  arrangement, decL0, 2);
 8062     __ ushr(out1, arrangement, decL1, 4);
 8063     __ orr(out0,  arrangement, out0,  out1);
 8064     __ shl(out1,  arrangement, decL1, 4);
 8065     __ ushr(out2, arrangement, decL2, 2);
 8066     __ orr(out1,  arrangement, out1,  out2);
 8067     __ shl(out2,  arrangement, decL2, 6);
 8068     __ orr(out2,  arrangement, out2,  decL3);
 8069 
 8070     __ cbz(rscratch2, NoIllegalData);
 8071 
 8072     // handle illegal input
 8073     __ umov(r10, in2, __ D, 0);
 8074     if (size == 16) {
 8075       __ cbnz(r10, ErrorInLowerHalf);
 8076 
 8077       // illegal input is in higher half, store the lower half now.
 8078       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 8079 
 8080       __ umov(r10, in2,  __ D, 1);
 8081       __ umov(r11, out0, __ D, 1);
 8082       __ umov(r12, out1, __ D, 1);
 8083       __ umov(r13, out2, __ D, 1);
 8084       __ b(StoreLegalData);
 8085 
 8086       __ BIND(ErrorInLowerHalf);
 8087     }
 8088     __ umov(r11, out0, __ D, 0);
 8089     __ umov(r12, out1, __ D, 0);
 8090     __ umov(r13, out2, __ D, 0);
 8091 
 8092     __ BIND(StoreLegalData);
 8093     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
 8094     __ strb(r11, __ post(dst, 1));
 8095     __ strb(r12, __ post(dst, 1));
 8096     __ strb(r13, __ post(dst, 1));
 8097     __ lsr(r10, r10, 8);
 8098     __ lsr(r11, r11, 8);
 8099     __ lsr(r12, r12, 8);
 8100     __ lsr(r13, r13, 8);
 8101     __ b(StoreLegalData);
 8102 
 8103     __ BIND(NoIllegalData);
 8104     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
 8105   }
 8106 
  /**
 8109    *  Arguments:
 8110    *
 8111    *  Input:
 8112    *  c_rarg0   - src_start
 8113    *  c_rarg1   - src_offset
 8114    *  c_rarg2   - src_length
 8115    *  c_rarg3   - dest_start
 8116    *  c_rarg4   - dest_offset
 8117    *  c_rarg5   - isURL
 8118    *  c_rarg6   - isMIME
 8119    *
 8120    */
 8121   address generate_base64_decodeBlock() {
 8122 
    // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
    // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the
    // section titled "Base64 decoding".
 8126 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic: java.util.Base64.fromBase64['='] == -2, while
    // fromBase(URL)64ForNoSIMD['='] == 255 here.
 8130     static const uint8_t fromBase64ForNoSIMD[256] = {
 8131       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8132       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8133       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 8134        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8135       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 8136        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
 8137       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 8138        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 8139       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8140       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8141       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8142       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8143       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8144       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8145       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8146       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8147     };
 8148 
 8149     static const uint8_t fromBase64URLForNoSIMD[256] = {
 8150       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8151       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8152       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 8153        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8154       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 8155        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
 8156       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 8157        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 8158       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8159       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8160       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8161       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8162       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8163       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8164       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8165       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8166     };
 8167 
    // A legal value of a Base64 code is in the range [0, 127]. We need two
    // lookups with tbl/tbx and combine them to get the decoded data. The 1st
    // table vector lookup uses tbl; out-of-range indices are set to 0 in the
    // destination. The 2nd table vector lookup uses tbx; out-of-range indices
    // are left unchanged in the destination. Inputs [64..126] are mapped to
    // indices [65, 127] of the table in the second lookup. The entry at index
    // 64 is 0, so an input already decoded by the 1st lookup contributes
    // nothing in the 2nd.
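    //
    // Rough scalar equivalent of one lane, assuming v27 holds the splatted
    // constant 63 (arranged elsewhere in this stub):
    //   lo  = (c < 64) ? codecL[c] : 0;     // tbl
    //   hi  = codecH[sat_sub_u8(c, 63)];    // tbx; codecH[0] == 0
    //   dec = lo | hi;                      // 255 flags illegal input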
 8175     static const uint8_t fromBase64ForSIMD[128] = {
 8176       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8177       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8178       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 8179        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8180         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 8181        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 8182       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 8183        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 8184     };
 8185 
 8186     static const uint8_t fromBase64URLForSIMD[128] = {
 8187       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8188       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8189       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 8190        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8191         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 8192        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 8193        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 8194        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 8195     };
 8196 
 8197     __ align(CodeEntryAlignment);
 8198     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
 8199     StubCodeMark mark(this, stub_id);
 8200     address start = __ pc();
 8201 
 8202     Register src    = c_rarg0;  // source array
 8203     Register soff   = c_rarg1;  // source start offset
 8204     Register send   = c_rarg2;  // source end offset
 8205     Register dst    = c_rarg3;  // dest array
 8206     Register doff   = c_rarg4;  // position for writing to dest array
 8207     Register isURL  = c_rarg5;  // Base64 or URL character set
 8208     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
 8209 
 8210     Register length = send;    // reuse send as length of source data to process
 8211 
 8212     Register simd_codec   = c_rarg6;
 8213     Register nosimd_codec = c_rarg7;
 8214 
 8215     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
 8216 
 8217     __ enter();
 8218 
 8219     __ add(src, src, soff);
 8220     __ add(dst, dst, doff);
 8221 
    __ mov(doff, dst); // save the start of dst; used at Exit to compute bytes written
 8223 
 8224     __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1);  // clear the low two bits: round length down to a multiple of 4
 8226 
 8227     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
 8228     __ cbz(isURL, ProcessData);
 8229     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
 8230 
 8231     __ BIND(ProcessData);
 8232     __ mov(rscratch1, length);
 8233     __ cmp(length, (u1)144); // 144 = 80 + 64
 8234     __ br(Assembler::LT, Process4B);
 8235 
    // In the MIME case, the line length cannot be more than 76
    // bytes (see RFC 2045). This is too short a block for SIMD
    // to be worthwhile, so we use non-SIMD here.
    // For long (>= 144 byte) inputs, pre-process 80 bytes in the
    // scalar loop below before entering the SIMD loop.
    __ movw(rscratch1, 79);
 8240 
 8241     __ BIND(Process4B);
 8242     __ ldrw(r14, __ post(src, 4));
 8243     __ ubfxw(r10, r14, 0,  8);
 8244     __ ubfxw(r11, r14, 8,  8);
 8245     __ ubfxw(r12, r14, 16, 8);
 8246     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded 6-bit values
 8248     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
 8249     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
 8250     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
 8251     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
 8252     // error detection, 255u indicates an illegal input
 8253     __ orrw(r14, r10, r11);
 8254     __ orrw(r15, r12, r13);
 8255     __ orrw(r14, r14, r15);
 8256     __ tbnz(r14, 7, Exit);
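    // In C, approximately: the four 6-bit values d0..d3 (in r10..r13) pack
    // into three output bytes as
    //   b0 = (d0 << 2) | (d1 >> 4);
    //   b1 = (d1 << 4) | (d2 >> 2);
    //   b2 = (d2 << 6) |  d3;
    // The sequence below builds the 16-bit value (b0 << 8) | b1 in r14,
    // byte-swaps it for the little-endian strh, and builds b2 in r13.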
 8257     // recover the data
 8258     __ lslw(r14, r10, 10);
 8259     __ bfiw(r14, r11, 4, 6);
 8260     __ bfmw(r14, r12, 2, 5);
 8261     __ rev16w(r14, r14);
 8262     __ bfiw(r13, r12, 6, 2);
 8263     __ strh(r14, __ post(dst, 2));
 8264     __ strb(r13, __ post(dst, 1));
 8265     // non-simd loop
 8266     __ subsw(rscratch1, rscratch1, 4);
 8267     __ br(Assembler::GT, Process4B);
 8268 
    // if exiting from the 80-byte pre-processing above (rscratch1 was
    // set to 79), rscratch1 == -1; otherwise, rscratch1 == 0.
 8271     __ cbzw(rscratch1, Exit);
 8272     __ sub(length, length, 80);
 8273 
 8274     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
 8275     __ cbz(isURL, SIMDEnter);
 8276     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
 8277 
 8278     __ BIND(SIMDEnter);
 8279     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
 8280     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
 8281     __ mov(rscratch1, 63);
 8282     __ dup(v27, __ T16B, rscratch1);
 8283 
 8284     __ BIND(Process64B);
 8285     __ cmp(length, (u1)64);
 8286     __ br(Assembler::LT, Process32B);
 8287     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
 8288     __ sub(length, length, 64);
 8289     __ b(Process64B);
 8290 
 8291     __ BIND(Process32B);
 8292     __ cmp(length, (u1)32);
 8293     __ br(Assembler::LT, SIMDExit);
 8294     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
 8295     __ sub(length, length, 32);
 8296     __ b(Process32B);
 8297 
 8298     __ BIND(SIMDExit);
 8299     __ cbz(length, Exit);
 8300     __ movw(rscratch1, length);
 8301     __ b(Process4B);
 8302 
 8303     __ BIND(Exit);
 8304     __ sub(c_rarg0, dst, doff);
 8305 
 8306     __ leave();
 8307     __ ret(lr);
 8308 
 8309     return start;
 8310   }
 8311 
 8312   // Support for spin waits.
 8313   address generate_spin_wait() {
 8314     __ align(CodeEntryAlignment);
 8315     StubGenStubId stub_id = StubGenStubId::spin_wait_id;
 8316     StubCodeMark mark(this, stub_id);
 8317     address start = __ pc();
 8318 
 8319     __ spin_wait();
 8320     __ ret(lr);
 8321 
 8322     return start;
 8323   }
 8324 
 8325   void generate_lookup_secondary_supers_table_stub() {
 8326     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
 8327     StubCodeMark mark(this, stub_id);
 8328 
 8329     const Register
 8330       r_super_klass  = r0,
 8331       r_array_base   = r1,
 8332       r_array_length = r2,
 8333       r_array_index  = r3,
 8334       r_sub_klass    = r4,
 8335       r_bitmap       = rscratch2,
 8336       result         = r5;
 8337     const FloatRegister
 8338       vtemp          = v0;
 8339 
 8340     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
 8341       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
 8342       Label L_success;
 8343       __ enter();
 8344       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
 8345                                              r_array_base, r_array_length, r_array_index,
 8346                                              vtemp, result, slot,
 8347                                              /*stub_is_near*/true);
 8348       __ leave();
 8349       __ ret(lr);
 8350     }
 8351   }
 8352 
 8353   // Slow path implementation for UseSecondarySupersTable.
 8354   address generate_lookup_secondary_supers_table_slow_path_stub() {
 8355     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
 8356     StubCodeMark mark(this, stub_id);
 8357 
 8358     address start = __ pc();
 8359     const Register
 8360       r_super_klass  = r0,        // argument
 8361       r_array_base   = r1,        // argument
 8362       temp1          = r2,        // temp
 8363       r_array_index  = r3,        // argument
 8364       r_bitmap       = rscratch2, // argument
 8365       result         = r5;        // argument
 8366 
 8367     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
 8368     __ ret(lr);
 8369 
 8370     return start;
 8371   }
 8372 
 8373 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 8374 
 8375   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
 8376   //
 8377   // If LSE is in use, generate LSE versions of all the stubs. The
 8378   // non-LSE versions are in atomic_aarch64.S.
 8379 
 8380   // class AtomicStubMark records the entry point of a stub and the
 8381   // stub pointer which will point to it. The stub pointer is set to
 8382   // the entry point when ~AtomicStubMark() is called, which must be
 8383   // after ICache::invalidate_range. This ensures safe publication of
 8384   // the generated code.
 8385   class AtomicStubMark {
 8386     address _entry_point;
 8387     aarch64_atomic_stub_t *_stub;
 8388     MacroAssembler *_masm;
 8389   public:
 8390     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
 8391       _masm = masm;
 8392       __ align(32);
 8393       _entry_point = __ pc();
 8394       _stub = stub;
 8395     }
 8396     ~AtomicStubMark() {
 8397       *_stub = (aarch64_atomic_stub_t)_entry_point;
 8398     }
 8399   };
 8400 
 8401   // NB: For memory_order_conservative we need a trailing membar after
 8402   // LSE atomic operations but not a leading membar.
 8403   //
 8404   // We don't need a leading membar because a clause in the Arm ARM
 8405   // says:
 8406   //
 8407   //   Barrier-ordered-before
 8408   //
  //   Barrier instructions order prior Memory effects before subsequent
  //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
  //   instruction with both Acquire and Release semantics.
 8415   //
 8416   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
 8417   // and Release semantics, therefore we don't need a leading
 8418   // barrier. However, there is no corresponding Barrier-ordered-after
 8419   // relationship, therefore we need a trailing membar to prevent a
 8420   // later store or load from being reordered with the store in an
 8421   // atomic instruction.
 8422   //
 8423   // This was checked by using the herd7 consistency model simulator
 8424   // (http://diy.inria.fr/) with this test case:
 8425   //
 8426   // AArch64 LseCas
 8427   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
 8428   // P0 | P1;
 8429   // LDR W4, [X2] | MOV W3, #0;
 8430   // DMB LD       | MOV W4, #1;
 8431   // LDR W3, [X1] | CASAL W3, W4, [X1];
 8432   //              | DMB ISH;
 8433   //              | STR W4, [X2];
 8434   // exists
 8435   // (0:X3=0 /\ 0:X4=1)
 8436   //
 8437   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
 8438   // with the store to x in P1. Without the DMB in P1 this may happen.
 8439   //
 8440   // At the time of writing we don't know of any AArch64 hardware that
 8441   // reorders stores in this way, but the Reference Manual permits it.
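
  // In C++ terms the conservative CAS stubs below behave approximately
  // like this sketch (an illustration, not the generated code):
  //
  //   T cmpxchg_conservative(T *ptr, T compare_val, T exchange_val) {
  //     T prev = compare_val;
  //     __atomic_compare_exchange_n(ptr, &prev, exchange_val, /*weak*/false,
  //                                 __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE); // casal
  //     __atomic_thread_fence(__ATOMIC_SEQ_CST);  // the trailing dmb
  //     return prev;
  //   }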
 8442 
 8443   void gen_cas_entry(Assembler::operand_size size,
 8444                      atomic_memory_order order) {
 8445     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
 8446       exchange_val = c_rarg2;
 8447     bool acquire, release;
 8448     switch (order) {
 8449       case memory_order_relaxed:
 8450         acquire = false;
 8451         release = false;
 8452         break;
 8453       case memory_order_release:
 8454         acquire = false;
 8455         release = true;
 8456         break;
 8457       default:
 8458         acquire = true;
 8459         release = true;
 8460         break;
 8461     }
 8462     __ mov(prev, compare_val);
 8463     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
 8464     if (order == memory_order_conservative) {
 8465       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8466     }
 8467     if (size == Assembler::xword) {
 8468       __ mov(r0, prev);
 8469     } else {
 8470       __ movw(r0, prev);
 8471     }
 8472     __ ret(lr);
 8473   }
 8474 
 8475   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
 8476     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 8477     // If not relaxed, then default to conservative.  Relaxed is the only
 8478     // case we use enough to be worth specializing.
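    // In C++ terms, approximately:
    //   relaxed:      return __atomic_fetch_add(addr, incr, __ATOMIC_RELAXED); // ldadd
    //   conservative: __atomic_fetch_add(addr, incr, __ATOMIC_ACQ_REL)         // ldaddal
    //                 followed by a trailing full barrier (dmb)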
 8479     if (order == memory_order_relaxed) {
 8480       __ ldadd(size, incr, prev, addr);
 8481     } else {
 8482       __ ldaddal(size, incr, prev, addr);
 8483       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8484     }
 8485     if (size == Assembler::xword) {
 8486       __ mov(r0, prev);
 8487     } else {
 8488       __ movw(r0, prev);
 8489     }
 8490     __ ret(lr);
 8491   }
 8492 
 8493   void gen_swpal_entry(Assembler::operand_size size) {
 8494     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 8495     __ swpal(size, incr, prev, addr);
 8496     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8497     if (size == Assembler::xword) {
 8498       __ mov(r0, prev);
 8499     } else {
 8500       __ movw(r0, prev);
 8501     }
 8502     __ ret(lr);
 8503   }
 8504 
 8505   void generate_atomic_entry_points() {
 8506     if (! UseLSE) {
 8507       return;
 8508     }
 8509     __ align(CodeEntryAlignment);
 8510     StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id;
 8511     StubCodeMark mark(this, stub_id);
 8512     address first_entry = __ pc();
 8513 
 8514     // ADD, memory_order_conservative
 8515     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
 8516     gen_ldadd_entry(Assembler::word, memory_order_conservative);
 8517     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
 8518     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
 8519 
 8520     // ADD, memory_order_relaxed
 8521     AtomicStubMark mark_fetch_add_4_relaxed
 8522       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
 8523     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
 8524     AtomicStubMark mark_fetch_add_8_relaxed
 8525       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
 8526     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
 8527 
 8528     // XCHG, memory_order_conservative
 8529     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
 8530     gen_swpal_entry(Assembler::word);
 8531     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
 8532     gen_swpal_entry(Assembler::xword);
 8533 
 8534     // CAS, memory_order_conservative
 8535     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
 8536     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
 8537     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
 8538     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
 8539     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
 8540     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
 8541 
 8542     // CAS, memory_order_relaxed
 8543     AtomicStubMark mark_cmpxchg_1_relaxed
 8544       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
 8545     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
 8546     AtomicStubMark mark_cmpxchg_4_relaxed
 8547       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
 8548     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
 8549     AtomicStubMark mark_cmpxchg_8_relaxed
 8550       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
 8551     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
 8552 
 8553     AtomicStubMark mark_cmpxchg_4_release
 8554       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
 8555     gen_cas_entry(MacroAssembler::word, memory_order_release);
 8556     AtomicStubMark mark_cmpxchg_8_release
 8557       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
 8558     gen_cas_entry(MacroAssembler::xword, memory_order_release);
 8559 
 8560     AtomicStubMark mark_cmpxchg_4_seq_cst
 8561       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
 8562     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
 8563     AtomicStubMark mark_cmpxchg_8_seq_cst
 8564       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
 8565     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
 8566 
 8567     ICache::invalidate_range(first_entry, __ pc() - first_entry);
 8568   }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
 8570 
 8571   address generate_cont_thaw(Continuation::thaw_kind kind) {
 8572     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
 8573     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
 8574 
 8575     address start = __ pc();
 8576 
 8577     if (return_barrier) {
 8578       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
 8579       __ mov(sp, rscratch1);
 8580     }
 8581     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
 8582 
 8583     if (return_barrier) {
 8584       // preserve possible return value from a method returning to the return barrier
 8585       __ fmovd(rscratch1, v0);
 8586       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
 8587     }
 8588 
 8589     __ movw(c_rarg1, (return_barrier ? 1 : 0));
 8590     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
 8591     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
 8592 
 8593     if (return_barrier) {
 8594       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
 8595       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
 8596       __ fmovd(v0, rscratch1);
 8597     }
 8598     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
 8599 
 8600 
 8601     Label thaw_success;
 8602     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
 8603     __ cbnz(rscratch2, thaw_success);
 8604     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
 8605     __ br(rscratch1);
 8606     __ bind(thaw_success);
 8607 
 8608     // make room for the thawed frames
 8609     __ sub(rscratch1, sp, rscratch2);
 8610     __ andr(rscratch1, rscratch1, -16); // align
 8611     __ mov(sp, rscratch1);
 8612 
 8613     if (return_barrier) {
 8614       // save original return value -- again
 8615       __ fmovd(rscratch1, v0);
 8616       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
 8617     }
 8618 
 8619     // If we want, we can templatize thaw by kind, and have three different entries
 8620     __ movw(c_rarg1, (uint32_t)kind);
 8621 
 8622     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
 8623     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
 8624 
 8625     if (return_barrier) {
 8626       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
 8627       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
 8628       __ fmovd(v0, rscratch1);
 8629     } else {
 8630       __ mov(r0, zr); // return 0 (success) from doYield
 8631     }
 8632 
    // we're now on the yield frame (which is at an address above us because sp has been pushed down)
 8634     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
 8635     __ mov(rfp, sp);
 8636 
 8637     if (return_barrier_exception) {
 8638       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
 8639       __ authenticate_return_address(c_rarg1);
 8640       __ verify_oop(r0);
 8641       // save return value containing the exception oop in callee-saved R19
 8642       __ mov(r19, r0);
 8643 
 8644       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
 8645 
 8646       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
 8647       // __ reinitialize_ptrue();
 8648 
 8649       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
 8650 
 8651       __ mov(r1, r0); // the exception handler
 8652       __ mov(r0, r19); // restore return value containing the exception oop
 8653       __ verify_oop(r0);
 8654 
 8655       __ leave();
 8656       __ mov(r3, lr);
 8657       __ br(r1); // the exception handler
 8658     } else {
 8659       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
 8660       __ leave();
 8661       __ ret(lr);
 8662     }
 8663 
 8664     return start;
 8665   }
 8666 
 8667   address generate_cont_thaw() {
 8668     if (!Continuations::enabled()) return nullptr;
 8669 
 8670     StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
 8671     StubCodeMark mark(this, stub_id);
 8672     address start = __ pc();
 8673     generate_cont_thaw(Continuation::thaw_top);
 8674     return start;
 8675   }
 8676 
 8677   address generate_cont_returnBarrier() {
 8678     if (!Continuations::enabled()) return nullptr;
 8679 
 8680     // TODO: will probably need multiple return barriers depending on return type
 8681     StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
 8682     StubCodeMark mark(this, stub_id);
 8683     address start = __ pc();
 8684 
 8685     generate_cont_thaw(Continuation::thaw_return_barrier);
 8686 
 8687     return start;
 8688   }
 8689 
 8690   address generate_cont_returnBarrier_exception() {
 8691     if (!Continuations::enabled()) return nullptr;
 8692 
 8693     StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
 8694     StubCodeMark mark(this, stub_id);
 8695     address start = __ pc();
 8696 
 8697     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
 8698 
 8699     return start;
 8700   }
 8701 
 8702   address generate_cont_preempt_stub() {
 8703     if (!Continuations::enabled()) return nullptr;
 8704     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
 8705     StubCodeMark mark(this, stub_id);
 8706     address start = __ pc();
 8707 
 8708     __ reset_last_Java_frame(true);
 8709 
 8710     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
 8711     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
 8712     __ mov(sp, rscratch2);
 8713 
 8714     Label preemption_cancelled;
 8715     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
 8716     __ cbnz(rscratch1, preemption_cancelled);
 8717 
 8718     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
 8719     SharedRuntime::continuation_enter_cleanup(_masm);
 8720     __ leave();
 8721     __ ret(lr);
 8722 
 8723     // We acquired the monitor after freezing the frames so call thaw to continue execution.
 8724     __ bind(preemption_cancelled);
 8725     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
 8726     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
 8727     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
 8728     __ ldr(rscratch1, Address(rscratch1));
 8729     __ br(rscratch1);
 8730 
 8731     return start;
 8732   }
 8733 
 8734   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
 8735   // are represented as long[5], with BITS_PER_LIMB = 26.
 8736   // Pack five 26-bit limbs into three 64-bit registers.
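  // In C, approximately (s = the long[5] at src, each element a 26-bit limb):
  //   dest0 = s[0] | s[1] << 26 | s[2] << 52;        // bits   0..63
  //   dest1 = s[2] >> 12 | s[3] << 14 | s[4] << 40;  // bits  64..127
  //   dest2 = s[4] >> 24;                            // bits 128..129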
 8737   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
 8738     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
 8739     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
 8740     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
 8741     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
 8742 
 8743     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
 8744     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
 8745     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
 8746     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
 8747 
 8748     if (dest2->is_valid()) {
 8749       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
 8750     } else {
 8751 #ifdef ASSERT
 8752       Label OK;
 8753       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
 8754       __ br(__ EQ, OK);
 8755       __ stop("high bits of Poly1305 integer should be zero");
 8756       __ should_not_reach_here();
 8757       __ bind(OK);
 8758 #endif
 8759     }
 8760   }
 8761 
 8762   // As above, but return only a 128-bit integer, packed into two
 8763   // 64-bit registers.
 8764   void pack_26(Register dest0, Register dest1, Register src) {
 8765     pack_26(dest0, dest1, noreg, src);
 8766   }
 8767 
 8768   // Multiply and multiply-accumulate unsigned 64-bit registers.
 8769   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
 8770     __ mul(prod_lo, n, m);
 8771     __ umulh(prod_hi, n, m);
 8772   }
 8773   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
 8774     wide_mul(rscratch1, rscratch2, n, m);
 8775     __ adds(sum_lo, sum_lo, rscratch1);
 8776     __ adc(sum_hi, sum_hi, rscratch2);
 8777   }
 8778 
 8779   // Poly1305, RFC 7539
 8780 
 8781   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
 8782   // description of the tricks used to simplify and accelerate this
 8783   // computation.
 8784 
 8785   address generate_poly1305_processBlocks() {
 8786     __ align(CodeEntryAlignment);
 8787     StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
 8788     StubCodeMark mark(this, stub_id);
 8789     address start = __ pc();
 8790     Label here;
 8791     __ enter();
 8792     RegSet callee_saved = RegSet::range(r19, r28);
 8793     __ push(callee_saved, sp);
 8794 
 8795     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
 8796 
 8797     // Arguments
 8798     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
 8799 
 8800     // R_n is the 128-bit randomly-generated key, packed into two
 8801     // registers.  The caller passes this key to us as long[5], with
 8802     // BITS_PER_LIMB = 26.
 8803     const Register R_0 = *++regs, R_1 = *++regs;
 8804     pack_26(R_0, R_1, r_start);
 8805 
 8806     // RR_n is (R_n >> 2) * 5
 8807     const Register RR_0 = *++regs, RR_1 = *++regs;
 8808     __ lsr(RR_0, R_0, 2);
 8809     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
 8810     __ lsr(RR_1, R_1, 2);
 8811     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
 8812 
 8813     // U_n is the current checksum
 8814     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
 8815     pack_26(U_0, U_1, U_2, acc_start);
 8816 
 8817     static constexpr int BLOCK_LENGTH = 16;
 8818     Label DONE, LOOP;
 8819 
 8820     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
 8821     __ br(Assembler::LT, DONE); {
 8822       __ bind(LOOP);
 8823 
 8824       // S_n is to be the sum of U_n and the next block of data
 8825       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
 8826       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
 8827       __ adds(S_0, U_0, S_0);
 8828       __ adcs(S_1, U_1, S_1);
 8829       __ adc(S_2, U_2, zr);
 8830       __ add(S_2, S_2, 1);
 8831 
 8832       const Register U_0HI = *++regs, U_1HI = *++regs;
 8833 
 8834       // NB: this logic depends on some of the special properties of
 8835       // Poly1305 keys. In particular, because we know that the top
 8836       // four bits of R_0 and R_1 are zero, we can add together
 8837       // partial products without any risk of needing to propagate a
 8838       // carry out.
 8839       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
 8840       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
 8841       __ andr(U_2, R_0, 3);
 8842       __ mul(U_2, S_2, U_2);
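      // In C, approximately (u128 = unsigned __int128; the RR terms fold
      // limbs that would overflow 2**130 back in, since 2**130 == 5
      // (mod 2**130 - 5)):
      //   u0 = (u128)S_0*R_0 + (u128)S_1*RR_1 + (u128)S_2*RR_0;
      //   u1 = (u128)S_0*R_1 + (u128)S_1*R_0  + (u128)S_2*RR_1;
      //   U_2 = S_2 * (R_0 & 3);
      //   U_0 = (u64)u0;  U_0HI = u0 >> 64;
      //   U_1 = (u64)u1;  U_1HI = u1 >> 64;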
 8843 
 8844       // Recycle registers S_0, S_1, S_2
 8845       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
 8846 
 8847       // Partial reduction mod 2**130 - 5
 8848       __ adds(U_1, U_0HI, U_1);
 8849       __ adc(U_2, U_1HI, U_2);
 8850       // Sum now in U_2:U_1:U_0.
 8851       // Dead: U_0HI, U_1HI.
 8852       regs = (regs.remaining() + U_0HI + U_1HI).begin();
 8853 
 8854       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
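      // (2**130 == 5 (mod 2**130 - 5), so the bits of U_2 above the low
      // two fold back in multiplied by 5; the *5 is split into *1 + *4,
      // needing only an add and a shifted add.)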
 8855 
 8856       // First, U_2:U_1:U_0 += (U_2 >> 2)
 8857       __ lsr(rscratch1, U_2, 2);
 8858       __ andr(U_2, U_2, (u8)3);
 8859       __ adds(U_0, U_0, rscratch1);
 8860       __ adcs(U_1, U_1, zr);
 8861       __ adc(U_2, U_2, zr);
 8862       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
 8863       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
 8864       __ adcs(U_1, U_1, zr);
 8865       __ adc(U_2, U_2, zr);
 8866 
 8867       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
 8868       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
 8869       __ br(~ Assembler::LT, LOOP);
 8870     }
 8871 
 8872     // Further reduce modulo 2^130 - 5
 8873     __ lsr(rscratch1, U_2, 2);
 8874     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
 8875     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
 8876     __ adcs(U_1, U_1, zr);
 8877     __ andr(U_2, U_2, (u1)3);
 8878     __ adc(U_2, U_2, zr);
 8879 
 8880     // Unpack the sum into five 26-bit limbs and write to memory.
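    // In C, approximately (the inverse of pack_26; acc = the long[5] at
    // acc_start):
    //   acc[0] =  U_0        & 0x3ffffff;
    //   acc[1] = (U_0 >> 26) & 0x3ffffff;
    //   acc[2] = (U_0 >> 52) | (U_1 & 0x3fff) << 12;
    //   acc[3] = (U_1 >> 14) & 0x3ffffff;
    //   acc[4] = (U_1 >> 40) | (U_2 & 7) << 24;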
 8881     __ ubfiz(rscratch1, U_0, 0, 26);
 8882     __ ubfx(rscratch2, U_0, 26, 26);
 8883     __ stp(rscratch1, rscratch2, Address(acc_start));
 8884     __ ubfx(rscratch1, U_0, 52, 12);
 8885     __ bfi(rscratch1, U_1, 12, 14);
 8886     __ ubfx(rscratch2, U_1, 14, 26);
 8887     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
 8888     __ ubfx(rscratch1, U_1, 40, 24);
 8889     __ bfi(rscratch1, U_2, 24, 3);
 8890     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
 8891 
 8892     __ bind(DONE);
 8893     __ pop(callee_saved, sp);
 8894     __ leave();
 8895     __ ret(lr);
 8896 
 8897     return start;
 8898   }
 8899 
 8900   // exception handler for upcall stubs
 8901   address generate_upcall_stub_exception_handler() {
 8902     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
 8903     StubCodeMark mark(this, stub_id);
 8904     address start = __ pc();
 8905 
 8906     // Native caller has no idea how to handle exceptions,
 8907     // so we just crash here. Up to callee to catch exceptions.
 8908     __ verify_oop(r0);
 8909     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
 8910     __ blr(rscratch1);
 8911     __ should_not_reach_here();
 8912 
 8913     return start;
 8914   }
 8915 
 8916   // load Method* target of MethodHandle
 8917   // j_rarg0 = jobject receiver
 8918   // rmethod = result
 8919   address generate_upcall_stub_load_target() {
 8920     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
 8921     StubCodeMark mark(this, stub_id);
 8922     address start = __ pc();
 8923 
 8924     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
    // Load target method from receiver
 8926     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
 8927     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
 8928     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
 8929     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
 8930                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
 8931                       noreg, noreg);
 8932     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
 8933 
 8934     __ ret(lr);
 8935 
 8936     return start;
 8937   }
 8938 
 8939 #undef __
 8940 #define __ masm->
 8941 
 8942   class MontgomeryMultiplyGenerator : public MacroAssembler {
 8943 
 8944     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
 8945       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
 8946 
 8947     RegSet _toSave;
 8948     bool _squaring;
 8949 
 8950   public:
 8951     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
 8952       : MacroAssembler(as->code()), _squaring(squaring) {
 8953 
 8954       // Register allocation
 8955 
 8956       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
 8957       Pa_base = *regs;       // Argument registers
 8958       if (squaring)
 8959         Pb_base = Pa_base;
 8960       else
 8961         Pb_base = *++regs;
 8962       Pn_base = *++regs;
      Rlen = *++regs;
 8964       inv = *++regs;
 8965       Pm_base = *++regs;
 8966 
 8967                           // Working registers:
 8968       Ra =  *++regs;        // The current digit of a, b, n, and m.
 8969       Rb =  *++regs;
 8970       Rm =  *++regs;
 8971       Rn =  *++regs;
 8972 
 8973       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
 8974       Pb =  *++regs;
 8975       Pm =  *++regs;
 8976       Pn =  *++regs;
 8977 
 8978       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
 8980       t2 =  *++regs;
 8981 
 8982       Ri =  *++regs;        // Inner and outer loop indexes.
 8983       Rj =  *++regs;
 8984 
 8985       Rhi_ab = *++regs;     // Product registers: low and high parts
 8986       Rlo_ab = *++regs;     // of a*b and m*n.
 8987       Rhi_mn = *++regs;
 8988       Rlo_mn = *++regs;
 8989 
 8990       // r19 and up are callee-saved.
 8991       _toSave = RegSet::range(r19, *regs) + Pm_base;
 8992     }
 8993 
 8994   private:
 8995     void save_regs() {
 8996       push(_toSave, sp);
 8997     }
 8998 
 8999     void restore_regs() {
 9000       pop(_toSave, sp);
 9001     }
 9002 
    // Invoke `block` count times, unrolled two copies per loop iteration.
    // An odd count branches into the middle of the loop so that the odd
    // iteration is executed first.
    template <typename T>
    void unroll_2(Register count, T block) {
 9005       Label loop, end, odd;
 9006       tbnz(count, 0, odd);
 9007       cbz(count, end);
 9008       align(16);
 9009       bind(loop);
 9010       (this->*block)();
 9011       bind(odd);
 9012       (this->*block)();
 9013       subs(count, count, 2);
 9014       br(Assembler::GT, loop);
 9015       bind(end);
 9016     }
 9017 
 9018     template <typename T>
 9019     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
 9020       Label loop, end, odd;
 9021       tbnz(count, 0, odd);
 9022       cbz(count, end);
 9023       align(16);
 9024       bind(loop);
 9025       (this->*block)(d, s, tmp);
 9026       bind(odd);
 9027       (this->*block)(d, s, tmp);
 9028       subs(count, count, 2);
 9029       br(Assembler::GT, loop);
 9030       bind(end);
 9031     }
 9032 
 9033     void pre1(RegisterOrConstant i) {
 9034       block_comment("pre1");
 9035       // Pa = Pa_base;
 9036       // Pb = Pb_base + i;
 9037       // Pm = Pm_base;
 9038       // Pn = Pn_base + i;
 9039       // Ra = *Pa;
 9040       // Rb = *Pb;
 9041       // Rm = *Pm;
 9042       // Rn = *Pn;
 9043       ldr(Ra, Address(Pa_base));
 9044       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
 9045       ldr(Rm, Address(Pm_base));
 9046       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9047       lea(Pa, Address(Pa_base));
 9048       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
 9049       lea(Pm, Address(Pm_base));
 9050       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9051 
 9052       // Zero the m*n result.
 9053       mov(Rhi_mn, zr);
 9054       mov(Rlo_mn, zr);
 9055     }
 9056 
 9057     // The core multiply-accumulate step of a Montgomery
 9058     // multiplication.  The idea is to schedule operations as a
 9059     // pipeline so that instructions with long latencies (loads and
 9060     // multiplies) have time to complete before their results are
 9061     // used.  This most benefits in-order implementations of the
 9062     // architecture but out-of-order ones also benefit.
 9063     void step() {
 9064       block_comment("step");
 9065       // MACC(Ra, Rb, t0, t1, t2);
 9066       // Ra = *++Pa;
 9067       // Rb = *--Pb;
 9068       umulh(Rhi_ab, Ra, Rb);
 9069       mul(Rlo_ab, Ra, Rb);
 9070       ldr(Ra, pre(Pa, wordSize));
 9071       ldr(Rb, pre(Pb, -wordSize));
 9072       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
 9073                                        // previous iteration.
 9074       // MACC(Rm, Rn, t0, t1, t2);
 9075       // Rm = *++Pm;
 9076       // Rn = *--Pn;
 9077       umulh(Rhi_mn, Rm, Rn);
 9078       mul(Rlo_mn, Rm, Rn);
 9079       ldr(Rm, pre(Pm, wordSize));
 9080       ldr(Rn, pre(Pn, -wordSize));
 9081       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9082     }
 9083 
 9084     void post1() {
 9085       block_comment("post1");
 9086 
 9087       // MACC(Ra, Rb, t0, t1, t2);
 9088       // Ra = *++Pa;
 9089       // Rb = *--Pb;
 9090       umulh(Rhi_ab, Ra, Rb);
 9091       mul(Rlo_ab, Ra, Rb);
 9092       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9093       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9094 
 9095       // *Pm = Rm = t0 * inv;
 9096       mul(Rm, t0, inv);
 9097       str(Rm, Address(Pm));
 9098 
 9099       // MACC(Rm, Rn, t0, t1, t2);
 9100       // t0 = t1; t1 = t2; t2 = 0;
 9101       umulh(Rhi_mn, Rm, Rn);
 9102 
 9103 #ifndef PRODUCT
 9104       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
 9105       {
 9106         mul(Rlo_mn, Rm, Rn);
 9107         add(Rlo_mn, t0, Rlo_mn);
 9108         Label ok;
 9109         cbz(Rlo_mn, ok); {
 9110           stop("broken Montgomery multiply");
 9111         } bind(ok);
 9112       }
 9113 #endif
 9114       // We have very carefully set things up so that
 9115       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
 9116       // the lower half of Rm * Rn because we know the result already:
 9117       // it must be -t0.  t0 + (-t0) must generate a carry iff
 9118       // t0 != 0.  So, rather than do a mul and an adds we just set
 9119       // the carry flag iff t0 is nonzero.
 9120       //
 9121       // mul(Rlo_mn, Rm, Rn);
 9122       // adds(zr, t0, Rlo_mn);
 9123       subs(zr, t0, 1); // Set carry iff t0 is nonzero
 9124       adcs(t0, t1, Rhi_mn);
 9125       adc(t1, t2, zr);
 9126       mov(t2, zr);
 9127     }
 9128 
 9129     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
 9130       block_comment("pre2");
 9131       // Pa = Pa_base + i-len;
 9132       // Pb = Pb_base + len;
 9133       // Pm = Pm_base + i-len;
 9134       // Pn = Pn_base + len;
 9135 
 9136       if (i.is_register()) {
 9137         sub(Rj, i.as_register(), len);
 9138       } else {
 9139         mov(Rj, i.as_constant());
 9140         sub(Rj, Rj, len);
 9141       }
 9142       // Rj == i-len
 9143 
 9144       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
 9145       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
 9146       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
 9147       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
 9148 
 9149       // Ra = *++Pa;
 9150       // Rb = *--Pb;
 9151       // Rm = *++Pm;
 9152       // Rn = *--Pn;
 9153       ldr(Ra, pre(Pa, wordSize));
 9154       ldr(Rb, pre(Pb, -wordSize));
 9155       ldr(Rm, pre(Pm, wordSize));
 9156       ldr(Rn, pre(Pn, -wordSize));
 9157 
 9158       mov(Rhi_mn, zr);
 9159       mov(Rlo_mn, zr);
 9160     }
 9161 
 9162     void post2(RegisterOrConstant i, RegisterOrConstant len) {
 9163       block_comment("post2");
 9164       if (i.is_constant()) {
 9165         mov(Rj, i.as_constant()-len.as_constant());
 9166       } else {
 9167         sub(Rj, i.as_register(), len);
 9168       }
 9169 
 9170       adds(t0, t0, Rlo_mn); // The pending m*n, low part
 9171 
 9172       // As soon as we know the least significant digit of our result,
 9173       // store it.
 9174       // Pm_base[i-len] = t0;
 9175       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
 9176 
 9177       // t0 = t1; t1 = t2; t2 = 0;
 9178       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
 9179       adc(t1, t2, zr);
 9180       mov(t2, zr);
 9181     }
 9182 
 9183     // A carry in t0 after Montgomery multiplication means that we
 9184     // should subtract multiples of n from our result in m.  We'll
 9185     // keep doing that until there is no carry.
 9186     void normalize(RegisterOrConstant len) {
 9187       block_comment("normalize");
 9188       // while (t0)
 9189       //   t0 = sub(Pm_base, Pn_base, t0, len);
 9190       Label loop, post, again;
 9191       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
 9192       cbz(t0, post); {
 9193         bind(again); {
 9194           mov(i, zr);
 9195           mov(cnt, len);
 9196           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9197           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9198           subs(zr, zr, zr); // set carry flag, i.e. no borrow
 9199           align(16);
 9200           bind(loop); {
 9201             sbcs(Rm, Rm, Rn);
 9202             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9203             add(i, i, 1);
 9204             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9205             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9206             sub(cnt, cnt, 1);
 9207           } cbnz(cnt, loop);
 9208           sbc(t0, t0, zr);
 9209         } cbnz(t0, again);
 9210       } bind(post);
 9211     }
 9212 
 9213     // Move memory at s to d, reversing words.
 9214     //    Increments d to end of copied memory
 9215     //    Destroys tmp1, tmp2
 9216     //    Preserves len
 9217     //    Leaves s pointing to the address which was in d at start
 9218     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
 9219       assert(tmp1->encoding() < r19->encoding(), "register corruption");
 9220       assert(tmp2->encoding() < r19->encoding(), "register corruption");
 9221 
 9222       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
 9223       mov(tmp1, len);
 9224       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
 9225       sub(s, d, len, ext::uxtw, LogBytesPerWord);
 9226     }
    // where reverse1 is:
 9228     void reverse1(Register d, Register s, Register tmp) {
 9229       ldr(tmp, pre(s, -wordSize));
 9230       ror(tmp, tmp, 32);
 9231       str(tmp, post(d, wordSize));
 9232     }
 9233 
 9234     void step_squaring() {
 9235       // An extra ACC
 9236       step();
 9237       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9238     }
 9239 
 9240     void last_squaring(RegisterOrConstant i) {
 9241       Label dont;
 9242       // if ((i & 1) == 0) {
 9243       tbnz(i.as_register(), 0, dont); {
 9244         // MACC(Ra, Rb, t0, t1, t2);
 9245         // Ra = *++Pa;
 9246         // Rb = *--Pb;
 9247         umulh(Rhi_ab, Ra, Rb);
 9248         mul(Rlo_ab, Ra, Rb);
 9249         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9250       } bind(dont);
 9251     }
 9252 
 9253     void extra_step_squaring() {
 9254       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9255 
 9256       // MACC(Rm, Rn, t0, t1, t2);
 9257       // Rm = *++Pm;
 9258       // Rn = *--Pn;
 9259       umulh(Rhi_mn, Rm, Rn);
 9260       mul(Rlo_mn, Rm, Rn);
 9261       ldr(Rm, pre(Pm, wordSize));
 9262       ldr(Rn, pre(Pn, -wordSize));
 9263     }
 9264 
 9265     void post1_squaring() {
 9266       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9267 
 9268       // *Pm = Rm = t0 * inv;
 9269       mul(Rm, t0, inv);
 9270       str(Rm, Address(Pm));
 9271 
 9272       // MACC(Rm, Rn, t0, t1, t2);
 9273       // t0 = t1; t1 = t2; t2 = 0;
 9274       umulh(Rhi_mn, Rm, Rn);
 9275 
 9276 #ifndef PRODUCT
 9277       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
 9278       {
 9279         mul(Rlo_mn, Rm, Rn);
 9280         add(Rlo_mn, t0, Rlo_mn);
 9281         Label ok;
 9282         cbz(Rlo_mn, ok); {
 9283           stop("broken Montgomery multiply");
 9284         } bind(ok);
 9285       }
 9286 #endif
 9287       // We have very carefully set things up so that
 9288       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
 9289       // the lower half of Rm * Rn because we know the result already:
 9290       // it must be -t0.  t0 + (-t0) must generate a carry iff
 9291       // t0 != 0.  So, rather than do a mul and an adds we just set
 9292       // the carry flag iff t0 is nonzero.
 9293       //
 9294       // mul(Rlo_mn, Rm, Rn);
 9295       // adds(zr, t0, Rlo_mn);
 9296       subs(zr, t0, 1); // Set carry iff t0 is nonzero
 9297       adcs(t0, t1, Rhi_mn);
 9298       adc(t1, t2, zr);
 9299       mov(t2, zr);
 9300     }
 9301 
 9302     void acc(Register Rhi, Register Rlo,
 9303              Register t0, Register t1, Register t2) {
 9304       adds(t0, t0, Rlo);
 9305       adcs(t1, t1, Rhi);
 9306       adc(t2, t2, zr);
 9307     }
 9308 
 9309   public:
 9310     /**
 9311      * Fast Montgomery multiplication.  The derivation of the
 9312      * algorithm is in A Cryptographic Library for the Motorola
 9313      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
 9314      *
 9315      * Arguments:
 9316      *
 9317      * Inputs for multiplication:
 9318      *   c_rarg0   - int array elements a
 9319      *   c_rarg1   - int array elements b
 9320      *   c_rarg2   - int array elements n (the modulus)
 9321      *   c_rarg3   - int length
 9322      *   c_rarg4   - int inv
 9323      *   c_rarg5   - int array elements m (the result)
 9324      *
 9325      * Inputs for squaring:
 9326      *   c_rarg0   - int array elements a
 9327      *   c_rarg1   - int array elements n (the modulus)
 9328      *   c_rarg2   - int length
 9329      *   c_rarg3   - int inv
 9330      *   c_rarg4   - int array elements m (the result)
 9331      *
 9332      */
 9333     address generate_multiply() {
 9334       Label argh, nothing;
 9335       bind(argh);
 9336       stop("MontgomeryMultiply total_allocation must be <= 8192");
 9337 
 9338       align(CodeEntryAlignment);
 9339       address entry = pc();
 9340 
 9341       cbzw(Rlen, nothing);
 9342 
 9343       enter();
 9344 
 9345       // Make room.
 9346       cmpw(Rlen, 512);
 9347       br(Assembler::HI, argh);
 9348       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
 9349       andr(sp, Ra, -2 * wordSize);
 9350 
 9351       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
 9352 
 9353       {
 9354         // Copy input args, reversing as we go.  We use Ra as a
 9355         // temporary variable.
 9356         reverse(Ra, Pa_base, Rlen, t0, t1);
 9357         if (!_squaring)
 9358           reverse(Ra, Pb_base, Rlen, t0, t1);
 9359         reverse(Ra, Pn_base, Rlen, t0, t1);
 9360       }
 9361 
 9362       // Push all call-saved registers and also Pm_base which we'll need
 9363       // at the end.
 9364       save_regs();
 9365 
 9366 #ifndef PRODUCT
 9367       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
 9368       {
 9369         ldr(Rn, Address(Pn_base, 0));
 9370         mul(Rlo_mn, Rn, inv);
 9371         subs(zr, Rlo_mn, -1);
 9372         Label ok;
 9373         br(EQ, ok); {
 9374           stop("broken inverse in Montgomery multiply");
 9375         } bind(ok);
 9376       }
 9377 #endif
 9378 
 9379       mov(Pm_base, Ra);
 9380 
 9381       mov(t0, zr);
 9382       mov(t1, zr);
 9383       mov(t2, zr);
 9384 
 9385       block_comment("for (int i = 0; i < len; i++) {");
 9386       mov(Ri, zr); {
 9387         Label loop, end;
 9388         cmpw(Ri, Rlen);
 9389         br(Assembler::GE, end);
 9390 
 9391         bind(loop);
 9392         pre1(Ri);
 9393 
 9394         block_comment("  for (j = i; j; j--) {"); {
 9395           movw(Rj, Ri);
 9396           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
 9397         } block_comment("  } // j");
 9398 
 9399         post1();
 9400         addw(Ri, Ri, 1);
 9401         cmpw(Ri, Rlen);
 9402         br(Assembler::LT, loop);
 9403         bind(end);
 9404         block_comment("} // i");
 9405       }
 9406 
 9407       block_comment("for (int i = len; i < 2*len; i++) {");
 9408       mov(Ri, Rlen); {
 9409         Label loop, end;
 9410         cmpw(Ri, Rlen, Assembler::LSL, 1);
 9411         br(Assembler::GE, end);
 9412 
 9413         bind(loop);
 9414         pre2(Ri, Rlen);
 9415 
 9416         block_comment("  for (j = len*2-i-1; j; j--) {"); {
 9417           lslw(Rj, Rlen, 1);
 9418           subw(Rj, Rj, Ri);
 9419           subw(Rj, Rj, 1);
 9420           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
 9421         } block_comment("  } // j");
 9422 
 9423         post2(Ri, Rlen);
 9424         addw(Ri, Ri, 1);
 9425         cmpw(Ri, Rlen, Assembler::LSL, 1);
 9426         br(Assembler::LT, loop);
 9427         bind(end);
 9428       }
 9429       block_comment("} // i");
 9430 
 9431       normalize(Rlen);
 9432 
 9433       mov(Ra, Pm_base);  // Save Pm_base in Ra
 9434       restore_regs();  // Restore caller's Pm_base
 9435 
 9436       // Copy our result into caller's Pm_base
 9437       reverse(Pm_base, Ra, Rlen, t0, t1);
 9438 
 9439       leave();
 9440       bind(nothing);
 9441       ret(lr);
 9442 
 9443       return entry;
 9444     }
 9445     // In C, approximately:
 9446 
 9447     // void
 9448     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
 9449     //                     julong Pn_base[], julong Pm_base[],
 9450     //                     julong inv, int len) {
 9451     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
 9452     //   julong *Pa, *Pb, *Pn, *Pm;
 9453     //   julong Ra, Rb, Rn, Rm;
 9454 
 9455     //   int i;
 9456 
 9457     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
 9458 
 9459     //   for (i = 0; i < len; i++) {
 9460     //     int j;
 9461 
 9462     //     Pa = Pa_base;
 9463     //     Pb = Pb_base + i;
 9464     //     Pm = Pm_base;
 9465     //     Pn = Pn_base + i;
 9466 
 9467     //     Ra = *Pa;
 9468     //     Rb = *Pb;
 9469     //     Rm = *Pm;
 9470     //     Rn = *Pn;
 9471 
 9472     //     int iters = i;
 9473     //     for (j = 0; iters--; j++) {
 9474     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
 9475     //       MACC(Ra, Rb, t0, t1, t2);
 9476     //       Ra = *++Pa;
 9477     //       Rb = *--Pb;
 9478     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9479     //       MACC(Rm, Rn, t0, t1, t2);
 9480     //       Rm = *++Pm;
 9481     //       Rn = *--Pn;
 9482     //     }
 9483 
 9484     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
 9485     //     MACC(Ra, Rb, t0, t1, t2);
 9486     //     *Pm = Rm = t0 * inv;
 9487     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
 9488     //     MACC(Rm, Rn, t0, t1, t2);
 9489 
 9490     //     assert(t0 == 0, "broken Montgomery multiply");
 9491 
 9492     //     t0 = t1; t1 = t2; t2 = 0;
 9493     //   }
 9494 
 9495     //   for (i = len; i < 2*len; i++) {
 9496     //     int j;
 9497 
 9498     //     Pa = Pa_base + i-len;
 9499     //     Pb = Pb_base + len;
 9500     //     Pm = Pm_base + i-len;
 9501     //     Pn = Pn_base + len;
 9502 
 9503     //     Ra = *++Pa;
 9504     //     Rb = *--Pb;
 9505     //     Rm = *++Pm;
 9506     //     Rn = *--Pn;
 9507 
 9508     //     int iters = len*2-i-1;
 9509     //     for (j = i-len+1; iters--; j++) {
 9510     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
 9511     //       MACC(Ra, Rb, t0, t1, t2);
 9512     //       Ra = *++Pa;
 9513     //       Rb = *--Pb;
 9514     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9515     //       MACC(Rm, Rn, t0, t1, t2);
 9516     //       Rm = *++Pm;
 9517     //       Rn = *--Pn;
 9518     //     }
 9519 
 9520     //     Pm_base[i-len] = t0;
 9521     //     t0 = t1; t1 = t2; t2 = 0;
 9522     //   }
 9523 
 9524     //   while (t0)
 9525     //     t0 = sub(Pm_base, Pn_base, t0, len);
 9526     // }
 9527 
 9528     /**
 9529      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
 9530      * multiplies than Montgomery multiplication so it should be up to
 9531      * 25% faster.  However, its loop control is more complex and it
 9532      * may actually run slower on some machines.
 9533      *
 9534      * Arguments:
 9535      *
 9536      * Inputs:
 9537      *   c_rarg0   - int array elements a
 9538      *   c_rarg1   - int array elements n (the modulus)
 9539      *   c_rarg2   - int length
 9540      *   c_rarg3   - int inv
 9541      *   c_rarg4   - int array elements m (the result)
 9542      *
 9543      */
 9544     address generate_square() {
 9545       Label argh;
 9546       bind(argh);
 9547       stop("MontgomeryMultiply total_allocation must be <= 8192");
 9548 
 9549       align(CodeEntryAlignment);
 9550       address entry = pc();
 9551 
 9552       enter();
 9553 
 9554       // Make room.
 9555       cmpw(Rlen, 512);
 9556       br(Assembler::HI, argh);
 9557       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
 9558       andr(sp, Ra, -2 * wordSize);
 9559 
 9560       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
 9561 
 9562       {
 9563         // Copy input args, reversing as we go.  We use Ra as a
 9564         // temporary variable.
 9565         reverse(Ra, Pa_base, Rlen, t0, t1);
 9566         reverse(Ra, Pn_base, Rlen, t0, t1);
 9567       }
 9568 
 9569       // Push all call-saved registers and also Pm_base which we'll need
 9570       // at the end.
 9571       save_regs();
 9572 
 9573       mov(Pm_base, Ra);
 9574 
 9575       mov(t0, zr);
 9576       mov(t1, zr);
 9577       mov(t2, zr);
 9578 
 9579       block_comment("for (int i = 0; i < len; i++) {");
 9580       mov(Ri, zr); {
 9581         Label loop, end;
 9582         bind(loop);
 9583         cmp(Ri, Rlen);
 9584         br(Assembler::GE, end);
 9585 
 9586         pre1(Ri);
 9587 
 9588         block_comment("for (j = (i+1)/2; j; j--) {"); {
 9589           add(Rj, Ri, 1);
 9590           lsr(Rj, Rj, 1);
 9591           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
 9592         } block_comment("  } // j");
 9593 
 9594         last_squaring(Ri);
 9595 
 9596         block_comment("  for (j = i/2; j; j--) {"); {
 9597           lsr(Rj, Ri, 1);
 9598           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
 9599         } block_comment("  } // j");
 9600 
 9601         post1_squaring();
 9602         add(Ri, Ri, 1);
 9603         cmp(Ri, Rlen);
 9604         br(Assembler::LT, loop);
 9605 
 9606         bind(end);
 9607         block_comment("} // i");
 9608       }
 9609 
 9610       block_comment("for (int i = len; i < 2*len; i++) {");
 9611       mov(Ri, Rlen); {
 9612         Label loop, end;
 9613         bind(loop);
 9614         cmp(Ri, Rlen, Assembler::LSL, 1);
 9615         br(Assembler::GE, end);
 9616 
 9617         pre2(Ri, Rlen);
 9618 
 9619         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
 9620           lsl(Rj, Rlen, 1);
 9621           sub(Rj, Rj, Ri);
 9622           sub(Rj, Rj, 1);
 9623           lsr(Rj, Rj, 1);
 9624           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
 9625         } block_comment("  } // j");
 9626 
 9627         last_squaring(Ri);
 9628 
 9629         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
 9630           lsl(Rj, Rlen, 1);
 9631           sub(Rj, Rj, Ri);
 9632           lsr(Rj, Rj, 1);
 9633           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
 9634         } block_comment("  } // j");
 9635 
 9636         post2(Ri, Rlen);
 9637         add(Ri, Ri, 1);
 9638         cmp(Ri, Rlen, Assembler::LSL, 1);
 9639 
 9640         br(Assembler::LT, loop);
 9641         bind(end);
 9642         block_comment("} // i");
 9643       }
 9644 
 9645       normalize(Rlen);
 9646 
 9647       mov(Ra, Pm_base);  // Save Pm_base in Ra
 9648       restore_regs();  // Restore caller's Pm_base
 9649 
 9650       // Copy our result into caller's Pm_base
 9651       reverse(Pm_base, Ra, Rlen, t0, t1);
 9652 
 9653       leave();
 9654       ret(lr);
 9655 
 9656       return entry;
 9657     }
 9658     // In C, approximately:
 9659 
 9660     // void
 9661     // montgomery_square(julong Pa_base[], julong Pn_base[],
 9662     //                   julong Pm_base[], julong inv, int len) {
 9663     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
 9664     //   julong *Pa, *Pb, *Pn, *Pm;
 9665     //   julong Ra, Rb, Rn, Rm;
 9666 
 9667     //   int i;
 9668 
 9669     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
 9670 
 9671     //   for (i = 0; i < len; i++) {
 9672     //     int j;
 9673 
 9674     //     Pa = Pa_base;
 9675     //     Pb = Pa_base + i;
 9676     //     Pm = Pm_base;
 9677     //     Pn = Pn_base + i;
 9678 
 9679     //     Ra = *Pa;
 9680     //     Rb = *Pb;
 9681     //     Rm = *Pm;
 9682     //     Rn = *Pn;
 9683 
 9684     //     int iters = (i+1)/2;
 9685     //     for (j = 0; iters--; j++) {
 9686     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
 9687     //       MACC2(Ra, Rb, t0, t1, t2);
 9688     //       Ra = *++Pa;
 9689     //       Rb = *--Pb;
 9690     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9691     //       MACC(Rm, Rn, t0, t1, t2);
 9692     //       Rm = *++Pm;
 9693     //       Rn = *--Pn;
 9694     //     }
 9695     //     if ((i & 1) == 0) {
 9696     //       assert(Ra == Pa_base[j], "must be");
 9697     //       MACC(Ra, Ra, t0, t1, t2);
 9698     //     }
 9699     //     iters = i/2;
 9700     //     assert(iters == i-j, "must be");
 9701     //     for (; iters--; j++) {
 9702     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9703     //       MACC(Rm, Rn, t0, t1, t2);
 9704     //       Rm = *++Pm;
 9705     //       Rn = *--Pn;
 9706     //     }
 9707 
 9708     //     *Pm = Rm = t0 * inv;
 9709     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
 9710     //     MACC(Rm, Rn, t0, t1, t2);
 9711 
 9712     //     assert(t0 == 0, "broken Montgomery multiply");
 9713 
 9714     //     t0 = t1; t1 = t2; t2 = 0;
 9715     //   }
 9716 
 9717     //   for (i = len; i < 2*len; i++) {
 9718     //     int start = i-len+1;
 9719     //     int end = start + (len - start)/2;
 9720     //     int j;
 9721 
 9722     //     Pa = Pa_base + i-len;
 9723     //     Pb = Pa_base + len;
 9724     //     Pm = Pm_base + i-len;
 9725     //     Pn = Pn_base + len;
 9726 
 9727     //     Ra = *++Pa;
 9728     //     Rb = *--Pb;
 9729     //     Rm = *++Pm;
 9730     //     Rn = *--Pn;
 9731 
 9732     //     int iters = (2*len-i-1)/2;
 9733     //     assert(iters == end-start, "must be");
 9734     //     for (j = start; iters--; j++) {
 9735     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
 9736     //       MACC2(Ra, Rb, t0, t1, t2);
 9737     //       Ra = *++Pa;
 9738     //       Rb = *--Pb;
 9739     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9740     //       MACC(Rm, Rn, t0, t1, t2);
 9741     //       Rm = *++Pm;
 9742     //       Rn = *--Pn;
 9743     //     }
 9744     //     if ((i & 1) == 0) {
 9745     //       assert(Ra == Pa_base[j], "must be");
 9746     //       MACC(Ra, Ra, t0, t1, t2);
 9747     //     }
    //     iters = (2*len-i)/2;
 9749     //     assert(iters == len-j, "must be");
 9750     //     for (; iters--; j++) {
 9751     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9752     //       MACC(Rm, Rn, t0, t1, t2);
 9753     //       Rm = *++Pm;
 9754     //       Rn = *--Pn;
 9755     //     }
 9756     //     Pm_base[i-len] = t0;
 9757     //     t0 = t1; t1 = t2; t2 = 0;
 9758     //   }
 9759 
 9760     //   while (t0)
 9761     //     t0 = sub(Pm_base, Pn_base, t0, len);
 9762     // }
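    // MACC and MACC2 above denote multiply-accumulate into the
    // triple-precision accumulator t2:t1:t0 (t0 least significant):
    // MACC adds a*b once, MACC2 adds 2*a*b for the doubled cross terms
    // of the square.  A rough portable sketch of MACC, assuming a
    // compiler that provides unsigned __int128 (the generated code
    // itself uses mul/umulh and add-with-carry chains):
    //
    // static void MACC(julong a, julong b,
    //                  julong *t0, julong *t1, julong *t2) {
    //   unsigned __int128 prod = (unsigned __int128)a * b;
    //   // Add the low word of the product, remembering the carry.
    //   unsigned __int128 sum = (unsigned __int128)*t0 + (julong)prod;
    //   *t0 = (julong)sum;
    //   // Add the high word of the product plus the carry out.
    //   sum = (unsigned __int128)*t1 + (julong)(prod >> 64) + (julong)(sum >> 64);
    //   *t1 = (julong)sum;
    //   *t2 += (julong)(sum >> 64);
    // }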
 9763   };
 9764 
 9765   void generate_vector_math_stubs() {
 9766     // Get native vector math stub routine addresses
 9767     void* libsleef = nullptr;
    char ebuf[1024] = {0}; // Zeroed so the log below prints safely if dll_locate_lib fails.
 9769     char dll_name[JVM_MAXPATHLEN];
 9770     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
 9771       libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
 9772     }
 9773     if (libsleef == nullptr) {
 9774       log_info(library)("Failed to load native vector math library, %s!", ebuf);
 9775       return;
 9776     }
    // Method naming convention
    //   All the methods are named as <OP><T><N>_<U><suffix>
    //   Where:
    //     <OP>     is the operation name, e.g. sin
    //     <T>      indicates the element type:
    //              "f/d" for a vector float/double operation
    //     <N>      is the number of elements in the vector:
    //              "2/4" for neon, and "x" for sve
    //     <U>      is the precision level:
    //              "u10/u05" represents 1.0/0.5 ULP error bounds.
    //              We use "u10" for all operations by default, but for
    //              those functions that do not have u10 support, we use
    //              "u05" instead.
    //     <suffix> indicates the implementation:
    //              "sve/advsimd" for SVE/NEON implementations
    //     e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions,
    //          and cosd2_u10advsimd is the method for computing a 2-element vector double cos using NEON instructions.
    //
 9794     log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
 9795 
 9796     // Math vector stubs implemented with SVE for scalable vector size.
 9797     if (UseSVE > 0) {
 9798       for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
 9799         int vop = VectorSupport::VECTOR_OP_MATH_START + op;
        // Skip "tanh" because of a performance regression.
 9801         if (vop == VectorSupport::VECTOR_OP_TANH) {
 9802           continue;
 9803         }
 9804 
        // The native library does not support the u10 level of "hypot".
 9806         const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
 9807 
 9808         snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
 9809         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
 9810 
 9811         snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
 9812         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
 9813       }
 9814     }
 9815 
 9816     // Math vector stubs implemented with NEON for 64/128 bits vector size.
 9817     for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
 9818       int vop = VectorSupport::VECTOR_OP_MATH_START + op;
      // Skip "tanh" because of a performance regression.
 9820       if (vop == VectorSupport::VECTOR_OP_TANH) {
 9821         continue;
 9822       }
 9823 
      // The native library does not support the u10 level of "hypot".
 9825       const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
 9826 
      // SLEEF provides only the 4-element (128-bit) float variant for
      // NEON, so the 64-bit float entry reuses the same "f4" stub.
      snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
      StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
      StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
 9832 
 9833       snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
 9834       StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
 9835     }
 9836   }
 9837 
 9838   // Initialization
 9839   void generate_initial_stubs() {
    // Generate the initial stubs and initialize the entry points.
 9841 
    // Entry points that exist on all platforms. Note: This is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
 9847 
 9848     StubRoutines::_forward_exception_entry = generate_forward_exception();
 9849 
 9850     StubRoutines::_call_stub_entry =
 9851       generate_call_stub(StubRoutines::_call_stub_return_address);
 9852 
    // This entry point is referenced by megamorphic calls.
 9854     StubRoutines::_catch_exception_entry = generate_catch_exception();
 9855 
 9856     // Initialize table for copy memory (arraycopy) check.
 9857     if (UnsafeMemoryAccess::_table == nullptr) {
 9858       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
 9859     }
 9860 
 9861     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
 9863       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
 9864       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
 9865     }
 9866 
 9867     if (UseCRC32CIntrinsics) {
 9868       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
 9869     }
 9870 
 9871     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
 9872       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
 9873     }
 9874 
 9875     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
 9876       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
 9877     }
 9878 
 9879     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
 9880         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
 9881       StubRoutines::_hf2f = generate_float16ToFloat();
 9882       StubRoutines::_f2hf = generate_floatToFloat16();
 9883     }
 9884   }
 9885 
 9886   void generate_continuation_stubs() {
 9887     // Continuation stubs:
 9888     StubRoutines::_cont_thaw          = generate_cont_thaw();
 9889     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
 9890     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
 9891     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
 9892   }
 9893 
 9894   void generate_final_stubs() {
 9895     // support for verify_oop (must happen after universe_init)
 9896     if (VerifyOops) {
 9897       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
 9898     }
 9899 
 9900     // arraycopy stubs used by compilers
 9901     generate_arraycopy_stubs();
 9902 
 9903     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 9904     if (bs_nm != nullptr) {
 9905       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
 9906     }
 9907 
 9908     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
 9909 
 9910     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
 9911     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
 9912 
 9913 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 9914 
 9915     generate_atomic_entry_points();
 9916 
#endif // LINUX && !__ARM_FEATURE_ATOMICS
 9918 
 9919 #ifdef COMPILER2
 9920     if (UseSecondarySupersTable) {
 9921       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
 9923         generate_lookup_secondary_supers_table_stub();
 9924       }
 9925     }
 9926 #endif
 9927 
    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
 9929   }
 9930 
 9931   void generate_compiler_stubs() {
 9932 #if COMPILER2_OR_JVMCI
 9933 
 9934     if (UseSVE == 0) {
 9935       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
 9936     }
 9937 
 9938     // array equals stub for large arrays.
 9939     if (!UseSimpleArrayEquals) {
 9940       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
 9941     }
 9942 
 9943     // arrays_hascode stub for large arrays.
 9944     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
 9945     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
 9946     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
 9947     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
 9948     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
 9949 
 9950     // byte_array_inflate stub for large arrays.
 9951     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
 9952 
 9953     // countPositives stub for large arrays.
 9954     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
 9955 
 9956     generate_compare_long_strings();
 9957 
 9958     generate_string_indexof_stubs();
 9959 
 9960 #ifdef COMPILER2
 9961     if (UseMultiplyToLenIntrinsic) {
 9962       StubRoutines::_multiplyToLen = generate_multiplyToLen();
 9963     }
 9964 
 9965     if (UseSquareToLenIntrinsic) {
 9966       StubRoutines::_squareToLen = generate_squareToLen();
 9967     }
 9968 
 9969     if (UseMulAddIntrinsic) {
 9970       StubRoutines::_mulAdd = generate_mulAdd();
 9971     }
 9972 
 9973     if (UseSIMDForBigIntegerShiftIntrinsics) {
 9974       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
 9975       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
 9976     }
 9977 
 9978     if (UseMontgomeryMultiplyIntrinsic) {
 9979       StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
 9980       StubCodeMark mark(this, stub_id);
 9981       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
 9982       StubRoutines::_montgomeryMultiply = g.generate_multiply();
 9983     }
 9984 
 9985     if (UseMontgomerySquareIntrinsic) {
 9986       StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
 9987       StubCodeMark mark(this, stub_id);
 9988       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
 9989       // We use generate_multiply() rather than generate_square()
 9990       // because it's faster for the sizes of modulus we care about.
 9991       StubRoutines::_montgomerySquare = g.generate_multiply();
 9992     }
 9993 
 9994     generate_vector_math_stubs();
 9995 
 9996 #endif // COMPILER2
 9997 
 9998     if (UseChaCha20Intrinsics) {
 9999       StubRoutines::_chacha20Block = generate_chacha20Block_qrpar();
10000     }
10001 
10002     if (UseDilithiumIntrinsics) {
10003       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
10004       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
10005       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
10006       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
10007       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
10008     }
10009 
10010     if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
10013     }
10014 
10015     // data cache line writeback
10016     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
10017     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
10018 
10019     if (UseAESIntrinsics) {
10020       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
10021       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
10022       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
10023       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
10024       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
10025     }
10026     if (UseGHASHIntrinsics) {
10027       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
10028       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
10029     }
10030     if (UseAESIntrinsics && UseGHASHIntrinsics) {
10031       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
10032     }
10033 
10034     if (UseMD5Intrinsics) {
10035       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
10036       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
10037     }
10038     if (UseSHA1Intrinsics) {
10039       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
10040       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
10041     }
10042     if (UseSHA256Intrinsics) {
10043       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
10044       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
10045     }
10046     if (UseSHA512Intrinsics) {
10047       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
10048       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
10049     }
10050     if (UseSHA3Intrinsics) {
10051       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
10052       StubRoutines::_double_keccak         = generate_double_keccak();
10053       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
10054     }
10055 
10056     if (UsePoly1305Intrinsics) {
10057       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
10058     }
10059 
10060     // generate Adler32 intrinsics code
10061     if (UseAdler32Intrinsics) {
10062       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
10063     }
10064 
10065 #endif // COMPILER2_OR_JVMCI
10066   }
10067 
10068  public:
10069   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
10070     switch(blob_id) {
10071     case initial_id:
10072       generate_initial_stubs();
10073       break;
    case continuation_id:
10075       generate_continuation_stubs();
10076       break;
10077     case compiler_id:
10078       generate_compiler_stubs();
10079       break;
10080     case final_id:
10081       generate_final_stubs();
10082       break;
10083     default:
10084       fatal("unexpected blob id: %d", blob_id);
10085       break;
    }
10087   }
10088 }; // end class declaration
10089 
10090 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
10091   StubGenerator g(code, blob_id);
10092 }
10093 
10094 
10095 #if defined (LINUX)
10096 
10097 // Define pointers to atomic stubs and initialize them to point to the
10098 // code in atomic_aarch64.S.
10099 
10100 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
10101   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
10102     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
10103   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
10104     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
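// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// i.e. a declaration of the assembly default implementation and a
// dispatch pointer initialized to it, which generate_atomic_entry_points()
// can later repoint at a generated stub.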
10105 
10106 DEFAULT_ATOMIC_OP(fetch_add, 4, )
10107 DEFAULT_ATOMIC_OP(fetch_add, 8, )
10108 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
10109 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
10110 DEFAULT_ATOMIC_OP(xchg, 4, )
10111 DEFAULT_ATOMIC_OP(xchg, 8, )
10112 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
10113 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
10114 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
10115 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
10116 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
10117 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
10118 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
10119 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
10120 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
10121 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
10122 
10123 #undef DEFAULT_ATOMIC_OP
10124 
10125 #endif // LINUX