1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomic.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
   87 #ifdef PRODUCT
   88 #define inc_counter_np(counter) ((void)0)
   89 #else
   90   void inc_counter_np_(uint& counter) {
   91     __ incrementw(ExternalAddress((address)&counter));
   92   }
   93 #define inc_counter_np(counter) \
   94   BLOCK_COMMENT("inc_counter " #counter); \
   95   inc_counter_np_(counter);
   96 #endif
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
  112   //
  113   // we save r30 (lr) as the return PC at the base of the frame and
  114   // save r29 (fp) immediately below it, then install sp (r31) into
  115   // r29 so that it serves as the frame pointer.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  124   // we don't need to save r8 which C uses as an indirect result location
  125   // return register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
  130   // we don't need to save r16-r18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
  144   // -28 [ saved Floating-point Control Register ]
  145   // -26 [ saved v15            ] <--- sp_after_call
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
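        //
        // As an illustrative sketch only (see the CallStub typedef in
        // stubRoutines.hpp and JavaCalls::call_helper; the argument names here
        // are ours), the stub is reached from C++ roughly as
        //
        //   StubRoutines::call_stub()(call_wrapper, result, result_type, method,
        //                             entry_point, parameters, parameter_size, thread);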
  173 
  174   // Call stub stack layout word offsets from fp
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
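
        // For example, with wordSize == 8 the save area spans 28 words below the
        // saved fp: sp ends up at rfp - 28 * 8 = rfp - 224, which is also where
        // the saved FPCR word lives (fpcr_off == -28), and the v9/v8 pair saved
        // via d9_off == -20 lands at rfp - 160 / rfp - 152, matching the diagram.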
  199 
  200   address generate_call_stub(address& return_address) {
  201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  203            "adjust this code");
  204 
  205     StubGenStubId stub_id = StubGenStubId::call_stub_id;
  206     StubCodeMark mark(this, stub_id);
  207     address start = __ pc();
  208 
  209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  210 
  211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  213     const Address result        (rfp, result_off         * wordSize);
  214     const Address result_type   (rfp, result_type_off    * wordSize);
  215     const Address method        (rfp, method_off         * wordSize);
  216     const Address entry_point   (rfp, entry_point_off    * wordSize);
  217     const Address parameter_size(rfp, parameter_size_off * wordSize);
  218 
  219     const Address thread        (rfp, thread_off         * wordSize);
  220 
  221     const Address d15_save      (rfp, d15_off * wordSize);
  222     const Address d13_save      (rfp, d13_off * wordSize);
  223     const Address d11_save      (rfp, d11_off * wordSize);
  224     const Address d9_save       (rfp, d9_off * wordSize);
  225 
  226     const Address r28_save      (rfp, r28_off * wordSize);
  227     const Address r26_save      (rfp, r26_off * wordSize);
  228     const Address r24_save      (rfp, r24_off * wordSize);
  229     const Address r22_save      (rfp, r22_off * wordSize);
  230     const Address r20_save      (rfp, r20_off * wordSize);
  231 
  232     // stub code
  233 
  234     address aarch64_entry = __ pc();
  235 
  236     // set up frame and move sp to end of save area
  237     __ enter();
  238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  239 
  240     // save register parameters and Java scratch/global registers
  241     // n.b. we save thread even though it gets installed in
  242     // rthread because we want to sanity check rthread later
  243     __ str(c_rarg7,  thread);
  244     __ strw(c_rarg6, parameter_size);
  245     __ stp(c_rarg4, c_rarg5,  entry_point);
  246     __ stp(c_rarg2, c_rarg3,  result_type);
  247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  248 
  249     __ stp(r20, r19,   r20_save);
  250     __ stp(r22, r21,   r22_save);
  251     __ stp(r24, r23,   r24_save);
  252     __ stp(r26, r25,   r26_save);
  253     __ stp(r28, r27,   r28_save);
  254 
  255     __ stpd(v9,  v8,   d9_save);
  256     __ stpd(v11, v10,  d11_save);
  257     __ stpd(v13, v12,  d13_save);
  258     __ stpd(v15, v14,  d15_save);
  259 
  260     __ get_fpcr(rscratch1);
  261     __ str(rscratch1, fpcr_save);
  262     // Set FPCR to the state we need. We do want Round to Nearest. We
  263     // don't want non-IEEE rounding modes or floating-point traps.
  264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  266     __ set_fpcr(rscratch1);
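          // In FPCR, bits 22-25 hold RMode (22-23), FZ (24) and DN (25), and bits
          // 8-12 are the trap-enable bits (IOE, DZE, OFE, UFE, IXE), so the two
          // bfi instructions above select round-to-nearest, no flush-to-zero,
          // NaN propagation, and untrapped floating-point exception handling.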
  267 
  268     // install Java thread in global register now that we have saved
  269     // whatever value it held
  270     __ mov(rthread, c_rarg7);
  271     // And method
  272     __ mov(rmethod, c_rarg3);
  273 
  274     // set up the heapbase register
  275     __ reinit_heapbase();
  276 
  277 #ifdef ASSERT
  278     // make sure we have no pending exceptions
  279     {
  280       Label L;
  281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  282       __ cmp(rscratch1, (u1)NULL_WORD);
  283       __ br(Assembler::EQ, L);
  284       __ stop("StubRoutines::call_stub: entered with pending exception");
  285       __ BIND(L);
  286     }
  287 #endif
  288     // pass parameters if any
  289     __ mov(esp, sp);
  290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  291     __ andr(sp, rscratch1, -2 * wordSize);
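          // The sub above makes room for the incoming parameters below the save
          // area; and-ing with -2 * wordSize then rounds sp down to the 16-byte
          // alignment the AArch64 ABI requires before we start pushing them.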
  292 
  293     BLOCK_COMMENT("pass parameters if any");
  294     Label parameters_done;
  295     // parameter count is still in c_rarg6
  296     // and parameter pointer identifying param 1 is in c_rarg5
  297     __ cbzw(c_rarg6, parameters_done);
  298 
  299     address loop = __ pc();
  300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  301     __ subsw(c_rarg6, c_rarg6, 1);
  302     __ push(rscratch1);
  303     __ br(Assembler::GT, loop);
  304 
  305     __ BIND(parameters_done);
  306 
  307     // call Java entry -- passing Method* and current sp
  308     //      rmethod: Method*
  309     //      r19_sender_sp: sender sp
  310     BLOCK_COMMENT("call Java function");
  311     __ mov(r19_sender_sp, sp);
  312     __ blr(c_rarg4);
  313 
  314     // we do this here because the notify will already have been done
  315     // if we get to the next instruction via an exception
  316     //
  317     // n.b. adding this instruction here affects the calculation of
  318     // whether or not a routine returns to the call stub (used when
  319     // doing stack walks) since the normal test is to check the return
  320     // pc against the address saved below. so we may need to allow for
  321     // this extra instruction in the check.
  322 
  323     // save current address for use by exception handling code
  324 
  325     return_address = __ pc();
  326 
  327     // store result depending on type (everything that is not
  328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  329     // n.b. this assumes Java returns an integral result in r0
  330     // and a floating result in j_farg0
  331     // All of j_rargN may be used to return inline type fields so be careful
  332     // not to clobber those.
  333     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
  334     // assignment of Rresult below.
  335     Register Rresult = r14, Rresult_type = r15;
  336     __ ldr(Rresult, result);
  337     Label is_long, is_float, is_double, check_prim, exit;
  338     __ ldr(Rresult_type, result_type);
  339     __ cmp(Rresult_type, (u1)T_OBJECT);
  340     __ br(Assembler::EQ, check_prim);
  341     __ cmp(Rresult_type, (u1)T_LONG);
  342     __ br(Assembler::EQ, is_long);
  343     __ cmp(Rresult_type, (u1)T_FLOAT);
  344     __ br(Assembler::EQ, is_float);
  345     __ cmp(Rresult_type, (u1)T_DOUBLE);
  346     __ br(Assembler::EQ, is_double);
  347 
  348     // handle T_INT case
  349     __ strw(r0, Address(Rresult));
  350 
  351     __ BIND(exit);
  352 
  353     // pop parameters
  354     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  355 
  356 #ifdef ASSERT
  357     // verify that threads correspond
  358     {
  359       Label L, S;
  360       __ ldr(rscratch1, thread);
  361       __ cmp(rthread, rscratch1);
  362       __ br(Assembler::NE, S);
  363       __ get_thread(rscratch1);
  364       __ cmp(rthread, rscratch1);
  365       __ br(Assembler::EQ, L);
  366       __ BIND(S);
  367       __ stop("StubRoutines::call_stub: threads must correspond");
  368       __ BIND(L);
  369     }
  370 #endif
  371 
  372     __ pop_cont_fastpath(rthread);
  373 
  374     // restore callee-save registers
  375     __ ldpd(v15, v14,  d15_save);
  376     __ ldpd(v13, v12,  d13_save);
  377     __ ldpd(v11, v10,  d11_save);
  378     __ ldpd(v9,  v8,   d9_save);
  379 
  380     __ ldp(r28, r27,   r28_save);
  381     __ ldp(r26, r25,   r26_save);
  382     __ ldp(r24, r23,   r24_save);
  383     __ ldp(r22, r21,   r22_save);
  384     __ ldp(r20, r19,   r20_save);
  385 
  386     // restore fpcr
  387     __ ldr(rscratch1,  fpcr_save);
  388     __ set_fpcr(rscratch1);
  389 
  390     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  391     __ ldrw(c_rarg2, result_type);
  392     __ ldr(c_rarg3,  method);
  393     __ ldp(c_rarg4, c_rarg5,  entry_point);
  394     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  395 
  396     // leave frame and return to caller
  397     __ leave();
  398     __ ret(lr);
  399 
  400     // handle return types different from T_INT
  401     __ BIND(check_prim);
  402     if (InlineTypeReturnedAsFields) {
  403       // Check for scalarized return value
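            // When the result comes back scalarized, r0 holds not an oop but the
            // klass pointer with its low bit set as a tag, hence the andr with -2
            // below to recover the InlineKlass before loading its pack handler.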
  404       __ tbz(r0, 0, is_long);
  405       // Load pack handler address
  406       __ andr(rscratch1, r0, -2);
  407       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
  408       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
  409       __ blr(rscratch1);
  410       __ b(exit);
  411     }
  412 
  413     __ BIND(is_long);
  414     __ str(r0, Address(Rresult, 0));
  415     __ br(Assembler::AL, exit);
  416 
  417     __ BIND(is_float);
  418     __ strs(j_farg0, Address(Rresult, 0));
  419     __ br(Assembler::AL, exit);
  420 
  421     __ BIND(is_double);
  422     __ strd(j_farg0, Address(Rresult, 0));
  423     __ br(Assembler::AL, exit);
  424 
  425     return start;
  426   }
  427 
  428   // Return point for a Java call if there's an exception thrown in
  429   // Java code.  The exception is caught and transformed into a
  430   // pending exception stored in JavaThread that can be tested from
  431   // within the VM.
  432   //
  433   // Note: Usually the parameters are removed by the callee. In case
  434   // of an exception crossing an activation frame boundary, that is
  435   // not the case if the callee is compiled code => need to set up
  436   // sp.
  437   //
  438   // r0: exception oop
  439 
  440   address generate_catch_exception() {
  441     StubGenStubId stub_id = StubGenStubId::catch_exception_id;
  442     StubCodeMark mark(this, stub_id);
  443     address start = __ pc();
  444 
  445     // same as in generate_call_stub():
  446     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  447     const Address thread        (rfp, thread_off         * wordSize);
  448 
  449 #ifdef ASSERT
  450     // verify that threads correspond
  451     {
  452       Label L, S;
  453       __ ldr(rscratch1, thread);
  454       __ cmp(rthread, rscratch1);
  455       __ br(Assembler::NE, S);
  456       __ get_thread(rscratch1);
  457       __ cmp(rthread, rscratch1);
  458       __ br(Assembler::EQ, L);
  459       __ bind(S);
  460       __ stop("StubRoutines::catch_exception: threads must correspond");
  461       __ bind(L);
  462     }
  463 #endif
  464 
  465     // set pending exception
  466     __ verify_oop(r0);
  467 
  468     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  469     __ mov(rscratch1, (address)__FILE__);
  470     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  471     __ movw(rscratch1, (int)__LINE__);
  472     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  473 
  474     // complete return to VM
  475     assert(StubRoutines::_call_stub_return_address != nullptr,
  476            "_call_stub_return_address must have been generated before");
  477     __ b(StubRoutines::_call_stub_return_address);
  478 
  479     return start;
  480   }
  481 
  482   // Continuation point for runtime calls returning with a pending
  483   // exception.  The pending exception check happened in the runtime
  484   // or native call stub.  The pending exception in Thread is
  485   // converted into a Java-level exception.
  486   //
  487   // Contract with Java-level exception handlers:
  488   // r0: exception
  489   // r3: throwing pc
  490   //
  491   // NOTE: At entry of this stub, exception-pc must be in LR !!
  492 
  493   // NOTE: this is always used as a jump target within generated code
  494   // so it just needs to be generated code with no prolog
  495 
  496   address generate_forward_exception() {
  497     StubGenStubId stub_id = StubGenStubId::forward_exception_id;
  498     StubCodeMark mark(this, stub_id);
  499     address start = __ pc();
  500 
  501     // Upon entry, LR points to the return address returning into
  502     // Java (interpreted or compiled) code; i.e., the return address
  503     // becomes the throwing pc.
  504     //
  505     // Arguments pushed before the runtime call are still on the stack
  506     // but the exception handler will reset the stack pointer ->
  507     // ignore them.  A potential result in registers can be ignored as
  508     // well.
  509 
  510 #ifdef ASSERT
  511     // make sure this code is only executed if there is a pending exception
  512     {
  513       Label L;
  514       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  515       __ cbnz(rscratch1, L);
  516       __ stop("StubRoutines::forward exception: no pending exception (1)");
  517       __ bind(L);
  518     }
  519 #endif
  520 
  521     // compute exception handler into r19
  522 
  523     // call the VM to find the handler address associated with the
  524     // caller address. pass thread in r0 and caller pc (ret address)
  525     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  526     // the stack.
  527     __ mov(c_rarg1, lr);
  528     // lr will be trashed by the VM call so we move it to R19
  529     // (callee-saved) because we also need to pass it to the handler
  530     // returned by this call.
  531     __ mov(r19, lr);
  532     BLOCK_COMMENT("call exception_handler_for_return_address");
  533     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  534                          SharedRuntime::exception_handler_for_return_address),
  535                     rthread, c_rarg1);
  536     // Reinitialize the ptrue predicate register, in case the external runtime
  537     // call clobbers ptrue reg, as we may return to SVE compiled code.
  538     __ reinitialize_ptrue();
  539 
  540     // we should not really care that lr is no longer the callee
  541     // address. we saved the value the handler needs in r19 so we can
  542     // just copy it to r3. however, the C2 handler will push its own
  543     // frame and then call into the VM, and the VM code asserts that
  544     // the PC for the frame above the handler belongs to a compiled
  545     // Java method. So, we restore lr here to satisfy that assert.
  546     __ mov(lr, r19);
  547     // setup r0 & r3 & clear pending exception
  548     __ mov(r3, r19);
  549     __ mov(r19, r0);
  550     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  551     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  552 
  553 #ifdef ASSERT
  554     // make sure exception is set
  555     {
  556       Label L;
  557       __ cbnz(r0, L);
  558       __ stop("StubRoutines::forward exception: no pending exception (2)");
  559       __ bind(L);
  560     }
  561 #endif
  562 
  563     // continue at exception handler
  564     // r0: exception
  565     // r3: throwing pc
  566     // r19: exception handler
  567     __ verify_oop(r0);
  568     __ br(r19);
  569 
  570     return start;
  571   }
  572 
  573   // Non-destructive plausibility checks for oops
  574   //
  575   // Arguments:
  576   //    r0: oop to verify
  577   //    rscratch1: error message
  578   //
  579   // Stack after saving c_rarg3:
  580   //    [tos + 0]: saved c_rarg3
  581   //    [tos + 1]: saved c_rarg2
  582   //    [tos + 2]: saved lr
  583   //    [tos + 3]: saved rscratch2
  584   //    [tos + 4]: saved r0
  585   //    [tos + 5]: saved rscratch1
  586   address generate_verify_oop() {
  587     StubGenStubId stub_id = StubGenStubId::verify_oop_id;
  588     StubCodeMark mark(this, stub_id);
  589     address start = __ pc();
  590 
  591     Label exit, error;
  592 
  593     // save c_rarg2 and c_rarg3
  594     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  595 
  596     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  597     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  598     __ ldr(c_rarg3, Address(c_rarg2));
  599     __ add(c_rarg3, c_rarg3, 1);
  600     __ str(c_rarg3, Address(c_rarg2));
  601 
  602     // object is in r0
  603     // make sure object is 'reasonable'
  604     __ cbz(r0, exit); // if obj is null it is OK
  605 
  606     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  607     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  608 
  609     // return if everything seems ok
  610     __ bind(exit);
  611 
  612     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  613     __ ret(lr);
  614 
  615     // handle errors
  616     __ bind(error);
  617     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  618 
  619     __ push(RegSet::range(r0, r29), sp);
  620     // debug(char* msg, int64_t pc, int64_t regs[])
  621     __ mov(c_rarg0, rscratch1);      // pass address of error message
  622     __ mov(c_rarg1, lr);             // pass return address
  623     __ mov(c_rarg2, sp);             // pass address of regs on stack
  624 #ifndef PRODUCT
  625     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  626 #endif
  627     BLOCK_COMMENT("call MacroAssembler::debug");
  628     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  629     __ blr(rscratch1);
  630     __ hlt(0);
  631 
  632     return start;
  633   }
  634 
  635   // Generate indices for iota vector.
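        // Each row of the table is emitted as two little-endian 64-bit words, so
        // lane 0 occupies the lowest-addressed bytes; e.g. the H row
        // 0x0003000200010000 / 0x0007000600050004 reads as halfword lanes
        // 0, 1, ..., 7 in memory order.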
  636   address generate_iota_indices(StubGenStubId stub_id) {
  637     __ align(CodeEntryAlignment);
  638     StubCodeMark mark(this, stub_id);
  639     address start = __ pc();
  640     // B
  641     __ emit_data64(0x0706050403020100, relocInfo::none);
  642     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  643     // H
  644     __ emit_data64(0x0003000200010000, relocInfo::none);
  645     __ emit_data64(0x0007000600050004, relocInfo::none);
  646     // S
  647     __ emit_data64(0x0000000100000000, relocInfo::none);
  648     __ emit_data64(0x0000000300000002, relocInfo::none);
  649     // D
  650     __ emit_data64(0x0000000000000000, relocInfo::none);
  651     __ emit_data64(0x0000000000000001, relocInfo::none);
  652     // S - FP
  653     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  654     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  655     // D - FP
  656     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  657     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  658     return start;
  659   }
  660 
  661   // The inner part of zero_words().  This is the bulk operation,
  662   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  663   // caller is responsible for zeroing the last few words.
  664   //
  665   // Inputs:
  666   // r10: the HeapWord-aligned base address of an array to zero.
  667   // r11: the count in HeapWords, r11 > 0.
  668   //
  669   // Returns r10 and r11, adjusted for the caller to clear.
  670   // r10: the base address of the tail of words left to clear.
  671   // r11: the number of words in the tail.
  672   //      r11 < MacroAssembler::zero_words_block_size.
  673 
  674   address generate_zero_blocks() {
  675     Label done;
  676     Label base_aligned;
  677 
  678     Register base = r10, cnt = r11;
  679 
  680     __ align(CodeEntryAlignment);
  681     StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
  682     StubCodeMark mark(this, stub_id);
  683     address start = __ pc();
  684 
  685     if (UseBlockZeroing) {
  686       int zva_length = VM_Version::zva_length();
  687 
  688       // Ensure the ZVA length is a multiple of 16. This is required by
  689       // the subsequent operations.
  690       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  691 
  692       __ tbz(base, 3, base_aligned);
  693       __ str(zr, Address(__ post(base, 8)));
  694       __ sub(cnt, cnt, 1);
  695       __ bind(base_aligned);
  696 
  697       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  698       // alignment.
  699       Label small;
  700       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
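            // low_limit is in bytes while cnt is in words, hence the >> 3
            // (i.e. division by wordSize) in the comparison below.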
  701       __ subs(rscratch1, cnt, low_limit >> 3);
  702       __ br(Assembler::LT, small);
  703       __ zero_dcache_blocks(base, cnt);
  704       __ bind(small);
  705     }
  706 
  707     {
  708       // Number of stp instructions we'll unroll
  709       const int unroll =
  710         MacroAssembler::zero_words_block_size / 2;
  711       // Clear the remaining blocks.
  712       Label loop;
  713       __ subs(cnt, cnt, unroll * 2);
  714       __ br(Assembler::LT, done);
  715       __ bind(loop);
  716       for (int i = 0; i < unroll; i++)
  717         __ stp(zr, zr, __ post(base, 16));
  718       __ subs(cnt, cnt, unroll * 2);
  719       __ br(Assembler::GE, loop);
  720       __ bind(done);
  721       __ add(cnt, cnt, unroll * 2);
  722     }
  723 
  724     __ ret(lr);
  725 
  726     return start;
  727   }
  728 
  729 
  730   typedef enum {
  731     copy_forwards = 1,
  732     copy_backwards = -1
  733   } copy_direction;
  734 
  735   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  736   // for arraycopy stubs.
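        // The copy_load_at/copy_store_at calls below route loads and stores
        // through the BarrierSetAssembler so that a collector can apply its
        // barriers when oop arrays are copied; the gct*/gcvt* registers are
        // scratch registers the barrier code is allowed to clobber.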
  737   class ArrayCopyBarrierSetHelper : StackObj {
  738     BarrierSetAssembler* _bs_asm;
  739     MacroAssembler* _masm;
  740     DecoratorSet _decorators;
  741     BasicType _type;
  742     Register _gct1;
  743     Register _gct2;
  744     Register _gct3;
  745     FloatRegister _gcvt1;
  746     FloatRegister _gcvt2;
  747     FloatRegister _gcvt3;
  748 
  749   public:
  750     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  751                               DecoratorSet decorators,
  752                               BasicType type,
  753                               Register gct1,
  754                               Register gct2,
  755                               Register gct3,
  756                               FloatRegister gcvt1,
  757                               FloatRegister gcvt2,
  758                               FloatRegister gcvt3)
  759       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  760         _masm(masm),
  761         _decorators(decorators),
  762         _type(type),
  763         _gct1(gct1),
  764         _gct2(gct2),
  765         _gct3(gct3),
  766         _gcvt1(gcvt1),
  767         _gcvt2(gcvt2),
  768         _gcvt3(gcvt3) {
  769     }
  770 
  771     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  772       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  773                             dst1, dst2, src,
  774                             _gct1, _gct2, _gcvt1);
  775     }
  776 
  777     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  778       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  779                              dst, src1, src2,
  780                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  781     }
  782 
  783     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  784       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  785                             dst1, dst2, src,
  786                             _gct1);
  787     }
  788 
  789     void copy_store_at_16(Address dst, Register src1, Register src2) {
  790       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  791                              dst, src1, src2,
  792                              _gct1, _gct2, _gct3);
  793     }
  794 
  795     void copy_load_at_8(Register dst, Address src) {
  796       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  797                             dst, noreg, src,
  798                             _gct1);
  799     }
  800 
  801     void copy_store_at_8(Address dst, Register src) {
  802       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  803                              dst, src, noreg,
  804                              _gct1, _gct2, _gct3);
  805     }
  806   };
  807 
  808   // Bulk copy of blocks of 8 words.
  809   //
  810   // count is a count of words.
  811   //
  812   // Precondition: count >= 8
  813   //
  814   // Postconditions:
  815   //
  816   // The least significant bit of count contains the remaining count
  817   // of words to copy.  The rest of count is trash.
  818   //
  819   // s and d are adjusted to point to the remaining words to copy
  820   //
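        // For example, with count == 23 the code copies 16 words through the main
        // loop and the drain, then a 4-word and a 2-word subblock, leaving
        // count & 1 == 1 word for the caller to copy.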
  821   void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
  822     BasicType type;
  823     copy_direction direction;
  824 
  825     switch (stub_id) {
  826     case copy_byte_f_id:
  827       direction = copy_forwards;
  828       type = T_BYTE;
  829       break;
  830     case copy_byte_b_id:
  831       direction = copy_backwards;
  832       type = T_BYTE;
  833       break;
  834     case copy_oop_f_id:
  835       direction = copy_forwards;
  836       type = T_OBJECT;
  837       break;
  838     case copy_oop_b_id:
  839       direction = copy_backwards;
  840       type = T_OBJECT;
  841       break;
  842     case copy_oop_uninit_f_id:
  843       direction = copy_forwards;
  844       type = T_OBJECT;
  845       break;
  846     case copy_oop_uninit_b_id:
  847       direction = copy_backwards;
  848       type = T_OBJECT;
  849       break;
  850     default:
  851       ShouldNotReachHere();
  852     }
  853 
  854     int unit = wordSize * direction;
  855     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
  856 
  857     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  858       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  859     const Register stride = r14;
  860     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  861     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  862     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  863 
  864     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  865     assert_different_registers(s, d, count, rscratch1, rscratch2);
  866 
  867     Label again, drain;
  868 
  869     __ align(CodeEntryAlignment);
  870 
  871     StubCodeMark mark(this, stub_id);
  872 
  873     __ bind(start);
  874 
  875     Label unaligned_copy_long;
  876     if (AvoidUnalignedAccesses) {
  877       __ tbnz(d, 3, unaligned_copy_long);
  878     }
  879 
  880     if (direction == copy_forwards) {
  881       __ sub(s, s, bias);
  882       __ sub(d, d, bias);
  883     }
  884 
  885 #ifdef ASSERT
  886     // Make sure we are never given < 8 words
  887     {
  888       Label L;
  889       __ cmp(count, (u1)8);
  890       __ br(Assembler::GE, L);
  891       __ stop("generate_copy_longs called with < 8 words");
  892       __ bind(L);
  893     }
  894 #endif
  895 
  896     // Fill 8 registers
  897     if (UseSIMDForMemoryOps) {
  898       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  899       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  900     } else {
  901       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  902       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  903       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  904       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  905     }
  906 
  907     __ subs(count, count, 16);
  908     __ br(Assembler::LO, drain);
  909 
  910     int prefetch = PrefetchCopyIntervalInBytes;
  911     bool use_stride = false;
  912     if (direction == copy_backwards) {
  913        use_stride = prefetch > 256;
  914        prefetch = -prefetch;
  915        if (use_stride) __ mov(stride, prefetch);
  916     }
  917 
  918     __ bind(again);
  919 
  920     if (PrefetchCopyIntervalInBytes > 0)
  921       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  922 
  923     if (UseSIMDForMemoryOps) {
  924       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  925       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  926       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  927       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  928     } else {
  929       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  930       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  931       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  932       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  933       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  934       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  937     }
  938 
  939     __ subs(count, count, 8);
  940     __ br(Assembler::HS, again);
  941 
  942     // Drain
  943     __ bind(drain);
  944     if (UseSIMDForMemoryOps) {
  945       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  946       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  947     } else {
  948       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  949       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  950       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  951       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  952     }
  953 
  954     {
  955       Label L1, L2;
  956       __ tbz(count, exact_log2(4), L1);
  957       if (UseSIMDForMemoryOps) {
  958         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  959         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  960       } else {
  961         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  962         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  963         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  964         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  965       }
  966       __ bind(L1);
  967 
  968       if (direction == copy_forwards) {
  969         __ add(s, s, bias);
  970         __ add(d, d, bias);
  971       }
  972 
  973       __ tbz(count, 1, L2);
  974       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  975       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  976       __ bind(L2);
  977     }
  978 
  979     __ ret(lr);
  980 
  981     if (AvoidUnalignedAccesses) {
  982       Label drain, again;
  983       // Register order for storing. Order is different for backward copy.
  984 
  985       __ bind(unaligned_copy_long);
  986 
  987       // source address is even-word (16 byte) aligned, target odd-word aligned
  988       //
  989       // when forward copying word pairs we read long pairs at offsets
  990       // {0, 2, 4, 6} (in long words). when backwards copying we read
  991       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  992       // address by -2 in the forwards case so we can compute the
  993       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  994       // or -1.
  995       //
  996       // when forward copying we need to store 1 word, 3 pairs and
  997       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
  998       // zero offset we adjust the destination by -1, which means we
  999       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 1000       //
 1001       // When backwards copying we need to store 1 word, 3 pairs and
 1002       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 1003       // offsets {1, 3, 5, 7, 8} * unit.
 1004 
 1005       if (direction == copy_forwards) {
 1006         __ sub(s, s, 16);
 1007         __ sub(d, d, 8);
 1008       }
 1009 
 1010       // Fill 8 registers
 1011       //
 1012       // for forwards copy s was offset by -16 from the original input
 1013       // value of s so the register contents are at these offsets
 1014       // relative to the 64 byte block addressed by that original input
 1015       // and so on for each successive 64 byte block when s is updated
 1016       //
 1017       // t0 at offset 0,  t1 at offset 8
 1018       // t2 at offset 16, t3 at offset 24
 1019       // t4 at offset 32, t5 at offset 40
 1020       // t6 at offset 48, t7 at offset 56
 1021 
 1022       // for backwards copy s was not offset so the register contents
 1023       // are at these offsets into the preceding 64 byte block
 1024       // relative to that original input and so on for each successive
 1025       // preceding 64 byte block when s is updated. this explains the
 1026       // slightly counter-intuitive looking pattern of register usage
 1027       // in the stp instructions for backwards copy.
 1028       //
 1029       // t0 at offset -16, t1 at offset -8
 1030       // t2 at offset -32, t3 at offset -24
 1031       // t4 at offset -48, t5 at offset -40
 1032       // t6 at offset -64, t7 at offset -56
 1033 
 1034       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1035       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1036       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1037       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1038 
 1039       __ subs(count, count, 16);
 1040       __ br(Assembler::LO, drain);
 1041 
 1042       int prefetch = PrefetchCopyIntervalInBytes;
 1043       bool use_stride = false;
 1044       if (direction == copy_backwards) {
 1045          use_stride = prefetch > 256;
 1046          prefetch = -prefetch;
 1047          if (use_stride) __ mov(stride, prefetch);
 1048       }
 1049 
 1050       __ bind(again);
 1051 
 1052       if (PrefetchCopyIntervalInBytes > 0)
 1053         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1054 
 1055       if (direction == copy_forwards) {
 1056        // allowing for the offset of -8 the store instructions place
 1057        // registers into the target 64 byte block at the following
 1058        // offsets
 1059        //
 1060        // t0 at offset 0
 1061        // t1 at offset 8,  t2 at offset 16
 1062        // t3 at offset 24, t4 at offset 32
 1063        // t5 at offset 40, t6 at offset 48
 1064        // t7 at offset 56
 1065 
 1066         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1067         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1068         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1069         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1070         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1071         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1072         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1073         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1074         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1075       } else {
 1076        // d was not offset when we started so the registers are
 1077        // written into the 64 byte block preceding d with the following
 1078        // offsets
 1079        //
 1080        // t1 at offset -8
 1081        // t3 at offset -24, t0 at offset -16
 1082        // t5 at offset -40, t2 at offset -32
 1083        // t7 at offset -56, t4 at offset -48
 1084        //                   t6 at offset -64
 1085        //
 1086        // note that this matches the offsets previously noted for the
 1087        // loads
 1088 
 1089         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1090         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1091         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1092         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1093         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1094         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1095         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1096         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1097         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1098       }
 1099 
 1100       __ subs(count, count, 8);
 1101       __ br(Assembler::HS, again);
 1102 
 1103       // Drain
 1104       //
 1105       // this uses the same pattern of offsets and register arguments
 1106       // as above
 1107       __ bind(drain);
 1108       if (direction == copy_forwards) {
 1109         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1110         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1111         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1112         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1113         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1114       } else {
 1115         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1116         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1117         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1118         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1119         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1120       }
 1121       // now we need to copy any remaining part block which may
 1122       // include a 4 word subblock and/or a 2 word subblock.
 1123       // bits 2 and 1 in the count are the tell-tale for whether we
 1124       // have each such subblock
 1125       {
 1126         Label L1, L2;
 1127         __ tbz(count, exact_log2(4), L1);
 1128        // this is the same as above but copies only 4 longs, hence
 1129        // there is only one intervening stp between the str instructions,
 1130        // but note that the offsets and registers still follow the
 1131        // same pattern
 1132         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1133         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1134         if (direction == copy_forwards) {
 1135           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1136           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1137           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1141           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1142         }
 1143         __ bind(L1);
 1144 
 1145         __ tbz(count, 1, L2);
 1146        // this is the same as above but copies only 2 longs, hence
 1147        // there is no intervening stp between the str instructions,
 1148        // but note that the offset and register patterns are still
 1149        // the same
 1150         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1151         if (direction == copy_forwards) {
 1152           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1153           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1154         } else {
 1155           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1156           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1157         }
 1158         __ bind(L2);
 1159 
 1160        // for forwards copy we need to re-adjust the offsets we
 1161        // applied so that s and d follow the last words written
 1162 
 1163        if (direction == copy_forwards) {
 1164          __ add(s, s, 16);
 1165          __ add(d, d, 8);
 1166        }
 1167 
 1168       }
 1169 
 1170       __ ret(lr);
 1171       }
 1172   }
 1173 
 1174   // Small copy: less than 16 bytes.
 1175   //
 1176   // NB: Ignores all of the bits of count which represent more than 15
 1177   // bytes, so a caller doesn't have to mask them.
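        //
        // The copy tests successive bits of the count: bit 3 of the byte count
        // selects an 8-byte move, then bit 2 a 4-byte move, bit 1 a 2-byte move
        // and bit 0 a single byte, with the bit indices scaled by the element
        // granularity (hence the "3 - exact_log2(granularity)" style tests).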
 1178 
 1179   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1180     bool is_backwards = step < 0;
 1181     size_t granularity = uabs(step);
 1182     int direction = is_backwards ? -1 : 1;
 1183 
 1184     Label Lword, Lint, Lshort, Lbyte;
 1185 
 1186     assert(granularity
 1187            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1188 
 1189     const Register t0 = r3;
 1190     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1191     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1192 
 1193     // ??? I don't know if this bit-test-and-branch is the right thing
 1194     // to do.  It does a lot of jumping, resulting in several
 1195     // mispredicted branches.  It might make more sense to do this
 1196     // with something like Duff's device with a single computed branch.
 1197 
 1198     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1199     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1200     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1201     __ bind(Lword);
 1202 
 1203     if (granularity <= sizeof (jint)) {
 1204       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1205       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1206       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1207       __ bind(Lint);
 1208     }
 1209 
 1210     if (granularity <= sizeof (jshort)) {
 1211       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1212       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1213       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1214       __ bind(Lshort);
 1215     }
 1216 
 1217     if (granularity <= sizeof (jbyte)) {
 1218       __ tbz(count, 0, Lbyte);
 1219       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1220       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1221       __ bind(Lbyte);
 1222     }
 1223   }
 1224 
 1225   Label copy_f, copy_b;
 1226   Label copy_obj_f, copy_obj_b;
 1227   Label copy_obj_uninit_f, copy_obj_uninit_b;
 1228 
 1229   // All-singing all-dancing memory copy.
 1230   //
 1231   // Copy count units of memory from s to d.  The size of a unit is
 1232   // step, which can be positive or negative depending on the direction
 1233   // of copy.  If is_aligned is false, we align the source address.
 1234   //
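        // Copies of at most 80 bytes (96 with SIMD) are handled below with
        // straight-line code that loads a leading and a trailing chunk which may
        // overlap in the middle; only larger copies fall through to copy_big and
        // the bulk copy stubs.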
 1235 
 1236   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1237                    Register s, Register d, Register count, int step) {
 1238     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1239     bool is_backwards = step < 0;
 1240     unsigned int granularity = uabs(step);
 1241     const Register t0 = r3, t1 = r4;
 1242 
 1243     // Copies of <= 80 bytes (96 with SIMD) are done inline. Direction doesn't matter
 1244     // because we always load all the data before writing anything.
 1245     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1246     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1247     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1248     const Register send = r17, dend = r16;
 1249     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1250     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1251     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1252 
 1253     if (PrefetchCopyIntervalInBytes > 0)
 1254       __ prfm(Address(s, 0), PLDL1KEEP);
 1255     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1256     __ br(Assembler::HI, copy_big);
 1257 
 1258     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1259     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
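          // send/dend point one element past the end of the source and
          // destination; addressing the tail relative to them lets each size
          // class below cover a range of counts without extra branching.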
 1260 
 1261     __ cmp(count, u1(16/granularity));
 1262     __ br(Assembler::LS, copy16);
 1263 
 1264     __ cmp(count, u1(64/granularity));
 1265     __ br(Assembler::HI, copy80);
 1266 
 1267     __ cmp(count, u1(32/granularity));
 1268     __ br(Assembler::LS, copy32);
 1269 
 1270     // 33..64 bytes
 1271     if (UseSIMDForMemoryOps) {
 1272       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1273       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1274       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1275       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1276     } else {
 1277       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1278       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1279       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1280       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1281 
 1282       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1283       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1284       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1285       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1286     }
 1287     __ b(finish);
 1288 
 1289     // 17..32 bytes
 1290     __ bind(copy32);
 1291     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1292     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1293 
 1294     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1295     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1296     __ b(finish);
 1297 
 1298     // 65..80/96 bytes
 1299     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1300     __ bind(copy80);
 1301     if (UseSIMDForMemoryOps) {
 1302       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1303       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1304       // Unaligned pointers can be an issue for copying.
 1305       // The issue is more likely when the granularity of the data is
 1306       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
 1307       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1308       // The largest performance drop has been seen for the 65-80 byte range.
 1309       // For such cases, using a pair of ldp/stp instead of the third pair of
 1310       // ldpq/stpq fixes the performance issue.
 1311       if (granularity < sizeof (jint)) {
 1312         Label copy96;
 1313         __ cmp(count, u1(80/granularity));
 1314         __ br(Assembler::HI, copy96);
 1315         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1316 
 1317         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1318         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1319 
 1320         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1321         __ b(finish);
 1322 
 1323         __ bind(copy96);
 1324       }
 1325       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1326 
 1327       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1328       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1329 
 1330       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1331     } else {
 1332       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1333       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1334       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1335       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1336       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1337 
 1338       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1339       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1340       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1341       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1342       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1343     }
 1344     __ b(finish);
 1345 
 1346     // 0..16 bytes
 1347     __ bind(copy16);
 1348     __ cmp(count, u1(8/granularity));
 1349     __ br(Assembler::LO, copy8);
 1350 
 1351     // 8..16 bytes
 1352     bs.copy_load_at_8(t0, Address(s, 0));
 1353     bs.copy_load_at_8(t1, Address(send, -8));
 1354     bs.copy_store_at_8(Address(d, 0), t0);
 1355     bs.copy_store_at_8(Address(dend, -8), t1);
 1356     __ b(finish);
 1357 
 1358     if (granularity < 8) {
 1359       // 4..7 bytes
 1360       __ bind(copy8);
 1361       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1362       __ ldrw(t0, Address(s, 0));
 1363       __ ldrw(t1, Address(send, -4));
 1364       __ strw(t0, Address(d, 0));
 1365       __ strw(t1, Address(dend, -4));
 1366       __ b(finish);
 1367       if (granularity < 4) {
 1368         // 0..3 bytes
 1369         __ bind(copy4);
 1370         __ cbz(count, finish); // get rid of 0 case
 1371         if (granularity == 2) {
 1372           __ ldrh(t0, Address(s, 0));
 1373           __ strh(t0, Address(d, 0));
 1374         } else { // granularity == 1
 1375           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1376           // the first and last byte.
 1377           // Handle the 3 byte case by loading and storing base + count/2
 1378           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1379           // This does mean that in the 1 byte case we load/store the
 1380           // same byte 3 times.
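                // Worked example (count == 3): count/2 == 1, so we copy
                // s[0]->d[0], s[2]->d[2] (via send-1/dend-1) and s[1]->d[1],
                // covering all three bytes.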
 1381           __ lsr(count, count, 1);
 1382           __ ldrb(t0, Address(s, 0));
 1383           __ ldrb(t1, Address(send, -1));
 1384           __ ldrb(t2, Address(s, count));
 1385           __ strb(t0, Address(d, 0));
 1386           __ strb(t1, Address(dend, -1));
 1387           __ strb(t2, Address(d, count));
 1388         }
 1389         __ b(finish);
 1390       }
 1391     }
 1392 
 1393     __ bind(copy_big);
 1394     if (is_backwards) {
 1395       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1396       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1397     }
 1398 
 1399     // Now that we've got the small case out of the way, we can align the
 1400     // source address on a 2-word boundary.
 1401 
 1402     // Here we materialize a count in r15, which is used by copy_memory_small
 1403     // and the various generate_copy_longs stubs that we use for 2-word-aligned copies.
 1404     // Up until here we have used t9, which aliases r15, but from here on that
 1405     // register cannot be used as a temp register, as it contains the count.
 1406 
 1407     Label aligned;
 1408 
 1409     if (is_aligned) {
 1410       // We may have to adjust by 1 word to get s 2-word-aligned.
 1411       __ tbz(s, exact_log2(wordSize), aligned);
 1412       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1413       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1414       __ sub(count, count, wordSize/granularity);
 1415     } else {
 1416       if (is_backwards) {
 1417         __ andr(r15, s, 2 * wordSize - 1);
 1418       } else {
 1419         __ neg(r15, s);
 1420         __ andr(r15, r15, 2 * wordSize - 1);
 1421       }
 1422       // r15 is the byte adjustment needed to align s.
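            // Illustrative example (forward byte copy): if s ends in 0x3 then
            // r15 = (-s) & 15 = 13, so copy_memory_small below copies 13
            // elements and leaves s on a 2-word (16-byte) boundary.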
 1423       __ cbz(r15, aligned);
 1424       int shift = exact_log2(granularity);
 1425       if (shift > 0) {
 1426         __ lsr(r15, r15, shift);
 1427       }
 1428       __ sub(count, count, r15);
 1429 
 1430 #if 0
 1431       // ?? This code is only correct for a disjoint copy.  It may or
 1432       // may not make sense to use it in that case.
 1433 
 1434       // Copy the first pair; s and d may not be aligned.
 1435       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1436       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1437 
 1438       // Align s and d, adjust count
 1439       if (is_backwards) {
 1440         __ sub(s, s, r15);
 1441         __ sub(d, d, r15);
 1442       } else {
 1443         __ add(s, s, r15);
 1444         __ add(d, d, r15);
 1445       }
 1446 #else
 1447       copy_memory_small(decorators, type, s, d, r15, step);
 1448 #endif
 1449     }
 1450 
 1451     __ bind(aligned);
 1452 
 1453     // s is now 2-word-aligned.
 1454 
 1455     // We have a count of units and some trailing bytes. Adjust the
 1456     // count and do a bulk copy of words. If the shift is zero,
 1457     // perform a move instead to benefit from zero-latency moves.
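          // For example, for a jshort copy (granularity == 2) shift == 2, so
          // r15 = count >> 2 is the number of 8-byte words handed to the bulk
          // copy stub; the sub-word remainder is handled by the tail below.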
 1458     int shift = exact_log2(wordSize/granularity);
 1459     if (shift > 0) {
 1460       __ lsr(r15, count, shift);
 1461     } else {
 1462       __ mov(r15, count);
 1463     }
 1464     if (direction == copy_forwards) {
 1465       if (type != T_OBJECT) {
 1466         __ bl(copy_f);
 1467       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1468         __ bl(copy_obj_uninit_f);
 1469       } else {
 1470         __ bl(copy_obj_f);
 1471       }
 1472     } else {
 1473       if (type != T_OBJECT) {
 1474         __ bl(copy_b);
 1475       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1476         __ bl(copy_obj_uninit_b);
 1477       } else {
 1478         __ bl(copy_obj_b);
 1479       }
 1480     }
 1481 
 1482     // And the tail.
 1483     copy_memory_small(decorators, type, s, d, count, step);
 1484 
 1485     if (granularity >= 8) __ bind(copy8);
 1486     if (granularity >= 4) __ bind(copy4);
 1487     __ bind(finish);
 1488   }
 1489 
 1490 
 1491   void clobber_registers() {
 1492 #ifdef ASSERT
 1493     RegSet clobbered
 1494       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1495     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1496     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1497     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1498       __ mov(*it, rscratch1);
 1499     }
 1500 #endif
 1501 
 1502   }
 1503 
 1504   // Scan over array at a for count oops, verifying each one.
 1505   // Preserves a and count; clobbers temp, rscratch1 and rscratch2.
 1506   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1507     Label loop, end;
 1508     __ mov(rscratch1, a);
 1509     __ mov(rscratch2, zr);
 1510     __ bind(loop);
 1511     __ cmp(rscratch2, count);
 1512     __ br(Assembler::HS, end);
 1513     if (size == wordSize) {
 1514       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1515       __ verify_oop(temp);
 1516     } else {
 1517       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1518       __ decode_heap_oop(temp); // calls verify_oop
 1519     }
 1520     __ add(rscratch2, rscratch2, 1);
 1521     __ b(loop);
 1522     __ bind(end);
 1523   }
 1524 
 1525   // Arguments:
 1526   //   stub_id - is used to name the stub and identify all details of
 1527   //             how to perform the copy.
 1528   //
 1529   //   entry - is assigned to the stub's post push entry point unless
 1530   //           it is null
 1531   //
 1532   // Inputs:
 1533   //   c_rarg0   - source array address
 1534   //   c_rarg1   - destination array address
 1535   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1536   //
 1537   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1538   // the hardware handle it.  The two dwords within qwords that span
 1539   // cache line boundaries will still be loaded and stored atomically.
 1540   //
 1541   // Side Effects: entry is set to the (post push) entry point so it
 1542   //               can be used by the corresponding conjoint copy
 1543   //               method
 1544   //
 1545   address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
 1546     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1547     RegSet saved_reg = RegSet::of(s, d, count);
 1548     int size;
 1549     bool aligned;
 1550     bool is_oop;
 1551     bool dest_uninitialized;
 1552     switch (stub_id) {
 1553     case jbyte_disjoint_arraycopy_id:
 1554       size = sizeof(jbyte);
 1555       aligned = false;
 1556       is_oop = false;
 1557       dest_uninitialized = false;
 1558       break;
 1559     case arrayof_jbyte_disjoint_arraycopy_id:
 1560       size = sizeof(jbyte);
 1561       aligned = true;
 1562       is_oop = false;
 1563       dest_uninitialized = false;
 1564       break;
 1565     case jshort_disjoint_arraycopy_id:
 1566       size = sizeof(jshort);
 1567       aligned = false;
 1568       is_oop = false;
 1569       dest_uninitialized = false;
 1570       break;
 1571     case arrayof_jshort_disjoint_arraycopy_id:
 1572       size = sizeof(jshort);
 1573       aligned = true;
 1574       is_oop = false;
 1575       dest_uninitialized = false;
 1576       break;
 1577     case jint_disjoint_arraycopy_id:
 1578       size = sizeof(jint);
 1579       aligned = false;
 1580       is_oop = false;
 1581       dest_uninitialized = false;
 1582       break;
 1583     case arrayof_jint_disjoint_arraycopy_id:
 1584       size = sizeof(jint);
 1585       aligned = true;
 1586       is_oop = false;
 1587       dest_uninitialized = false;
 1588       break;
 1589     case jlong_disjoint_arraycopy_id:
 1590       // since this is always aligned we can (should!) use the same
 1591       // stub as for case arrayof_jlong_disjoint_arraycopy
 1592       ShouldNotReachHere();
 1593       break;
 1594     case arrayof_jlong_disjoint_arraycopy_id:
 1595       size = sizeof(jlong);
 1596       aligned = true;
 1597       is_oop = false;
 1598       dest_uninitialized = false;
 1599       break;
 1600     case oop_disjoint_arraycopy_id:
 1601       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1602       aligned = !UseCompressedOops;
 1603       is_oop = true;
 1604       dest_uninitialized = false;
 1605       break;
 1606     case arrayof_oop_disjoint_arraycopy_id:
 1607       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1608       aligned = !UseCompressedOops;
 1609       is_oop = true;
 1610       dest_uninitialized = false;
 1611       break;
 1612     case oop_disjoint_arraycopy_uninit_id:
 1613       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1614       aligned = !UseCompressedOops;
 1615       is_oop = true;
 1616       dest_uninitialized = true;
 1617       break;
 1618     case arrayof_oop_disjoint_arraycopy_uninit_id:
 1619       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1620       aligned = !UseCompressedOops;
 1621       is_oop = true;
 1622       dest_uninitialized = true;
 1623       break;
 1624     default:
 1625       ShouldNotReachHere();
 1626       break;
 1627     }
 1628 
 1629     __ align(CodeEntryAlignment);
 1630     StubCodeMark mark(this, stub_id);
 1631     address start = __ pc();
 1632     __ enter();
 1633 
 1634     if (entry != nullptr) {
 1635       *entry = __ pc();
 1636       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1637       BLOCK_COMMENT("Entry:");
 1638     }
 1639 
 1640     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1641     if (dest_uninitialized) {
 1642       decorators |= IS_DEST_UNINITIALIZED;
 1643     }
 1644     if (aligned) {
 1645       decorators |= ARRAYCOPY_ALIGNED;
 1646     }
 1647 
 1648     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1649     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1650 
 1651     if (is_oop) {
 1652       // save regs before copy_memory
 1653       __ push(RegSet::of(d, count), sp);
 1654     }
 1655     {
 1656       // UnsafeMemoryAccess page error: continue after unsafe access
 1657       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1658       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1659       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1660     }
 1661 
 1662     if (is_oop) {
 1663       __ pop(RegSet::of(d, count), sp);
 1664       if (VerifyOops)
 1665         verify_oop_array(size, d, count, r16);
 1666     }
 1667 
 1668     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1669 
 1670     __ leave();
 1671     __ mov(r0, zr); // return 0
 1672     __ ret(lr);
 1673     return start;
 1674   }
 1675 
 1676   // Arguments:
 1677   //   stub_id - is used to name the stub and identify all details of
 1678   //             how to perform the copy.
 1679   //
 1680   //   nooverlap_target - identifies the (post push) entry for the
 1681   //             corresponding disjoint copy routine which can be
 1682   //             jumped to if the ranges do not actually overlap
 1683   //
 1684   //   entry - is assigned to the stub's post push entry point unless
 1685   //           it is null
 1686   //
 1687   //
 1688   // Inputs:
 1689   //   c_rarg0   - source array address
 1690   //   c_rarg1   - destination array address
 1691   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1692   //
 1693   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1694   // the hardware handle it.  The two dwords within qwords that span
 1695   // cache line boundaries will still be loaded and stored atomically.
 1696   //
 1697   // Side Effects:
 1698   //   entry is set to the no-overlap entry point so it can be used by
 1699   //   some other conjoint copy method
 1700   //
 1701   address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
 1702     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1703     RegSet saved_regs = RegSet::of(s, d, count);
 1704     int size;
 1705     bool aligned;
 1706     bool is_oop;
 1707     bool dest_uninitialized;
 1708     switch (stub_id) {
 1709     case jbyte_arraycopy_id:
 1710       size = sizeof(jbyte);
 1711       aligned = false;
 1712       is_oop = false;
 1713       dest_uninitialized = false;
 1714       break;
 1715     case arrayof_jbyte_arraycopy_id:
 1716       size = sizeof(jbyte);
 1717       aligned = true;
 1718       is_oop = false;
 1719       dest_uninitialized = false;
 1720       break;
 1721     case jshort_arraycopy_id:
 1722       size = sizeof(jshort);
 1723       aligned = false;
 1724       is_oop = false;
 1725       dest_uninitialized = false;
 1726       break;
 1727     case arrayof_jshort_arraycopy_id:
 1728       size = sizeof(jshort);
 1729       aligned = true;
 1730       is_oop = false;
 1731       dest_uninitialized = false;
 1732       break;
 1733     case jint_arraycopy_id:
 1734       size = sizeof(jint);
 1735       aligned = false;
 1736       is_oop = false;
 1737       dest_uninitialized = false;
 1738       break;
 1739     case arrayof_jint_arraycopy_id:
 1740       size = sizeof(jint);
 1741       aligned = true;
 1742       is_oop = false;
 1743       dest_uninitialized = false;
 1744       break;
 1745     case jlong_arraycopy_id:
 1746       // since this is always aligned we can (should!) use the same
 1747       // stub as for case arrayof_jlong_arraycopy
 1748       ShouldNotReachHere();
 1749       break;
 1750     case arrayof_jlong_arraycopy_id:
 1751       size = sizeof(jlong);
 1752       aligned = true;
 1753       is_oop = false;
 1754       dest_uninitialized = false;
 1755       break;
 1756     case oop_arraycopy_id:
 1757       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1758       aligned = !UseCompressedOops;
 1759       is_oop = true;
 1760       dest_uninitialized = false;
 1761       break;
 1762     case arrayof_oop_arraycopy_id:
 1763       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1764       aligned = !UseCompressedOops;
 1765       is_oop = true;
 1766       dest_uninitialized = false;
 1767       break;
 1768     case oop_arraycopy_uninit_id:
 1769       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1770       aligned = !UseCompressedOops;
 1771       is_oop = true;
 1772       dest_uninitialized = true;
 1773       break;
 1774     case arrayof_oop_arraycopy_uninit_id:
 1775       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1776       aligned = !UseCompressedOops;
 1777       is_oop = true;
 1778       dest_uninitialized = true;
 1779       break;
 1780     default:
 1781       ShouldNotReachHere();
 1782     }
 1783 
 1784     StubCodeMark mark(this, stub_id);
 1785     address start = __ pc();
 1786     __ enter();
 1787 
 1788     if (entry != nullptr) {
 1789       *entry = __ pc();
 1790       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1791       BLOCK_COMMENT("Entry:");
 1792     }
 1793 
 1794     // use fwd copy when (d-s) above_equal (count*size)
 1795     __ sub(rscratch1, d, s);
 1796     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1797     __ br(Assembler::HS, nooverlap_target);
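          // Rationale: if (d - s), treated as unsigned, is at least count*size
          // then the destination starts beyond the end of the source range, so
          // the disjoint (forward) copy is safe; the unsigned compare also
          // covers d < s, where a forward copy is likewise safe despite overlap.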
 1798 
 1799     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1800     if (dest_uninitialized) {
 1801       decorators |= IS_DEST_UNINITIALIZED;
 1802     }
 1803     if (aligned) {
 1804       decorators |= ARRAYCOPY_ALIGNED;
 1805     }
 1806 
 1807     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1808     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1809 
 1810     if (is_oop) {
 1811       // save regs before copy_memory
 1812       __ push(RegSet::of(d, count), sp);
 1813     }
 1814     {
 1815       // UnsafeMemoryAccess page error: continue after unsafe access
 1816       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1817       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1818       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1819     }
 1820     if (is_oop) {
 1821       __ pop(RegSet::of(d, count), sp);
 1822       if (VerifyOops)
 1823         verify_oop_array(size, d, count, r16);
 1824     }
 1825     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1826     __ leave();
 1827     __ mov(r0, zr); // return 0
 1828     __ ret(lr);
 1829     return start;
 1830   }
 1831 
 1832   // Helper for generating a dynamic type check.
 1833   // Smashes rscratch1, rscratch2.
 1834   void generate_type_check(Register sub_klass,
 1835                            Register super_check_offset,
 1836                            Register super_klass,
 1837                            Register temp1,
 1838                            Register temp2,
 1839                            Register result,
 1840                            Label& L_success) {
 1841     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1842 
 1843     BLOCK_COMMENT("type_check:");
 1844 
 1845     Label L_miss;
 1846 
 1847     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1848                                      super_check_offset);
 1849     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1850 
 1851     // Fall through on failure!
 1852     __ BIND(L_miss);
 1853   }
 1854 
 1855   //
 1856   //  Generate checkcasting array copy stub
 1857   //
 1858   //  Input:
 1859   //    c_rarg0   - source array address
 1860   //    c_rarg1   - destination array address
 1861   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1862   //    c_rarg3   - size_t ckoff (super_check_offset)
 1863   //    c_rarg4   - oop ckval (super_klass)
 1864   //
 1865   //  Output:
 1866   //    r0 ==  0  -  success
 1867   //    r0 == -1^K - failure, where K is partial transfer count
 1868   //
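        // For example, if the type check fails after 3 elements have been
        // copied, the stub returns -1^3 == -4 in r0 and the caller recovers
        // the partial transfer count as ~r0.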
 1869   address generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
 1870     bool dest_uninitialized;
 1871     switch (stub_id) {
 1872     case checkcast_arraycopy_id:
 1873       dest_uninitialized = false;
 1874       break;
 1875     case checkcast_arraycopy_uninit_id:
 1876       dest_uninitialized = true;
 1877       break;
 1878     default:
 1879       ShouldNotReachHere();
 1880     }
 1881 
 1882     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1883 
 1884     // Input registers (after setup_arg_regs)
 1885     const Register from        = c_rarg0;   // source array address
 1886     const Register to          = c_rarg1;   // destination array address
 1887     const Register count       = c_rarg2;   // elements count
 1888     const Register ckoff       = c_rarg3;   // super_check_offset
 1889     const Register ckval       = c_rarg4;   // super_klass
 1890 
 1891     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1892     RegSet wb_post_saved_regs = RegSet::of(count);
 1893 
 1894     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1895     const Register copied_oop  = r22;       // actual oop copied
 1896     const Register count_save  = r21;       // orig elements count
 1897     const Register start_to    = r20;       // destination array start address
 1898     const Register r19_klass   = r19;       // oop._klass
 1899 
 1900     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1901     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1902 
 1903     //---------------------------------------------------------------
 1904     // Assembler stub will be used for this call to arraycopy
 1905     // if the two arrays are subtypes of Object[] but the
 1906     // destination array type is not equal to or a supertype
 1907     // of the source type.  Each element must be separately
 1908     // checked.
 1909 
 1910     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1911                                copied_oop, r19_klass, count_save);
 1912 
 1913     __ align(CodeEntryAlignment);
 1914     StubCodeMark mark(this, stub_id);
 1915     address start = __ pc();
 1916 
 1917     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1918 
 1919 #ifdef ASSERT
 1920     // caller guarantees that the arrays really are different
 1921     // otherwise, we would have to make conjoint checks
 1922     { Label L;
 1923       __ b(L);                  // conjoint check not yet implemented
 1924       __ stop("checkcast_copy within a single array");
 1925       __ bind(L);
 1926     }
 1927 #endif //ASSERT
 1928 
 1929     // Caller of this entry point must set up the argument registers.
 1930     if (entry != nullptr) {
 1931       *entry = __ pc();
 1932       BLOCK_COMMENT("Entry:");
 1933     }
 1934 
 1935     // Empty array: nothing to do.
 1936     __ cbz(count, L_done);
 1937     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1938 
 1939 #ifdef ASSERT
 1940     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1941     // The ckoff and ckval must be mutually consistent,
 1942     // even though caller generates both.
 1943     { Label L;
 1944       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1945       __ ldrw(start_to, Address(ckval, sco_offset));
 1946       __ cmpw(ckoff, start_to);
 1947       __ br(Assembler::EQ, L);
 1948       __ stop("super_check_offset inconsistent");
 1949       __ bind(L);
 1950     }
 1951 #endif //ASSERT
 1952 
 1953     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1954     bool is_oop = true;
 1955     int element_size = UseCompressedOops ? 4 : 8;
 1956     if (dest_uninitialized) {
 1957       decorators |= IS_DEST_UNINITIALIZED;
 1958     }
 1959 
 1960     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1961     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1962 
 1963     // save the original count
 1964     __ mov(count_save, count);
 1965 
 1966     // Copy from low to high addresses
 1967     __ mov(start_to, to);              // Save destination array start address
 1968     __ b(L_load_element);
 1969 
 1970     // ======== begin loop ========
 1971     // (Loop is rotated; its entry is L_load_element.)
 1972     // Loop control:
 1973     //   for (; count != 0; count--) {
 1974     //     copied_oop = load_heap_oop(from++);
 1975     //     ... generate_type_check ...;
 1976     //     store_heap_oop(to++, copied_oop);
 1977     //   }
 1978     __ align(OptoLoopAlignment);
 1979 
 1980     __ BIND(L_store_element);
 1981     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1982                       __ post(to, element_size), copied_oop, noreg,
 1983                       gct1, gct2, gct3);
 1984     __ sub(count, count, 1);
 1985     __ cbz(count, L_do_card_marks);
 1986 
 1987     // ======== loop entry is here ========
 1988     __ BIND(L_load_element);
 1989     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1990                      copied_oop, noreg, __ post(from, element_size),
 1991                      gct1);
 1992     __ cbz(copied_oop, L_store_element);
 1993 
 1994     __ load_klass(r19_klass, copied_oop);// query the object klass
 1995 
 1996     BLOCK_COMMENT("type_check:");
 1997     generate_type_check(/*sub_klass*/r19_klass,
 1998                         /*super_check_offset*/ckoff,
 1999                         /*super_klass*/ckval,
 2000                         /*r_array_base*/gct1,
 2001                         /*temp2*/gct2,
 2002                         /*result*/r10, L_store_element);
 2003 
 2004     // Fall through on failure!
 2005 
 2006     // ======== end loop ========
 2007 
 2008     // It was a real error; we must depend on the caller to finish the job.
 2009     // Register count = remaining oops, count_save = total oops.
 2010     // Emit GC store barriers for the oops we have copied and report
 2011     // their number to the caller.
 2012 
 2013     __ subs(count, count_save, count);     // K = partially copied oop count
 2014     __ eon(count, count, zr);              // report (-1^K) to caller
 2015     __ br(Assembler::EQ, L_done_pop);
 2016 
 2017     __ BIND(L_do_card_marks);
 2018     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2019 
 2020     __ bind(L_done_pop);
 2021     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2022     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2023 
 2024     __ bind(L_done);
 2025     __ mov(r0, count);
 2026     __ leave();
 2027     __ ret(lr);
 2028 
 2029     return start;
 2030   }
 2031 
 2032   // Perform range checks on the proposed arraycopy.
 2033   // Kills temp, but nothing else.
 2034   // Also, clean the sign bits of src_pos and dst_pos.
 2035   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2036                               Register src_pos, // source position (c_rarg1)
 2037                               Register dst,     // destination array oop (c_rarg2)
 2038                               Register dst_pos, // destination position (c_rarg3)
 2039                               Register length,
 2040                               Register temp,
 2041                               Label& L_failed) {
 2042     BLOCK_COMMENT("arraycopy_range_checks:");
 2043 
 2044     assert_different_registers(rscratch1, temp);
 2045 
 2046     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2047     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2048     __ addw(temp, length, src_pos);
 2049     __ cmpw(temp, rscratch1);
 2050     __ br(Assembler::HI, L_failed);
 2051 
 2052     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2053     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2054     __ addw(temp, length, dst_pos);
 2055     __ cmpw(temp, rscratch1);
 2056     __ br(Assembler::HI, L_failed);
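          // The additions and comparisons are 32-bit and unsigned (HI), so a
          // position/length pair whose sum exceeds the array length always
          // fails, even when that sum would wrap a signed int.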
 2057 
 2058     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2059     __ movw(src_pos, src_pos);
 2060     __ movw(dst_pos, dst_pos);
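          // (Writing a W register zero-extends into the full X register, so
          // the two movw instructions above clear bits 63:32 of src_pos and
          // dst_pos.)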
 2061 
 2062     BLOCK_COMMENT("arraycopy_range_checks done");
 2063   }
 2064 
 2065   // These stubs get called from some dumb test routine.
 2066   // I'll write them properly when they're called from
 2067   // something that's actually doing something.
 2068   static void fake_arraycopy_stub(address src, address dst, int count) {
 2069     assert(count == 0, "huh?");
 2070   }
 2071 
 2072 
 2073   //
 2074   //  Generate 'unsafe' array copy stub
 2075   //  Though just as safe as the other stubs, it takes an unscaled
 2076   //  size_t argument instead of an element count.
 2077   //
 2078   //  Input:
 2079   //    c_rarg0   - source array address
 2080   //    c_rarg1   - destination array address
 2081   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2082   //
 2083   // Examines the alignment of the operands and dispatches
 2084   // to a long, int, short, or byte copy loop.
 2085   //
 2086   address generate_unsafe_copy(address byte_copy_entry,
 2087                                address short_copy_entry,
 2088                                address int_copy_entry,
 2089                                address long_copy_entry) {
 2090     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
 2091 
 2092     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2093     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2094 
 2095     __ align(CodeEntryAlignment);
 2096     StubCodeMark mark(this, stub_id);
 2097     address start = __ pc();
 2098     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2099 
 2100     // bump this on entry, not on exit:
 2101     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2102 
 2103     __ orr(rscratch1, s, d);
 2104     __ orr(rscratch1, rscratch1, count);
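          // rscratch1 = s | d | count: a low-order bit is set iff the source
          // address, the destination address or the byte count is misaligned
          // at that granularity, which drives the dispatch below.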
 2105 
 2106     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2107     __ cbz(rscratch1, L_long_aligned);
 2108     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2109     __ cbz(rscratch1, L_int_aligned);
 2110     __ tbz(rscratch1, 0, L_short_aligned);
 2111     __ b(RuntimeAddress(byte_copy_entry));
 2112 
 2113     __ BIND(L_short_aligned);
 2114     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2115     __ b(RuntimeAddress(short_copy_entry));
 2116     __ BIND(L_int_aligned);
 2117     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2118     __ b(RuntimeAddress(int_copy_entry));
 2119     __ BIND(L_long_aligned);
 2120     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2121     __ b(RuntimeAddress(long_copy_entry));
 2122 
 2123     return start;
 2124   }
 2125 
 2126   //
 2127   //  Generate generic array copy stubs
 2128   //
 2129   //  Input:
 2130   //    c_rarg0    -  src oop
 2131   //    c_rarg1    -  src_pos (32-bits)
 2132   //    c_rarg2    -  dst oop
 2133   //    c_rarg3    -  dst_pos (32-bits)
 2134   //    c_rarg4    -  element count (32-bits)
 2135   //
 2136   //  Output:
 2137   //    r0 ==  0  -  success
 2138   //    r0 == -1^K - failure, where K is partial transfer count
 2139   //
 2140   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2141                                 address int_copy_entry, address oop_copy_entry,
 2142                                 address long_copy_entry, address checkcast_copy_entry) {
 2143     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
 2144 
 2145     Label L_failed, L_objArray;
 2146     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2147 
 2148     // Input registers
 2149     const Register src        = c_rarg0;  // source array oop
 2150     const Register src_pos    = c_rarg1;  // source position
 2151     const Register dst        = c_rarg2;  // destination array oop
 2152     const Register dst_pos    = c_rarg3;  // destination position
 2153     const Register length     = c_rarg4;
 2154 
 2155 
 2156     // Registers used as temps
 2157     const Register dst_klass  = c_rarg5;
 2158 
 2159     __ align(CodeEntryAlignment);
 2160 
 2161     StubCodeMark mark(this, stub_id);
 2162 
 2163     address start = __ pc();
 2164 
 2165     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2166 
 2167     // bump this on entry, not on exit:
 2168     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2169 
 2170     //-----------------------------------------------------------------------
 2171     // Assembler stub will be used for this call to arraycopy
 2172     // if the following conditions are met:
 2173     //
 2174     // (1) src and dst must not be null.
 2175     // (2) src_pos must not be negative.
 2176     // (3) dst_pos must not be negative.
 2177     // (4) length  must not be negative.
 2178     // (5) src klass and dst klass should be the same and not null.
 2179     // (6) src and dst should be arrays.
 2180     // (7) src_pos + length must not exceed length of src.
 2181     // (8) dst_pos + length must not exceed length of dst.
 2182     //
 2183 
 2184     //  if (src == nullptr) return -1;
 2185     __ cbz(src, L_failed);
 2186 
 2187     //  if (src_pos < 0) return -1;
 2188     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2189 
 2190     //  if (dst == nullptr) return -1;
 2191     __ cbz(dst, L_failed);
 2192 
 2193     //  if (dst_pos < 0) return -1;
 2194     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2195 
 2196     // registers used as temp
 2197     const Register scratch_length    = r16; // elements count to copy
 2198     const Register scratch_src_klass = r17; // array klass
 2199     const Register lh                = r15; // layout helper
 2200 
 2201     //  if (length < 0) return -1;
 2202     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2203     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2204 
 2205     __ load_klass(scratch_src_klass, src);
 2206 #ifdef ASSERT
 2207     //  assert(src->klass() != nullptr);
 2208     {
 2209       BLOCK_COMMENT("assert klasses not null {");
 2210       Label L1, L2;
 2211       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2212       __ bind(L1);
 2213       __ stop("broken null klass");
 2214       __ bind(L2);
 2215       __ load_klass(rscratch1, dst);
 2216       __ cbz(rscratch1, L1);     // this would be broken also
 2217       BLOCK_COMMENT("} assert klasses not null done");
 2218     }
 2219 #endif
 2220 
 2221     // Load layout helper (32-bits)
 2222     //
 2223     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2224     // 32        30    24            16              8     2                 0
 2225     //
 2226     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2227     //
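          // Illustrative example: for an int[] array the tag field holds the
          // typeArray tag (0x3), element_type is T_INT and log2_element_size
          // is 2; the value of header_size depends on the build's object layout.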
 2228 
 2229     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2230 
 2231     // Handle objArrays completely differently...
 2232     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2233     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2234     __ movw(rscratch1, objArray_lh);
 2235     __ eorw(rscratch2, lh, rscratch1);
 2236     __ cbzw(rscratch2, L_objArray);
 2237 
 2238     //  if (src->klass() != dst->klass()) return -1;
 2239     __ load_klass(rscratch2, dst);
 2240     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2241     __ cbnz(rscratch2, L_failed);
 2242 
 2243     // Check for flat inline type array -> return -1
 2244     __ test_flat_array_oop(src, rscratch2, L_failed);
 2245 
 2246     // Check for null-free (non-flat) inline type array -> handle as object array
 2247     __ test_null_free_array_oop(src, rscratch2, L_objArray);
 2248 
 2249     //  if (!src->is_Array()) return -1;
 2250     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2251 
 2252     // At this point, it is known to be a typeArray (array_tag 0x3).
 2253 #ifdef ASSERT
 2254     {
 2255       BLOCK_COMMENT("assert primitive array {");
 2256       Label L;
 2257       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2258       __ cmpw(lh, rscratch2);
 2259       __ br(Assembler::GE, L);
 2260       __ stop("must be a primitive array");
 2261       __ bind(L);
 2262       BLOCK_COMMENT("} assert primitive array done");
 2263     }
 2264 #endif
 2265 
 2266     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2267                            rscratch2, L_failed);
 2268 
 2269     // TypeArrayKlass
 2270     //
 2271     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2272     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2273     //
 2274 
 2275     const Register rscratch1_offset = rscratch1;    // array offset
 2276     const Register r15_elsize = lh; // element size
 2277 
 2278     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2279            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2280     __ add(src, src, rscratch1_offset);           // src array offset
 2281     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2282     BLOCK_COMMENT("choose copy loop based on element size");
 2283 
 2284     // next registers should be set before the jump to corresponding stub
 2285     const Register from     = c_rarg0;  // source array address
 2286     const Register to       = c_rarg1;  // destination array address
 2287     const Register count    = c_rarg2;  // elements count
 2288 
 2289     // 'from', 'to' and 'count' must be set in this order, since they
 2290     // occupy the same registers as 'src', 'src_pos' and 'dst'.
 2291 
 2292     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2293 
 2294     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2295     // size in bytes).  We do a simple bitwise binary search.
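          // Bit 1 of elsize separates {bytes, shorts} from {ints, longs};
          // bit 0 then picks within each pair (0 = bytes, 1 = shorts,
          // 2 = ints, 3 = longs).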
 2296   __ BIND(L_copy_bytes);
 2297     __ tbnz(r15_elsize, 1, L_copy_ints);
 2298     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2299     __ lea(from, Address(src, src_pos));// src_addr
 2300     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2301     __ movw(count, scratch_length); // length
 2302     __ b(RuntimeAddress(byte_copy_entry));
 2303 
 2304   __ BIND(L_copy_shorts);
 2305     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2306     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2307     __ movw(count, scratch_length); // length
 2308     __ b(RuntimeAddress(short_copy_entry));
 2309 
 2310   __ BIND(L_copy_ints);
 2311     __ tbnz(r15_elsize, 0, L_copy_longs);
 2312     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2313     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2314     __ movw(count, scratch_length); // length
 2315     __ b(RuntimeAddress(int_copy_entry));
 2316 
 2317   __ BIND(L_copy_longs);
 2318 #ifdef ASSERT
 2319     {
 2320       BLOCK_COMMENT("assert long copy {");
 2321       Label L;
 2322       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2323       __ cmpw(r15_elsize, LogBytesPerLong);
 2324       __ br(Assembler::EQ, L);
 2325       __ stop("must be long copy, but elsize is wrong");
 2326       __ bind(L);
 2327       BLOCK_COMMENT("} assert long copy done");
 2328     }
 2329 #endif
 2330     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2331     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2332     __ movw(count, scratch_length); // length
 2333     __ b(RuntimeAddress(long_copy_entry));
 2334 
 2335     // ObjArrayKlass
 2336   __ BIND(L_objArray);
 2337     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2338 
 2339     Label L_plain_copy, L_checkcast_copy;
 2340     //  test array classes for subtyping
 2341     __ load_klass(r15, dst);
 2342     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2343     __ br(Assembler::NE, L_checkcast_copy);
 2344 
 2345     // Identically typed arrays can be copied without element-wise checks.
 2346     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2347                            rscratch2, L_failed);
 2348 
 2349     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2350     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2351     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2352     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2353     __ movw(count, scratch_length); // length
 2354   __ BIND(L_plain_copy);
 2355     __ b(RuntimeAddress(oop_copy_entry));
 2356 
 2357   __ BIND(L_checkcast_copy);
 2358     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2359     {
 2360       // Before looking at dst.length, make sure dst is also an objArray.
 2361       __ ldrw(rscratch1, Address(r15, lh_offset));
 2362       __ movw(rscratch2, objArray_lh);
 2363       __ eorw(rscratch1, rscratch1, rscratch2);
 2364       __ cbnzw(rscratch1, L_failed);
 2365 
 2366       // It is safe to examine both src.length and dst.length.
 2367       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2368                              r15, L_failed);
 2369 
 2370       __ load_klass(dst_klass, dst); // reload
 2371 
 2372       // Marshal the base address arguments now, freeing registers.
 2373       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2374       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2375       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2376       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2377       __ movw(count, length);           // length (reloaded)
 2378       Register sco_temp = c_rarg3;      // this register is free now
 2379       assert_different_registers(from, to, count, sco_temp,
 2380                                  dst_klass, scratch_src_klass);
 2381       // assert_clean_int(count, sco_temp);
 2382 
 2383       // Generate the type check.
 2384       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2385       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2386 
 2387       // Smashes rscratch1, rscratch2
 2388       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2389                           L_plain_copy);
 2390 
 2391       // Fetch destination element klass from the ObjArrayKlass header.
 2392       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2393       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2394       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2395 
 2396       // the checkcast_copy loop needs two extra arguments:
 2397       assert(c_rarg3 == sco_temp, "#3 already in place");
 2398       // Set up arguments for checkcast_copy_entry.
 2399       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2400       __ b(RuntimeAddress(checkcast_copy_entry));
 2401     }
 2402 
 2403   __ BIND(L_failed);
 2404     __ mov(r0, -1);
 2405     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2406     __ ret(lr);
 2407 
 2408     return start;
 2409   }
 2410 
 2411   //
 2412   // Generate stub for array fill. If "aligned" is true, the
 2413   // "to" address is assumed to be heapword aligned.
 2414   //
 2415   // Arguments for generated stub:
 2416   //   to:    c_rarg0
 2417   //   value: c_rarg1
 2418   //   count: c_rarg2 treated as signed
 2419   //
 2420   address generate_fill(StubGenStubId stub_id) {
 2421     BasicType t;
 2422     bool aligned;
 2423 
 2424     switch (stub_id) {
 2425     case jbyte_fill_id:
 2426       t = T_BYTE;
 2427       aligned = false;
 2428       break;
 2429     case jshort_fill_id:
 2430       t = T_SHORT;
 2431       aligned = false;
 2432       break;
 2433     case jint_fill_id:
 2434       t = T_INT;
 2435       aligned = false;
 2436       break;
 2437     case arrayof_jbyte_fill_id:
 2438       t = T_BYTE;
 2439       aligned = true;
 2440       break;
 2441     case arrayof_jshort_fill_id:
 2442       t = T_SHORT;
 2443       aligned = true;
 2444       break;
 2445     case arrayof_jint_fill_id:
 2446       t = T_INT;
 2447       aligned = true;
 2448       break;
 2449     default:
 2450       ShouldNotReachHere();
 2451     };
 2452 
 2453     __ align(CodeEntryAlignment);
 2454     StubCodeMark mark(this, stub_id);
 2455     address start = __ pc();
 2456 
 2457     BLOCK_COMMENT("Entry:");
 2458 
 2459     const Register to        = c_rarg0;  // destination array address
 2460     const Register value     = c_rarg1;  // value
 2461     const Register count     = c_rarg2;  // elements count
 2462 
 2463     const Register bz_base = r10;        // base for block_zero routine
 2464     const Register cnt_words = r11;      // temp register
 2465 
 2466     __ enter();
 2467 
 2468     Label L_fill_elements, L_exit1;
 2469 
 2470     int shift = -1;
 2471     switch (t) {
 2472       case T_BYTE:
 2473         shift = 0;
 2474         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2475         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2476         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2477         __ br(Assembler::LO, L_fill_elements);
 2478         break;
 2479       case T_SHORT:
 2480         shift = 1;
 2481         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2482         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2483         __ br(Assembler::LO, L_fill_elements);
 2484         break;
 2485       case T_INT:
 2486         shift = 2;
 2487         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2488         __ br(Assembler::LO, L_fill_elements);
 2489         break;
 2490       default: ShouldNotReachHere();
 2491     }
 2492 
 2493     // Align source address at 8 bytes address boundary.
 2494     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2495     if (!aligned) {
 2496       switch (t) {
 2497         case T_BYTE:
 2498           // One byte misalignment happens only for byte arrays.
 2499           __ tbz(to, 0, L_skip_align1);
 2500           __ strb(value, Address(__ post(to, 1)));
 2501           __ subw(count, count, 1);
 2502           __ bind(L_skip_align1);
 2503           // Fallthrough
 2504         case T_SHORT:
 2505           // Two bytes misalignment happens only for byte and short (char) arrays.
 2506           __ tbz(to, 1, L_skip_align2);
 2507           __ strh(value, Address(__ post(to, 2)));
 2508           __ subw(count, count, 2 >> shift);
 2509           __ bind(L_skip_align2);
 2510           // Fallthrough
 2511         case T_INT:
 2512           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2513           __ tbz(to, 2, L_skip_align4);
 2514           __ strw(value, Address(__ post(to, 4)));
 2515           __ subw(count, count, 4 >> shift);
 2516           __ bind(L_skip_align4);
 2517           break;
 2518         default: ShouldNotReachHere();
 2519       }
 2520     }
 2521 
 2522     //
 2523     //  Fill large chunks
 2524     //
 2525     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2526     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2527     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
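          // cnt_words is the number of 8-byte words to fill and count now
          // holds the leftover elements (less than 8 bytes' worth). The bfi
          // above has replicated the fill value across all 64 bits, e.g. a
          // short value 0xABCD becomes 0xABCDABCDABCDABCD.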
 2528     if (UseBlockZeroing) {
 2529       Label non_block_zeroing, rest;
 2530       // If the fill value is zero we can use the fast zero_words().
 2531       __ cbnz(value, non_block_zeroing);
 2532       __ mov(bz_base, to);
 2533       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2534       address tpc = __ zero_words(bz_base, cnt_words);
 2535       if (tpc == nullptr) {
 2536         fatal("CodeCache is full at generate_fill");
 2537       }
 2538       __ b(rest);
 2539       __ bind(non_block_zeroing);
 2540       __ fill_words(to, cnt_words, value);
 2541       __ bind(rest);
 2542     } else {
 2543       __ fill_words(to, cnt_words, value);
 2544     }
 2545 
 2546     // Remaining count is less than 8 bytes. Fill it by a single store.
 2547     // Note that the total length is no less than 8 bytes.
 2548     if (t == T_BYTE || t == T_SHORT) {
 2549       Label L_exit1;
 2550       __ cbzw(count, L_exit1);
 2551       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2552       __ str(value, Address(to, -8));    // overwrite some elements
 2553       __ bind(L_exit1);
 2554       __ leave();
 2555       __ ret(lr);
 2556     }
 2557 
 2558     // Handle fills of less than 8 bytes.
 2559     Label L_fill_2, L_fill_4, L_exit2;
 2560     __ bind(L_fill_elements);
 2561     switch (t) {
 2562       case T_BYTE:
 2563         __ tbz(count, 0, L_fill_2);
 2564         __ strb(value, Address(__ post(to, 1)));
 2565         __ bind(L_fill_2);
 2566         __ tbz(count, 1, L_fill_4);
 2567         __ strh(value, Address(__ post(to, 2)));
 2568         __ bind(L_fill_4);
 2569         __ tbz(count, 2, L_exit2);
 2570         __ strw(value, Address(to));
 2571         break;
 2572       case T_SHORT:
 2573         __ tbz(count, 0, L_fill_4);
 2574         __ strh(value, Address(__ post(to, 2)));
 2575         __ bind(L_fill_4);
 2576         __ tbz(count, 1, L_exit2);
 2577         __ strw(value, Address(to));
 2578         break;
 2579       case T_INT:
 2580         __ cbzw(count, L_exit2);
 2581         __ strw(value, Address(to));
 2582         break;
 2583       default: ShouldNotReachHere();
 2584     }
 2585     __ bind(L_exit2);
 2586     __ leave();
 2587     __ ret(lr);
 2588     return start;
 2589   }
 2590 
 2591   address generate_data_cache_writeback() {
 2592     const Register line        = c_rarg0;  // address of line to write back
 2593 
 2594     __ align(CodeEntryAlignment);
 2595 
 2596     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id;
 2597     StubCodeMark mark(this, stub_id);
 2598 
 2599     address start = __ pc();
 2600     __ enter();
 2601     __ cache_wb(Address(line, 0));
 2602     __ leave();
 2603     __ ret(lr);
 2604 
 2605     return start;
 2606   }
 2607 
 2608   address generate_data_cache_writeback_sync() {
 2609     const Register is_pre     = c_rarg0;  // pre or post sync
 2610 
 2611     __ align(CodeEntryAlignment);
 2612 
 2613     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id;
 2614     StubCodeMark mark(this, stub_id);
 2615 
 2616     // pre wbsync is a no-op
 2617     // post wbsync requires a memory barrier (the AArch64 counterpart of x86 sfence)
 2618 
 2619     Label skip;
 2620     address start = __ pc();
 2621     __ enter();
 2622     __ cbnz(is_pre, skip);
 2623     __ cache_wbsync(false);
 2624     __ bind(skip);
 2625     __ leave();
 2626     __ ret(lr);
 2627 
 2628     return start;
 2629   }
 2630 
 2631   void generate_arraycopy_stubs() {
 2632     address entry;
 2633     address entry_jbyte_arraycopy;
 2634     address entry_jshort_arraycopy;
 2635     address entry_jint_arraycopy;
 2636     address entry_oop_arraycopy;
 2637     address entry_jlong_arraycopy;
 2638     address entry_checkcast_arraycopy;
 2639 
 2640     generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
 2641     generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
 2642 
 2643     generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
 2644     generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
 2645 
 2646     generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
 2647     generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
 2648 
 2649     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2650 
 2651     //*** jbyte
 2652     // Always need aligned and unaligned versions
 2653     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
 2654     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
 2655     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
 2656     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
 2657 
 2658     //*** jshort
 2659     // Always need aligned and unaligned versions
 2660     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
 2661     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
 2662     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
 2663     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
 2664 
 2665     //*** jint
 2666     // Aligned versions
 2667     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
 2668     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2669     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2670     // entry_jint_arraycopy always points to the unaligned version
 2671     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
 2672     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2673 
 2674     //*** jlong
 2675     // It is always aligned
 2676     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
 2677     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
 2678     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2679     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2680 
 2681     //*** oops
 2682     {
 2683       // With compressed oops we need unaligned versions; notice that
 2684       // we overwrite entry_oop_arraycopy.
 2685       bool aligned = !UseCompressedOops;
 2686 
 2687       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2688         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
 2689       StubRoutines::_arrayof_oop_arraycopy
 2690         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
 2691       // Aligned versions without pre-barriers
 2692       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2693         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
 2694       StubRoutines::_arrayof_oop_arraycopy_uninit
 2695         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
 2696     }
 2697 
 2698     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2699     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2700     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2701     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2702 
 2703     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 2704     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
 2705 
 2706     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 2707                                                               entry_jshort_arraycopy,
 2708                                                               entry_jint_arraycopy,
 2709                                                               entry_jlong_arraycopy);
 2710 
 2711     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 2712                                                                entry_jshort_arraycopy,
 2713                                                                entry_jint_arraycopy,
 2714                                                                entry_oop_arraycopy,
 2715                                                                entry_jlong_arraycopy,
 2716                                                                entry_checkcast_arraycopy);
 2717 
 2718     StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
 2719     StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
 2720     StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
 2721     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
 2722     StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
 2723     StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
 2724   }
 2725 
 2726   void generate_math_stubs() { Unimplemented(); }
 2727 
 2728   // Arguments:
 2729   //
 2730   // Inputs:
 2731   //   c_rarg0   - source byte array address
 2732   //   c_rarg1   - destination byte array address
 2733   //   c_rarg2   - K (key) in little endian int array
 2734   //
 2735   address generate_aescrypt_encryptBlock() {
 2736     __ align(CodeEntryAlignment);
 2737     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
 2738     StubCodeMark mark(this, stub_id);
 2739 
 2740     const Register from        = c_rarg0;  // source array address
 2741     const Register to          = c_rarg1;  // destination array address
 2742     const Register key         = c_rarg2;  // key array address
 2743     const Register keylen      = rscratch1;
 2744 
 2745     address start = __ pc();
 2746     __ enter();
 2747 
 2748     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2749 
 2750     __ aesenc_loadkeys(key, keylen);
 2751     __ aesecb_encrypt(from, to, keylen);
 2752 
 2753     __ mov(r0, 0);
 2754 
 2755     __ leave();
 2756     __ ret(lr);
 2757 
 2758     return start;
 2759   }
 2760 
 2761   // Arguments:
 2762   //
 2763   // Inputs:
 2764   //   c_rarg0   - source byte array address
 2765   //   c_rarg1   - destination byte array address
 2766   //   c_rarg2   - K (key) in little endian int array
 2767   //
 2768   address generate_aescrypt_decryptBlock() {
 2769     assert(UseAES, "need AES cryptographic extension support");
 2770     __ align(CodeEntryAlignment);
 2771     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
 2772     StubCodeMark mark(this, stub_id);
 2773     Label L_doLast;
 2774 
 2775     const Register from        = c_rarg0;  // source array address
 2776     const Register to          = c_rarg1;  // destination array address
 2777     const Register key         = c_rarg2;  // key array address
 2778     const Register keylen      = rscratch1;
 2779 
 2780     address start = __ pc();
 2781     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2782 
 2783     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2784 
 2785     __ aesecb_decrypt(from, to, key, keylen);
 2786 
 2787     __ mov(r0, 0);
 2788 
 2789     __ leave();
 2790     __ ret(lr);
 2791 
 2792     return start;
 2793   }
 2794 
 2795   // Arguments:
 2796   //
 2797   // Inputs:
 2798   //   c_rarg0   - source byte array address
 2799   //   c_rarg1   - destination byte array address
 2800   //   c_rarg2   - K (key) in little endian int array
 2801   //   c_rarg3   - r vector byte array address
 2802   //   c_rarg4   - input length
 2803   //
 2804   // Output:
  //   r0        - input length
 2806   //
 2807   address generate_cipherBlockChaining_encryptAESCrypt() {
 2808     assert(UseAES, "need AES cryptographic extension support");
 2809     __ align(CodeEntryAlignment);
 2810     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
 2811     StubCodeMark mark(this, stub_id);
 2812 
 2813     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2814 
 2815     const Register from        = c_rarg0;  // source array address
 2816     const Register to          = c_rarg1;  // destination array address
 2817     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address
                                           // and left holding the last encrypted block
 2820     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2821     const Register keylen      = rscratch1;
 2822 
 2823     address start = __ pc();
 2824 
 2825       __ enter();
 2826 
 2827       __ movw(rscratch2, len_reg);
 2828 
 2829       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
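      // keylen is the expanded key length in 32-bit words: 44 for AES-128,
      // 52 for AES-192 and 60 for AES-256. The branches below skip loading
      // the round keys that the shorter key sizes do not have.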
 2830 
 2831       __ ld1(v0, __ T16B, rvec);
 2832 
 2833       __ cmpw(keylen, 52);
 2834       __ br(Assembler::CC, L_loadkeys_44);
 2835       __ br(Assembler::EQ, L_loadkeys_52);
 2836 
 2837       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2838       __ rev32(v17, __ T16B, v17);
 2839       __ rev32(v18, __ T16B, v18);
 2840     __ BIND(L_loadkeys_52);
 2841       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2842       __ rev32(v19, __ T16B, v19);
 2843       __ rev32(v20, __ T16B, v20);
 2844     __ BIND(L_loadkeys_44);
 2845       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2846       __ rev32(v21, __ T16B, v21);
 2847       __ rev32(v22, __ T16B, v22);
 2848       __ rev32(v23, __ T16B, v23);
 2849       __ rev32(v24, __ T16B, v24);
 2850       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2851       __ rev32(v25, __ T16B, v25);
 2852       __ rev32(v26, __ T16B, v26);
 2853       __ rev32(v27, __ T16B, v27);
 2854       __ rev32(v28, __ T16B, v28);
 2855       __ ld1(v29, v30, v31, __ T16B, key);
 2856       __ rev32(v29, __ T16B, v29);
 2857       __ rev32(v30, __ T16B, v30);
 2858       __ rev32(v31, __ T16B, v31);
 2859 
 2860     __ BIND(L_aes_loop);
 2861       __ ld1(v1, __ T16B, __ post(from, 16));
 2862       __ eor(v0, __ T16B, v0, v1);
 2863 
 2864       __ br(Assembler::CC, L_rounds_44);
 2865       __ br(Assembler::EQ, L_rounds_52);
 2866 
 2867       __ aese(v0, v17); __ aesmc(v0, v0);
 2868       __ aese(v0, v18); __ aesmc(v0, v0);
 2869     __ BIND(L_rounds_52);
 2870       __ aese(v0, v19); __ aesmc(v0, v0);
 2871       __ aese(v0, v20); __ aesmc(v0, v0);
 2872     __ BIND(L_rounds_44);
 2873       __ aese(v0, v21); __ aesmc(v0, v0);
 2874       __ aese(v0, v22); __ aesmc(v0, v0);
 2875       __ aese(v0, v23); __ aesmc(v0, v0);
 2876       __ aese(v0, v24); __ aesmc(v0, v0);
 2877       __ aese(v0, v25); __ aesmc(v0, v0);
 2878       __ aese(v0, v26); __ aesmc(v0, v0);
 2879       __ aese(v0, v27); __ aesmc(v0, v0);
 2880       __ aese(v0, v28); __ aesmc(v0, v0);
 2881       __ aese(v0, v29); __ aesmc(v0, v0);
 2882       __ aese(v0, v30);
 2883       __ eor(v0, __ T16B, v0, v31);
 2884 
 2885       __ st1(v0, __ T16B, __ post(to, 16));
 2886 
 2887       __ subw(len_reg, len_reg, 16);
 2888       __ cbnzw(len_reg, L_aes_loop);
 2889 
 2890       __ st1(v0, __ T16B, rvec);
 2891 
 2892       __ mov(r0, rscratch2);
 2893 
 2894       __ leave();
 2895       __ ret(lr);
 2896 
 2897       return start;
 2898   }
 2899 
 2900   // Arguments:
 2901   //
 2902   // Inputs:
 2903   //   c_rarg0   - source byte array address
 2904   //   c_rarg1   - destination byte array address
 2905   //   c_rarg2   - K (key) in little endian int array
 2906   //   c_rarg3   - r vector byte array address
 2907   //   c_rarg4   - input length
 2908   //
 2909   // Output:
 2910   //   r0        - input length
 2911   //
 2912   address generate_cipherBlockChaining_decryptAESCrypt() {
 2913     assert(UseAES, "need AES cryptographic extension support");
 2914     __ align(CodeEntryAlignment);
 2915     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
 2916     StubCodeMark mark(this, stub_id);
 2917 
 2918     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2919 
 2920     const Register from        = c_rarg0;  // source array address
 2921     const Register to          = c_rarg1;  // destination array address
 2922     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address
                                           // and left holding the last input (ciphertext) block
 2925     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2926     const Register keylen      = rscratch1;
 2927 
 2928     address start = __ pc();
 2929 
 2930       __ enter();
 2931 
 2932       __ movw(rscratch2, len_reg);
 2933 
 2934       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
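      // As in the encrypt stub, keylen is 44/52/60 words for AES-128/192/256;
      // the branches below skip the round keys that shorter keys do not have.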
 2935 
 2936       __ ld1(v2, __ T16B, rvec);
 2937 
 2938       __ ld1(v31, __ T16B, __ post(key, 16));
 2939       __ rev32(v31, __ T16B, v31);
 2940 
 2941       __ cmpw(keylen, 52);
 2942       __ br(Assembler::CC, L_loadkeys_44);
 2943       __ br(Assembler::EQ, L_loadkeys_52);
 2944 
 2945       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2946       __ rev32(v17, __ T16B, v17);
 2947       __ rev32(v18, __ T16B, v18);
 2948     __ BIND(L_loadkeys_52);
 2949       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2950       __ rev32(v19, __ T16B, v19);
 2951       __ rev32(v20, __ T16B, v20);
 2952     __ BIND(L_loadkeys_44);
 2953       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2954       __ rev32(v21, __ T16B, v21);
 2955       __ rev32(v22, __ T16B, v22);
 2956       __ rev32(v23, __ T16B, v23);
 2957       __ rev32(v24, __ T16B, v24);
 2958       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2959       __ rev32(v25, __ T16B, v25);
 2960       __ rev32(v26, __ T16B, v26);
 2961       __ rev32(v27, __ T16B, v27);
 2962       __ rev32(v28, __ T16B, v28);
 2963       __ ld1(v29, v30, __ T16B, key);
 2964       __ rev32(v29, __ T16B, v29);
 2965       __ rev32(v30, __ T16B, v30);
 2966 
 2967     __ BIND(L_aes_loop);
 2968       __ ld1(v0, __ T16B, __ post(from, 16));
 2969       __ orr(v1, __ T16B, v0, v0);
 2970 
 2971       __ br(Assembler::CC, L_rounds_44);
 2972       __ br(Assembler::EQ, L_rounds_52);
 2973 
 2974       __ aesd(v0, v17); __ aesimc(v0, v0);
 2975       __ aesd(v0, v18); __ aesimc(v0, v0);
 2976     __ BIND(L_rounds_52);
 2977       __ aesd(v0, v19); __ aesimc(v0, v0);
 2978       __ aesd(v0, v20); __ aesimc(v0, v0);
 2979     __ BIND(L_rounds_44);
 2980       __ aesd(v0, v21); __ aesimc(v0, v0);
 2981       __ aesd(v0, v22); __ aesimc(v0, v0);
 2982       __ aesd(v0, v23); __ aesimc(v0, v0);
 2983       __ aesd(v0, v24); __ aesimc(v0, v0);
 2984       __ aesd(v0, v25); __ aesimc(v0, v0);
 2985       __ aesd(v0, v26); __ aesimc(v0, v0);
 2986       __ aesd(v0, v27); __ aesimc(v0, v0);
 2987       __ aesd(v0, v28); __ aesimc(v0, v0);
 2988       __ aesd(v0, v29); __ aesimc(v0, v0);
 2989       __ aesd(v0, v30);
 2990       __ eor(v0, __ T16B, v0, v31);
 2991       __ eor(v0, __ T16B, v0, v2);
 2992 
 2993       __ st1(v0, __ T16B, __ post(to, 16));
 2994       __ orr(v2, __ T16B, v1, v1);
 2995 
 2996       __ subw(len_reg, len_reg, 16);
 2997       __ cbnzw(len_reg, L_aes_loop);
 2998 
 2999       __ st1(v2, __ T16B, rvec);
 3000 
 3001       __ mov(r0, rscratch2);
 3002 
 3003       __ leave();
 3004       __ ret(lr);
 3005 
 3006     return start;
 3007   }
 3008 
  // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: in (the 128-bit value) and inc (the 64-bit increment);
  // both are preserved. The least-significant 64-bit word is in the
  // upper dword of each vector. The lower dword of inc must be zero.
  // Output: result
 3014   void be_add_128_64(FloatRegister result, FloatRegister in,
 3015                      FloatRegister inc, FloatRegister tmp) {
 3016     assert_different_registers(result, tmp, inc);
 3017 
 3018     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3019                                            // input
    __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
 3021     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3022                                            // MSD == 0 (must be!) to LSD
 3023     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3024   }
 3025 
 3026   // CTR AES crypt.
 3027   // Arguments:
 3028   //
 3029   // Inputs:
 3030   //   c_rarg0   - source byte array address
 3031   //   c_rarg1   - destination byte array address
 3032   //   c_rarg2   - K (key) in little endian int array
 3033   //   c_rarg3   - counter vector byte array address
 3034   //   c_rarg4   - input length
 3035   //   c_rarg5   - saved encryptedCounter start
 3036   //   c_rarg6   - saved used length
 3037   //
 3038   // Output:
 3039   //   r0       - input length
 3040   //
 3041   address generate_counterMode_AESCrypt() {
 3042     const Register in = c_rarg0;
 3043     const Register out = c_rarg1;
 3044     const Register key = c_rarg2;
 3045     const Register counter = c_rarg3;
 3046     const Register saved_len = c_rarg4, len = r10;
 3047     const Register saved_encrypted_ctr = c_rarg5;
 3048     const Register used_ptr = c_rarg6, used = r12;
 3049 
 3050     const Register offset = r7;
 3051     const Register keylen = r11;
 3052 
 3053     const unsigned char block_size = 16;
 3054     const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8. 8 gives slightly faster
    // performance with larger data sizes, but it also means that the
    // fast path isn't used until you have at least 8 blocks, and up
    // to 127 bytes of data will be processed on the slow path. For
    // that reason, and also so as not to blow away too much icache, 4
    // blocks seems like a sensible compromise.
 3061 
 3062     // Algorithm:
 3063     //
 3064     //    if (len == 0) {
 3065     //        goto DONE;
 3066     //    }
 3067     //    int result = len;
 3068     //    do {
 3069     //        if (used >= blockSize) {
 3070     //            if (len >= bulk_width * blockSize) {
 3071     //                CTR_large_block();
 3072     //                if (len == 0)
 3073     //                    goto DONE;
 3074     //            }
 3075     //            for (;;) {
 3076     //                16ByteVector v0 = counter;
 3077     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3078     //                used = 0;
 3079     //                if (len < blockSize)
 3080     //                    break;    /* goto NEXT */
 3081     //                16ByteVector v1 = load16Bytes(in, offset);
 3082     //                v1 = v1 ^ encryptedCounter;
 3083     //                store16Bytes(out, offset);
 3084     //                used = blockSize;
 3085     //                offset += blockSize;
 3086     //                len -= blockSize;
 3087     //                if (len == 0)
 3088     //                    goto DONE;
 3089     //            }
 3090     //        }
 3091     //      NEXT:
 3092     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3093     //        len--;
 3094     //    } while (len != 0);
 3095     //  DONE:
 3096     //    return result;
 3097     //
 3098     // CTR_large_block()
 3099     //    Wide bulk encryption of whole blocks.
 3100 
 3101     __ align(CodeEntryAlignment);
 3102     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
 3103     StubCodeMark mark(this, stub_id);
 3104     const address start = __ pc();
 3105     __ enter();
 3106 
 3107     Label DONE, CTR_large_block, large_block_return;
 3108     __ ldrw(used, Address(used_ptr));
 3109     __ cbzw(saved_len, DONE);
 3110 
 3111     __ mov(len, saved_len);
 3112     __ mov(offset, 0);
 3113 
 3114     // Compute #rounds for AES based on the length of the key array
 3115     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3116 
 3117     __ aesenc_loadkeys(key, keylen);
 3118 
 3119     {
 3120       Label L_CTR_loop, NEXT;
 3121 
 3122       __ bind(L_CTR_loop);
 3123 
 3124       __ cmp(used, block_size);
 3125       __ br(__ LO, NEXT);
 3126 
 3127       // Maybe we have a lot of data
 3128       __ subsw(rscratch1, len, bulk_width * block_size);
 3129       __ br(__ HS, CTR_large_block);
 3130       __ BIND(large_block_return);
 3131       __ cbzw(len, DONE);
 3132 
 3133       // Setup the counter
 3134       __ movi(v4, __ T4S, 0);
 3135       __ movi(v5, __ T4S, 1);
 3136       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3137 
 3138       // 128-bit big-endian increment
 3139       __ ld1(v0, __ T16B, counter);
 3140       __ rev64(v16, __ T16B, v0);
 3141       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3142       __ rev64(v16, __ T16B, v16);
 3143       __ st1(v16, __ T16B, counter);
 3144       // Previous counter value is in v0
 3145       // v4 contains { 0, 1 }
 3146 
 3147       {
 3148         // We have fewer than bulk_width blocks of data left. Encrypt
 3149         // them one by one until there is less than a full block
 3150         // remaining, being careful to save both the encrypted counter
 3151         // and the counter.
 3152 
 3153         Label inner_loop;
 3154         __ bind(inner_loop);
 3155         // Counter to encrypt is in v0
 3156         __ aesecb_encrypt(noreg, noreg, keylen);
 3157         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3158 
 3159         // Do we have a remaining full block?
 3160 
 3161         __ mov(used, 0);
 3162         __ cmp(len, block_size);
 3163         __ br(__ LO, NEXT);
 3164 
 3165         // Yes, we have a full block
 3166         __ ldrq(v1, Address(in, offset));
 3167         __ eor(v1, __ T16B, v1, v0);
 3168         __ strq(v1, Address(out, offset));
 3169         __ mov(used, block_size);
 3170         __ add(offset, offset, block_size);
 3171 
 3172         __ subw(len, len, block_size);
 3173         __ cbzw(len, DONE);
 3174 
 3175         // Increment the counter, store it back
 3176         __ orr(v0, __ T16B, v16, v16);
 3177         __ rev64(v16, __ T16B, v16);
 3178         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3179         __ rev64(v16, __ T16B, v16);
 3180         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3181 
 3182         __ b(inner_loop);
 3183       }
 3184 
 3185       __ BIND(NEXT);
 3186 
 3187       // Encrypt a single byte, and loop.
 3188       // We expect this to be a rare event.
 3189       __ ldrb(rscratch1, Address(in, offset));
 3190       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3191       __ eor(rscratch1, rscratch1, rscratch2);
 3192       __ strb(rscratch1, Address(out, offset));
 3193       __ add(offset, offset, 1);
 3194       __ add(used, used, 1);
      __ subw(len, len, 1);
 3196       __ cbnzw(len, L_CTR_loop);
 3197     }
 3198 
 3199     __ bind(DONE);
 3200     __ strw(used, Address(used_ptr));
 3201     __ mov(r0, saved_len);
 3202 
 3203     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3204     __ ret(lr);
 3205 
 3206     // Bulk encryption
 3207 
    __ BIND(CTR_large_block);
 3209     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3210 
 3211     if (bulk_width == 8) {
 3212       __ sub(sp, sp, 4 * 16);
 3213       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3214     }
 3215     __ sub(sp, sp, 4 * 16);
 3216     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3217     RegSet saved_regs = (RegSet::of(in, out, offset)
 3218                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3219     __ push(saved_regs, sp);
 3220     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3221     __ add(in, in, offset);
 3222     __ add(out, out, offset);
 3223 
 3224     // Keys should already be loaded into the correct registers
 3225 
 3226     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3227     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3228 
 3229     // AES/CTR loop
 3230     {
 3231       Label L_CTR_loop;
 3232       __ BIND(L_CTR_loop);
 3233 
 3234       // Setup the counters
 3235       __ movi(v8, __ T4S, 0);
 3236       __ movi(v9, __ T4S, 1);
 3237       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3238 
 3239       for (int i = 0; i < bulk_width; i++) {
 3240         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3241         __ rev64(v0_ofs, __ T16B, v16);
 3242         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3243       }
 3244 
 3245       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3246 
 3247       // Encrypt the counters
 3248       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3249 
 3250       if (bulk_width == 8) {
 3251         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3252       }
 3253 
 3254       // XOR the encrypted counters with the inputs
 3255       for (int i = 0; i < bulk_width; i++) {
 3256         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3257         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3258         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3259       }
 3260 
 3261       // Write the encrypted data
 3262       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3263       if (bulk_width == 8) {
 3264         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3265       }
 3266 
 3267       __ subw(len, len, 16 * bulk_width);
 3268       __ cbnzw(len, L_CTR_loop);
 3269     }
 3270 
 3271     // Save the counter back where it goes
 3272     __ rev64(v16, __ T16B, v16);
 3273     __ st1(v16, __ T16B, counter);
 3274 
 3275     __ pop(saved_regs, sp);
 3276 
 3277     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3278     if (bulk_width == 8) {
 3279       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3280     }
 3281 
 3282     __ andr(rscratch1, len, -16 * bulk_width);
 3283     __ sub(len, len, rscratch1);
 3284     __ add(offset, offset, rscratch1);
 3285     __ mov(used, 16);
 3286     __ strw(used, Address(used_ptr));
 3287     __ b(large_block_return);
 3288 
 3289     return start;
 3290   }
 3291 
 3292   // Vector AES Galois Counter Mode implementation. Parameters:
 3293   //
 3294   // in = c_rarg0
 3295   // len = c_rarg1
 3296   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3297   // out = c_rarg3
 3298   // key = c_rarg4
 3299   // state = c_rarg5 - GHASH.state
 3300   // subkeyHtbl = c_rarg6 - powers of H
 3301   // counter = c_rarg7 - 16 bytes of CTR
 3302   // return - number of processed bytes
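  //
  // Structure of the code below (a summary, not a specification): len is
  // first rounded down to a multiple of 8 blocks (128 bytes); the CTR loop
  // encrypts 8 counters at a time and XORs them with the input; finally
  // ghash_processBlocks_wide folds the resulting ciphertext into the GHASH
  // state. The rounded-down byte count is returned in r0.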
 3303   address generate_galoisCounterMode_AESCrypt() {
 3304     address ghash_polynomial = __ pc();
 3305     __ emit_int64(0x87);  // The low-order bits of the field
 3306                           // polynomial (i.e. p = z^7+z^2+z+1)
 3307                           // repeated in the low and high parts of a
 3308                           // 128-bit vector
 3309     __ emit_int64(0x87);
 3310 
 3311     __ align(CodeEntryAlignment);
 3312     StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id;
 3313     StubCodeMark mark(this, stub_id);
 3314     address start = __ pc();
 3315     __ enter();
 3316 
 3317     const Register in = c_rarg0;
 3318     const Register len = c_rarg1;
 3319     const Register ct = c_rarg2;
 3320     const Register out = c_rarg3;
 3322 
 3323     const Register key = c_rarg4;
 3324     const Register state = c_rarg5;
 3325 
 3326     const Register subkeyHtbl = c_rarg6;
 3327 
    const Register counter = c_rarg7;  // read on entry for the initial CTR value
                                       // and updated with the incremented counter in the end
 3329 
 3330     const Register keylen = r10;
 3331     // Save state before entering routine
 3332     __ sub(sp, sp, 4 * 16);
 3333     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3334     __ sub(sp, sp, 4 * 16);
 3335     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3336 
 3337     // __ andr(len, len, -512);
 3338     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3339     __ str(len, __ pre(sp, -2 * wordSize));
 3340 
 3341     Label DONE;
 3342     __ cbz(len, DONE);
 3343 
 3344     // Compute #rounds for AES based on the length of the key array
 3345     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3346 
 3347     __ aesenc_loadkeys(key, keylen);
 3348     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3349     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3350 
 3351     // AES/CTR loop
 3352     {
 3353       Label L_CTR_loop;
 3354       __ BIND(L_CTR_loop);
 3355 
 3356       // Setup the counters
 3357       __ movi(v8, __ T4S, 0);
 3358       __ movi(v9, __ T4S, 1);
 3359       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3360 
 3361       assert(v0->encoding() < v8->encoding(), "");
 3362       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3363         FloatRegister f = as_FloatRegister(i);
 3364         __ rev32(f, __ T16B, v16);
 3365         __ addv(v16, __ T4S, v16, v8);
 3366       }
 3367 
 3368       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3369 
 3370       // Encrypt the counters
 3371       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3372 
 3373       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3374 
 3375       // XOR the encrypted counters with the inputs
 3376       for (int i = 0; i < 8; i++) {
 3377         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3378         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3379         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3380       }
 3381       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3382       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3383 
 3384       __ subw(len, len, 16 * 8);
 3385       __ cbnzw(len, L_CTR_loop);
 3386     }
 3387 
 3388     __ rev32(v16, __ T16B, v16);
 3389     __ st1(v16, __ T16B, counter);
 3390 
 3391     __ ldr(len, Address(sp));
 3392     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3393 
 3394     // GHASH/CTR loop
 3395     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3396                                 len, /*unrolls*/4);
 3397 
 3398 #ifdef ASSERT
 3399     { Label L;
 3400       __ cmp(len, (unsigned char)0);
 3401       __ br(Assembler::EQ, L);
 3402       __ stop("stubGenerator: abort");
 3403       __ bind(L);
 3404   }
 3405 #endif
 3406 
    __ bind(DONE);
 3408     // Return the number of bytes processed
 3409     __ ldr(r0, __ post(sp, 2 * wordSize));
 3410 
 3411     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3412     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3413 
 3414     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3415     __ ret(lr);
    return start;
 3417   }
 3418 
 3419   class Cached64Bytes {
 3420   private:
 3421     MacroAssembler *_masm;
 3422     Register _regs[8];
 3423 
 3424   public:
 3425     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3427       auto it = rs.begin();
 3428       for (auto &r: _regs) {
 3429         r = *it;
 3430         ++it;
 3431       }
 3432     }
 3433 
 3434     void gen_loads(Register base) {
 3435       for (int i = 0; i < 8; i += 2) {
 3436         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3437       }
 3438     }
 3439 
 3440     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
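    // For example, extract_u32(dest, 5) extracts bits [32, 64) of _regs[2],
    // i.e. the sixth 4-byte word of the cached 64-byte block.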
 3441     void extract_u32(Register dest, int i) {
 3442       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3443     }
 3444   };
 3445 
  // Utility routines for MD5.
  // Clobbers r10 and r11.
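  //
  // Each helper computes one MD5 step. In scalar form (a reference model,
  // not the generated code), with x[k] the k-th 32-bit word of the block
  // and rol32 a left rotation:
  //   FF: r1 = r2 + rol32(r1 + ((r2 & r3) | (~r2 & r4)) + x[k] + t, s)
  //   GG: r1 = r2 + rol32(r1 + ((r2 & r4) | (r3 & ~r4)) + x[k] + t, s)
  //   HH: r1 = r2 + rol32(r1 + (r2 ^ r3 ^ r4)           + x[k] + t, s)
  //   II: r1 = r2 + rol32(r1 + (r3 ^ (r2 | ~r4))        + x[k] + t, s)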
 3448   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3449               int k, int s, int t) {
 3450     Register rscratch3 = r10;
 3451     Register rscratch4 = r11;
 3452 
 3453     __ eorw(rscratch3, r3, r4);
 3454     __ movw(rscratch2, t);
 3455     __ andw(rscratch3, rscratch3, r2);
 3456     __ addw(rscratch4, r1, rscratch2);
 3457     reg_cache.extract_u32(rscratch1, k);
 3458     __ eorw(rscratch3, rscratch3, r4);
 3459     __ addw(rscratch4, rscratch4, rscratch1);
 3460     __ addw(rscratch3, rscratch3, rscratch4);
 3461     __ rorw(rscratch2, rscratch3, 32 - s);
 3462     __ addw(r1, rscratch2, r2);
 3463   }
 3464 
 3465   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3466               int k, int s, int t) {
 3467     Register rscratch3 = r10;
 3468     Register rscratch4 = r11;
 3469 
 3470     reg_cache.extract_u32(rscratch1, k);
 3471     __ movw(rscratch2, t);
 3472     __ addw(rscratch4, r1, rscratch2);
 3473     __ addw(rscratch4, rscratch4, rscratch1);
 3474     __ bicw(rscratch2, r3, r4);
 3475     __ andw(rscratch3, r2, r4);
 3476     __ addw(rscratch2, rscratch2, rscratch4);
 3477     __ addw(rscratch2, rscratch2, rscratch3);
 3478     __ rorw(rscratch2, rscratch2, 32 - s);
 3479     __ addw(r1, rscratch2, r2);
 3480   }
 3481 
 3482   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3483               int k, int s, int t) {
 3484     Register rscratch3 = r10;
 3485     Register rscratch4 = r11;
 3486 
 3487     __ eorw(rscratch3, r3, r4);
 3488     __ movw(rscratch2, t);
 3489     __ addw(rscratch4, r1, rscratch2);
 3490     reg_cache.extract_u32(rscratch1, k);
 3491     __ eorw(rscratch3, rscratch3, r2);
 3492     __ addw(rscratch4, rscratch4, rscratch1);
 3493     __ addw(rscratch3, rscratch3, rscratch4);
 3494     __ rorw(rscratch2, rscratch3, 32 - s);
 3495     __ addw(r1, rscratch2, r2);
 3496   }
 3497 
 3498   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3499               int k, int s, int t) {
 3500     Register rscratch3 = r10;
 3501     Register rscratch4 = r11;
 3502 
 3503     __ movw(rscratch3, t);
 3504     __ ornw(rscratch2, r2, r4);
 3505     __ addw(rscratch4, r1, rscratch3);
 3506     reg_cache.extract_u32(rscratch1, k);
 3507     __ eorw(rscratch3, rscratch2, r3);
 3508     __ addw(rscratch4, rscratch4, rscratch1);
 3509     __ addw(rscratch3, rscratch3, rscratch4);
 3510     __ rorw(rscratch2, rscratch3, 32 - s);
 3511     __ addw(r1, rscratch2, r2);
 3512   }
 3513 
 3514   // Arguments:
 3515   //
 3516   // Inputs:
 3517   //   c_rarg0   - byte[]  source+offset
 3518   //   c_rarg1   - int[]   SHA.state
 3519   //   c_rarg2   - int     offset
 3520   //   c_rarg3   - int     limit
 3521   //
 3522   address generate_md5_implCompress(StubGenStubId stub_id) {
 3523     bool multi_block;
 3524     switch (stub_id) {
 3525     case md5_implCompress_id:
 3526       multi_block = false;
 3527       break;
 3528     case md5_implCompressMB_id:
 3529       multi_block = true;
 3530       break;
 3531     default:
 3532       ShouldNotReachHere();
 3533     }
 3534     __ align(CodeEntryAlignment);
 3535 
 3536     StubCodeMark mark(this, stub_id);
 3537     address start = __ pc();
 3538 
 3539     Register buf       = c_rarg0;
 3540     Register state     = c_rarg1;
 3541     Register ofs       = c_rarg2;
 3542     Register limit     = c_rarg3;
 3543     Register a         = r4;
 3544     Register b         = r5;
 3545     Register c         = r6;
 3546     Register d         = r7;
 3547     Register rscratch3 = r10;
 3548     Register rscratch4 = r11;
 3549 
 3550     Register state_regs[2] = { r12, r13 };
 3551     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3552     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3553 
 3554     __ push(saved_regs, sp);
 3555 
 3556     __ ldp(state_regs[0], state_regs[1], Address(state));
 3557     __ ubfx(a, state_regs[0],  0, 32);
 3558     __ ubfx(b, state_regs[0], 32, 32);
 3559     __ ubfx(c, state_regs[1],  0, 32);
 3560     __ ubfx(d, state_regs[1], 32, 32);
 3561 
 3562     Label md5_loop;
 3563     __ BIND(md5_loop);
 3564 
 3565     reg_cache.gen_loads(buf);
 3566 
 3567     // Round 1
 3568     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3569     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3570     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3571     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3572     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3573     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3574     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3575     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3576     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3577     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3578     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3579     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3580     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3581     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3582     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3583     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3584 
 3585     // Round 2
 3586     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3587     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3588     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3589     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3590     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3591     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3592     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3593     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3594     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3595     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3596     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3597     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3598     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3599     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3600     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3601     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3602 
 3603     // Round 3
 3604     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3605     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3606     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3607     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3608     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3609     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3610     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3611     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3612     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3613     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3614     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3615     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3616     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3617     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3618     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3619     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3620 
 3621     // Round 4
 3622     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3623     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3624     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3625     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3626     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3627     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3628     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3629     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3630     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3631     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3632     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3633     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3634     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3635     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3636     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3637     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3638 
 3639     __ addw(a, state_regs[0], a);
 3640     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3641     __ addw(b, rscratch2, b);
 3642     __ addw(c, state_regs[1], c);
 3643     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3644     __ addw(d, rscratch4, d);
 3645 
 3646     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3647     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3648 
 3649     if (multi_block) {
 3650       __ add(buf, buf, 64);
 3651       __ add(ofs, ofs, 64);
 3652       __ cmp(ofs, limit);
 3653       __ br(Assembler::LE, md5_loop);
 3654       __ mov(c_rarg0, ofs); // return ofs
 3655     }
 3656 
 3657     // write hash values back in the correct order
 3658     __ stp(state_regs[0], state_regs[1], Address(state));
 3659 
 3660     __ pop(saved_regs, sp);
 3661 
 3662     __ ret(lr);
 3663 
 3664     return start;
 3665   }
 3666 
 3667   // Arguments:
 3668   //
 3669   // Inputs:
 3670   //   c_rarg0   - byte[]  source+offset
 3671   //   c_rarg1   - int[]   SHA.state
 3672   //   c_rarg2   - int     offset
 3673   //   c_rarg3   - int     limit
 3674   //
 3675   address generate_sha1_implCompress(StubGenStubId stub_id) {
 3676     bool multi_block;
 3677     switch (stub_id) {
 3678     case sha1_implCompress_id:
 3679       multi_block = false;
 3680       break;
 3681     case sha1_implCompressMB_id:
 3682       multi_block = true;
 3683       break;
 3684     default:
 3685       ShouldNotReachHere();
 3686     }
 3687 
 3688     __ align(CodeEntryAlignment);
 3689 
 3690     StubCodeMark mark(this, stub_id);
 3691     address start = __ pc();
 3692 
 3693     Register buf   = c_rarg0;
 3694     Register state = c_rarg1;
 3695     Register ofs   = c_rarg2;
 3696     Register limit = c_rarg3;
 3697 
 3698     Label keys;
 3699     Label sha1_loop;
 3700 
 3701     // load the keys into v0..v3
 3702     __ adr(rscratch1, keys);
 3703     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word state into v6, v7
 3705     __ ldrq(v6, Address(state, 0));
 3706     __ ldrs(v7, Address(state, 16));
 3707 
 3708 
 3709     __ BIND(sha1_loop);
 3710     // load 64 bytes of data into v16..v19
 3711     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3712     __ rev32(v16, __ T16B, v16);
 3713     __ rev32(v17, __ T16B, v17);
 3714     __ rev32(v18, __ T16B, v18);
 3715     __ rev32(v19, __ T16B, v19);
 3716 
 3717     // do the sha1
 3718     __ addv(v4, __ T4S, v16, v0);
 3719     __ orr(v20, __ T16B, v6, v6);
 3720 
 3721     FloatRegister d0 = v16;
 3722     FloatRegister d1 = v17;
 3723     FloatRegister d2 = v18;
 3724     FloatRegister d3 = v19;
 3725 
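    // Each iteration of this loop covers four of the 80 SHA-1 rounds:
    // sha1c (Ch) for quad-rounds 0-4, sha1p (parity) for quad-rounds 5-9
    // and 15-19, and sha1m (Maj) for quad-rounds 10-14. The first 16
    // iterations also extend the message schedule with sha1su0/sha1su1.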
 3726     for (int round = 0; round < 20; round++) {
 3727       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3728       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3729       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3730       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3731       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3732 
 3733       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3734       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3735       __ sha1h(tmp2, __ T4S, v20);
 3736       if (round < 5)
 3737         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3738       else if (round < 10 || round >= 15)
 3739         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3740       else
 3741         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3742       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3743 
 3744       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3745     }
 3746 
 3747     __ addv(v7, __ T2S, v7, v21);
 3748     __ addv(v6, __ T4S, v6, v20);
 3749 
 3750     if (multi_block) {
 3751       __ add(ofs, ofs, 64);
 3752       __ cmp(ofs, limit);
 3753       __ br(Assembler::LE, sha1_loop);
 3754       __ mov(c_rarg0, ofs); // return ofs
 3755     }
 3756 
 3757     __ strq(v6, Address(state, 0));
 3758     __ strs(v7, Address(state, 16));
 3759 
 3760     __ ret(lr);
 3761 
 3762     __ bind(keys);
 3763     __ emit_int32(0x5a827999);
 3764     __ emit_int32(0x6ed9eba1);
 3765     __ emit_int32(0x8f1bbcdc);
 3766     __ emit_int32(0xca62c1d6);
 3767 
 3768     return start;
 3769   }
 3770 
 3771 
 3772   // Arguments:
 3773   //
 3774   // Inputs:
 3775   //   c_rarg0   - byte[]  source+offset
 3776   //   c_rarg1   - int[]   SHA.state
 3777   //   c_rarg2   - int     offset
 3778   //   c_rarg3   - int     limit
 3779   //
 3780   address generate_sha256_implCompress(StubGenStubId stub_id) {
 3781     bool multi_block;
 3782     switch (stub_id) {
 3783     case sha256_implCompress_id:
 3784       multi_block = false;
 3785       break;
 3786     case sha256_implCompressMB_id:
 3787       multi_block = true;
 3788       break;
 3789     default:
 3790       ShouldNotReachHere();
 3791     }
 3792 
 3793     static const uint32_t round_consts[64] = {
 3794       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3795       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3796       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3797       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3798       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3799       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3800       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3801       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3802       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3803       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3804       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3805       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3806       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3807       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3808       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3809       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3810     };
 3811 
 3812     __ align(CodeEntryAlignment);
 3813 
 3814     StubCodeMark mark(this, stub_id);
 3815     address start = __ pc();
 3816 
 3817     Register buf   = c_rarg0;
 3818     Register state = c_rarg1;
 3819     Register ofs   = c_rarg2;
 3820     Register limit = c_rarg3;
 3821 
    Label sha256_loop;
 3823 
 3824     __ stpd(v8, v9, __ pre(sp, -32));
 3825     __ stpd(v10, v11, Address(sp, 16));
 3826 
 3827 // dga == v0
 3828 // dgb == v1
 3829 // dg0 == v2
 3830 // dg1 == v3
 3831 // dg2 == v4
 3832 // t0 == v6
 3833 // t1 == v7
 3834 
 3835     // load 16 keys to v16..v31
 3836     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3837     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3838     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3839     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3840     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3841 
    // load the 8-word (256-bit) state
 3843     __ ldpq(v0, v1, state);
 3844 
    __ BIND(sha256_loop);
 3846     // load 64 bytes of data into v8..v11
 3847     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3848     __ rev32(v8, __ T16B, v8);
 3849     __ rev32(v9, __ T16B, v9);
 3850     __ rev32(v10, __ T16B, v10);
 3851     __ rev32(v11, __ T16B, v11);
 3852 
 3853     __ addv(v6, __ T4S, v8, v16);
 3854     __ orr(v2, __ T16B, v0, v0);
 3855     __ orr(v3, __ T16B, v1, v1);
 3856 
 3857     FloatRegister d0 = v8;
 3858     FloatRegister d1 = v9;
 3859     FloatRegister d2 = v10;
 3860     FloatRegister d3 = v11;
 3861 
 3862 
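    // Each iteration of this loop covers four of the 64 SHA-256 rounds via
    // sha256h/sha256h2. The first 12 iterations also extend the message
    // schedule with sha256su0/sha256su1; the round constants come from
    // v16..v31, four per register.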
 3863     for (int round = 0; round < 16; round++) {
 3864       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 3865       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 3866       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 3867       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 3868 
 3869       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 3870        __ orr(v4, __ T16B, v2, v2);
 3871       if (round < 15)
 3872         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 3873       __ sha256h(v2, __ T4S, v3, tmp2);
 3874       __ sha256h2(v3, __ T4S, v4, tmp2);
 3875       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 3876 
 3877       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3878     }
 3879 
 3880     __ addv(v0, __ T4S, v0, v2);
 3881     __ addv(v1, __ T4S, v1, v3);
 3882 
 3883     if (multi_block) {
 3884       __ add(ofs, ofs, 64);
 3885       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
 3887       __ mov(c_rarg0, ofs); // return ofs
 3888     }
 3889 
 3890     __ ldpd(v10, v11, Address(sp, 16));
 3891     __ ldpd(v8, v9, __ post(sp, 32));
 3892 
 3893     __ stpq(v0, v1, state);
 3894 
 3895     __ ret(lr);
 3896 
 3897     return start;
 3898   }
 3899 
 3900   // Double rounds for sha512.
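  // Each call covers two of the 80 SHA-512 rounds using sha512h/sha512h2.
  // For dr < 32 it also extends the message schedule with
  // sha512su0/sha512su1, and for dr < 36 it streams the next pair of round
  // constants into vrc1 from the table addressed by rscratch2.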
 3901   void sha512_dround(int dr,
 3902                      FloatRegister vi0, FloatRegister vi1,
 3903                      FloatRegister vi2, FloatRegister vi3,
 3904                      FloatRegister vi4, FloatRegister vrc0,
 3905                      FloatRegister vrc1, FloatRegister vin0,
 3906                      FloatRegister vin1, FloatRegister vin2,
 3907                      FloatRegister vin3, FloatRegister vin4) {
 3908       if (dr < 36) {
 3909         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 3910       }
 3911       __ addv(v5, __ T2D, vrc0, vin0);
 3912       __ ext(v6, __ T16B, vi2, vi3, 8);
 3913       __ ext(v5, __ T16B, v5, v5, 8);
 3914       __ ext(v7, __ T16B, vi1, vi2, 8);
 3915       __ addv(vi3, __ T2D, vi3, v5);
 3916       if (dr < 32) {
 3917         __ ext(v5, __ T16B, vin3, vin4, 8);
 3918         __ sha512su0(vin0, __ T2D, vin1);
 3919       }
 3920       __ sha512h(vi3, __ T2D, v6, v7);
 3921       if (dr < 32) {
 3922         __ sha512su1(vin0, __ T2D, vin2, v5);
 3923       }
 3924       __ addv(vi4, __ T2D, vi1, vi3);
 3925       __ sha512h2(vi3, __ T2D, vi1, vi0);
 3926   }
 3927 
 3928   // Arguments:
 3929   //
 3930   // Inputs:
 3931   //   c_rarg0   - byte[]  source+offset
 3932   //   c_rarg1   - int[]   SHA.state
 3933   //   c_rarg2   - int     offset
 3934   //   c_rarg3   - int     limit
 3935   //
 3936   address generate_sha512_implCompress(StubGenStubId stub_id) {
 3937     bool multi_block;
 3938     switch (stub_id) {
 3939     case sha512_implCompress_id:
 3940       multi_block = false;
 3941       break;
 3942     case sha512_implCompressMB_id:
 3943       multi_block = true;
 3944       break;
 3945     default:
 3946       ShouldNotReachHere();
 3947     }
 3948 
 3949     static const uint64_t round_consts[80] = {
 3950       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 3951       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 3952       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 3953       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 3954       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 3955       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 3956       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 3957       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 3958       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 3959       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 3960       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 3961       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 3962       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 3963       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 3964       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 3965       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 3966       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 3967       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 3968       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 3969       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 3970       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 3971       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 3972       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 3973       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 3974       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 3975       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 3976       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 3977     };
 3978 
 3979     __ align(CodeEntryAlignment);
 3980 
 3981     StubCodeMark mark(this, stub_id);
 3982     address start = __ pc();
 3983 
 3984     Register buf   = c_rarg0;
 3985     Register state = c_rarg1;
 3986     Register ofs   = c_rarg2;
 3987     Register limit = c_rarg3;
 3988 
 3989     __ stpd(v8, v9, __ pre(sp, -64));
 3990     __ stpd(v10, v11, Address(sp, 16));
 3991     __ stpd(v12, v13, Address(sp, 32));
 3992     __ stpd(v14, v15, Address(sp, 48));
 3993 
 3994     Label sha512_loop;
 3995 
 3996     // load state
 3997     __ ld1(v8, v9, v10, v11, __ T2D, state);
 3998 
 3999     // load first 4 round constants
 4000     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4001     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4002 
 4003     __ BIND(sha512_loop);
 4004     // load 128B of data into v12..v19
 4005     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4006     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4007     __ rev64(v12, __ T16B, v12);
 4008     __ rev64(v13, __ T16B, v13);
 4009     __ rev64(v14, __ T16B, v14);
 4010     __ rev64(v15, __ T16B, v15);
 4011     __ rev64(v16, __ T16B, v16);
 4012     __ rev64(v17, __ T16B, v17);
 4013     __ rev64(v18, __ T16B, v18);
 4014     __ rev64(v19, __ T16B, v19);
 4015 
 4016     __ mov(rscratch2, rscratch1);
 4017 
 4018     __ mov(v0, __ T16B, v8);
 4019     __ mov(v1, __ T16B, v9);
 4020     __ mov(v2, __ T16B, v10);
 4021     __ mov(v3, __ T16B, v11);
 4022 
 4023     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4024     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4025     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4026     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4027     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4028     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4029     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4030     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4031     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4032     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4033     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4034     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4035     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4036     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4037     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4038     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4039     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4040     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4041     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4042     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4043     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4044     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4045     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4046     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4047     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4048     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4049     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4050     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4051     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4052     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4053     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4054     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4055     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4056     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4057     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4058     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4059     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4060     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4061     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4062     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4063 
 4064     __ addv(v8, __ T2D, v8, v0);
 4065     __ addv(v9, __ T2D, v9, v1);
 4066     __ addv(v10, __ T2D, v10, v2);
 4067     __ addv(v11, __ T2D, v11, v3);
 4068 
 4069     if (multi_block) {
 4070       __ add(ofs, ofs, 128);
 4071       __ cmp(ofs, limit);
 4072       __ br(Assembler::LE, sha512_loop);
 4073       __ mov(c_rarg0, ofs); // return ofs
 4074     }
 4075 
 4076     __ st1(v8, v9, v10, v11, __ T2D, state);
 4077 
 4078     __ ldpd(v14, v15, Address(sp, 48));
 4079     __ ldpd(v12, v13, Address(sp, 32));
 4080     __ ldpd(v10, v11, Address(sp, 16));
 4081     __ ldpd(v8, v9, __ post(sp, 64));
 4082 
 4083     __ ret(lr);
 4084 
 4085     return start;
 4086   }
 4087 
 4088   // Execute one round of keccak of two computations in parallel.
 4089   // One of the states should be loaded into the lower halves of
 4090   // the vector registers v0-v24, the other should be loaded into
 4091   // the upper halves of those registers. The ld1r instruction loads
 4092   // the round constant into both halves of register v31.
  // Intermediate results c0...c4 and d0...d4 are computed
  // in registers v25...v30.
 4095   // All vector instructions that are used operate on both register
 4096   // halves in parallel.
 4097   // If only a single computation is needed, one can only load the lower halves.
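  // The instruction sequence follows the standard Keccak-f[1600] step order:
  // the eor3 instructions compute theta's column parities c0..c4, the rax1
  // instructions the d values, the xar block performs rho/pi (rotate and
  // move into the permuted lane), the bcax groups implement chi, and the
  // final eor with v31 is iota.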
 4098   void keccak_round(Register rscratch1) {
 4099   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4102   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4103   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4104   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4105   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4106   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4107   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4108   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4109 
 4110   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4111   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4112   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4113   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4114   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4115 
 4116   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4117   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4119   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4120   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4121   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4122   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4123   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4124   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4125   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4126   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4127   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4128   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4129   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4130   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4131   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4132   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4133   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4134   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4135   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4136   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4137   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4138   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4139   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4140   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4141 
 4142   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21' & a22)
 4143   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4144   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4145   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4146   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4147 
 4148   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4149 
 4150   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4151   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4152   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4153   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4154   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4155 
 4156   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4157   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4158   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4159   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4160   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4161 
 4162   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4163   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4164   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4165   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4166   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4167 
 4168   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4169   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4170   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4171   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4172   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4173 
 4174   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4175   }
 4176 
 4177   // Arguments:
 4178   //
 4179   // Inputs:
 4180   //   c_rarg0   - byte[]  source+offset
 4181   //   c_rarg1   - byte[]  SHA.state
 4182   //   c_rarg2   - int     block_size
 4183   //   c_rarg3   - int     offset
 4184   //   c_rarg4   - int     limit
 4185   //
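        // The block_size argument is the sponge rate in bytes and selects the
        // variant (the values tested by the branches below):
        //   72  - SHA3-512          104 - SHA3-384
        //   136 - SHA3-256 / SHAKE256
        //   144 - SHA3-224          168 - SHAKE128
        // Each block of block_size bytes is XORed (absorbed) into the first
        // block_size/8 state lanes before the 24 Keccak rounds are run.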
 4186   address generate_sha3_implCompress(StubGenStubId stub_id) {
 4187     bool multi_block;
 4188     switch (stub_id) {
 4189     case sha3_implCompress_id:
 4190       multi_block = false;
 4191       break;
 4192     case sha3_implCompressMB_id:
 4193       multi_block = true;
 4194       break;
 4195     default:
 4196       ShouldNotReachHere();
 4197     }
 4198 
 4199     static const uint64_t round_consts[24] = {
 4200       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4201       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4202       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4203       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4204       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4205       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4206       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4207       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4208     };
 4209 
 4210     __ align(CodeEntryAlignment);
 4211 
 4212     StubCodeMark mark(this, stub_id);
 4213     address start = __ pc();
 4214 
 4215     Register buf           = c_rarg0;
 4216     Register state         = c_rarg1;
 4217     Register block_size    = c_rarg2;
 4218     Register ofs           = c_rarg3;
 4219     Register limit         = c_rarg4;
 4220 
 4221     Label sha3_loop, rounds24_loop;
 4222     Label sha3_512_or_sha3_384, shake128;
 4223 
 4224     __ stpd(v8, v9, __ pre(sp, -64));
 4225     __ stpd(v10, v11, Address(sp, 16));
 4226     __ stpd(v12, v13, Address(sp, 32));
 4227     __ stpd(v14, v15, Address(sp, 48));
 4228 
 4229     // load state
 4230     __ add(rscratch1, state, 32);
 4231     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4232     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4233     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4234     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4235     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4236     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4237     __ ld1(v24, __ T1D, rscratch1);
 4238 
 4239     __ BIND(sha3_loop);
 4240 
 4241     // 24 keccak rounds
 4242     __ movw(rscratch2, 24);
 4243 
 4244     // load round_constants base
 4245     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4246 
 4247     // load input
 4248     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4249     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4250     __ eor(v0, __ T8B, v0, v25);
 4251     __ eor(v1, __ T8B, v1, v26);
 4252     __ eor(v2, __ T8B, v2, v27);
 4253     __ eor(v3, __ T8B, v3, v28);
 4254     __ eor(v4, __ T8B, v4, v29);
 4255     __ eor(v5, __ T8B, v5, v30);
 4256     __ eor(v6, __ T8B, v6, v31);
 4257 
 4258     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4259     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4260 
 4261     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4262     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4263     __ eor(v7, __ T8B, v7, v25);
 4264     __ eor(v8, __ T8B, v8, v26);
 4265     __ eor(v9, __ T8B, v9, v27);
 4266     __ eor(v10, __ T8B, v10, v28);
 4267     __ eor(v11, __ T8B, v11, v29);
 4268     __ eor(v12, __ T8B, v12, v30);
 4269     __ eor(v13, __ T8B, v13, v31);
 4270 
 4271     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4272     __ eor(v14, __ T8B, v14, v25);
 4273     __ eor(v15, __ T8B, v15, v26);
 4274     __ eor(v16, __ T8B, v16, v27);
 4275 
 4276     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4277     __ andw(c_rarg5, block_size, 48);
 4278     __ cbzw(c_rarg5, rounds24_loop);
 4279 
 4280     __ tbnz(block_size, 5, shake128);
 4281     // block_size == 144, bit5 == 0, SHA3-224
 4282     __ ldrd(v28, __ post(buf, 8));
 4283     __ eor(v17, __ T8B, v17, v28);
 4284     __ b(rounds24_loop);
 4285 
 4286     __ BIND(shake128);
 4287     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4288     __ eor(v17, __ T8B, v17, v28);
 4289     __ eor(v18, __ T8B, v18, v29);
 4290     __ eor(v19, __ T8B, v19, v30);
 4291     __ eor(v20, __ T8B, v20, v31);
 4292     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4293 
 4294     __ BIND(sha3_512_or_sha3_384);
 4295     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4296     __ eor(v7, __ T8B, v7, v25);
 4297     __ eor(v8, __ T8B, v8, v26);
 4298     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4299 
 4300     // SHA3-384
 4301     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4302     __ eor(v9,  __ T8B, v9,  v27);
 4303     __ eor(v10, __ T8B, v10, v28);
 4304     __ eor(v11, __ T8B, v11, v29);
 4305     __ eor(v12, __ T8B, v12, v30);
 4306 
 4307     __ BIND(rounds24_loop);
 4308     __ subw(rscratch2, rscratch2, 1);
 4309 
 4310     keccak_round(rscratch1);
 4311 
 4312     __ cbnzw(rscratch2, rounds24_loop);
 4313 
 4314     if (multi_block) {
 4315       __ add(ofs, ofs, block_size);
 4316       __ cmp(ofs, limit);
 4317       __ br(Assembler::LE, sha3_loop);
 4318       __ mov(c_rarg0, ofs); // return ofs
 4319     }
 4320 
 4321     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4322     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4323     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4324     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4325     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4326     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4327     __ st1(v24, __ T1D, state);
 4328 
 4329     // restore callee-saved registers
 4330     __ ldpd(v14, v15, Address(sp, 48));
 4331     __ ldpd(v12, v13, Address(sp, 32));
 4332     __ ldpd(v10, v11, Address(sp, 16));
 4333     __ ldpd(v8, v9, __ post(sp, 64));
 4334 
 4335     __ ret(lr);
 4336 
 4337     return start;
 4338   }
 4339 
 4340   // Inputs:
 4341   //   c_rarg0   - long[]  state0
 4342   //   c_rarg1   - long[]  state1
 4343   address generate_double_keccak() {
 4344     static const uint64_t round_consts[24] = {
 4345       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4346       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4347       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4348       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4349       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4350       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4351       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4352       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4353     };
 4354 
 4355     // Implements the double_keccak() method of the
 4356     // sun.security.provider.SHA3Parallel class
 4357     __ align(CodeEntryAlignment);
 4358     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4359     address start = __ pc();
 4360     __ enter();
 4361 
 4362     Register state0        = c_rarg0;
 4363     Register state1        = c_rarg1;
 4364 
 4365     Label rounds24_loop;
 4366 
 4367     // save callee-saved registers
 4368     __ stpd(v8, v9, __ pre(sp, -64));
 4369     __ stpd(v10, v11, Address(sp, 16));
 4370     __ stpd(v12, v13, Address(sp, 32));
 4371     __ stpd(v14, v15, Address(sp, 48));
 4372 
 4373     // load states
 4374     __ add(rscratch1, state0, 32);
 4375     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4376     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4377     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4378     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4379     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4380     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4381     __ ld1(v24, __ D, 0, rscratch1);
 4382     __ add(rscratch1, state1, 32);
 4383     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4384     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4385     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4386     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4387     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4388     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4389     __ ld1(v24, __ D, 1, rscratch1);
 4390 
 4391     // 24 keccak rounds
 4392     __ movw(rscratch2, 24);
 4393 
 4394     // load round_constants base
 4395     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4396 
 4397     __ BIND(rounds24_loop);
 4398     __ subw(rscratch2, rscratch2, 1);
 4399     keccak_round(rscratch1);
 4400     __ cbnzw(rscratch2, rounds24_loop);
 4401 
 4402     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4403     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4404     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4405     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4406     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4407     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4408     __ st1(v24, __ D, 0, state0);
 4409     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4410     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4411     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4412     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4413     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4414     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4415     __ st1(v24, __ D, 1, state1);
 4416 
 4417     // restore callee-saved vector registers
 4418     __ ldpd(v14, v15, Address(sp, 48));
 4419     __ ldpd(v12, v13, Address(sp, 32));
 4420     __ ldpd(v10, v11, Address(sp, 16));
 4421     __ ldpd(v8, v9, __ post(sp, 64));
 4422 
 4423     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4424     __ mov(r0, zr); // return 0
 4425     __ ret(lr);
 4426 
 4427     return start;
 4428   }
 4429 
 4430   /**
 4431    *  Arguments:
 4432    *
 4433    * Inputs:
 4434    *   c_rarg0   - int crc
 4435    *   c_rarg1   - byte* buf
 4436    *   c_rarg2   - int length
 4437    *
 4438    * Output:
 4439    *       c_rarg0 - int crc result
 4440    */
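        // For reference, a bit-at-a-time sketch of the CRC-32 being accelerated
        // (IEEE/zlib reflected polynomial 0xEDB88320). This is an illustration
        // only; the stub itself uses the table-driven
        // MacroAssembler::kernel_crc32, and the ~crc pre/post-conditioning
        // shown here is handled inside that kernel.
        //
        //   static uint32_t crc32_ref(uint32_t crc, const uint8_t* buf, int len) {
        //     crc = ~crc;
        //     for (int i = 0; i < len; i++) {
        //       crc ^= buf[i];
        //       for (int k = 0; k < 8; k++)
        //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
        //     }
        //     return ~crc;
        //   }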
 4441   address generate_updateBytesCRC32() {
 4442     assert(UseCRC32Intrinsics, "what are we doing here?");
 4443 
 4444     __ align(CodeEntryAlignment);
 4445     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
 4446     StubCodeMark mark(this, stub_id);
 4447 
 4448     address start = __ pc();
 4449 
 4450     const Register crc   = c_rarg0;  // crc
 4451     const Register buf   = c_rarg1;  // source java byte array address
 4452     const Register len   = c_rarg2;  // length
 4453     const Register table0 = c_rarg3; // crc_table address
 4454     const Register table1 = c_rarg4;
 4455     const Register table2 = c_rarg5;
 4456     const Register table3 = c_rarg6;
 4457     const Register tmp3 = c_rarg7;
 4458 
 4459     BLOCK_COMMENT("Entry:");
 4460     __ enter(); // required for proper stackwalking of RuntimeStub frame
 4461 
 4462     __ kernel_crc32(crc, buf, len,
 4463               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 4464 
 4465     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4466     __ ret(lr);
 4467 
 4468     return start;
 4469   }
 4470 
 4471   // ChaCha20 block function.  This version parallelizes 4 quarter
 4472   // round operations at a time.  It uses 16 SIMD registers to
 4473   // produce 4 blocks of key stream.
 4474   //
 4475   // state (int[16]) = c_rarg0
 4476   // keystream (byte[256]) = c_rarg1
 4477   // return - number of bytes of keystream (always 256)
 4478   //
 4479   // In this approach, we load the 512-bit start state sequentially into
 4480   // 4 128-bit vectors.  We then make 4 4-vector copies of that starting
 4481   // state, with each successive set of 4 vectors having a +1 added into
 4482   // the first 32-bit lane of the 4th vector in that group (the counter).
 4483   // By doing this, we can perform the block function on 4 512-bit blocks
 4484   // within one run of this intrinsic.
 4485   // The alignment of the data across the 4-vector group is such that at
 4486   // the start it is already aligned for the first round of each two-round
 4487   // loop iteration.  In other words, the corresponding lanes of each vector
 4488   // will contain the values needed for that quarter round operation (e.g.
 4489   // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.).
 4490   // In between each full round, a lane shift must occur.  Within a loop
 4491   // iteration, between the first and second rounds, the 2nd, 3rd, and 4th
 4492   // vectors are rotated left 32, 64 and 96 bits, respectively.  The result
 4493   // is effectively a diagonal orientation in columnar form.  After the
 4494   // second full round, those registers are left-rotated again, this time
 4495   // 96, 64, and 32 bits - returning the vectors to their columnar organization.
 4496   // After all 10 iterations, the original state is added to each 4-vector
 4497   // working state along with the add mask, and the 4 vector groups are
 4498   // sequentially written to the memory dedicated for the output key stream.
 4499   //
 4500   // For a more detailed explanation, see Goll and Gueron, "Vectorization of
 4501   // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology:
 4502   // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33
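        //
        // For reference, the scalar quarter round that cc20_quarter_round()
        // applies lane-wise across the 4S vectors (per RFC 7539; rotl32 and
        // chacha_qround are illustrative names, not part of this file):
        //
        //   static inline uint32_t rotl32(uint32_t v, int n) {
        //     return (v << n) | (v >> (32 - n));
        //   }
        //   static void chacha_qround(uint32_t s[16], int a, int b, int c, int d) {
        //     s[a] += s[b]; s[d] ^= s[a]; s[d] = rotl32(s[d], 16);
        //     s[c] += s[d]; s[b] ^= s[c]; s[b] = rotl32(s[b], 12);
        //     s[a] += s[b]; s[d] ^= s[a]; s[d] = rotl32(s[d],  8);
        //     s[c] += s[d]; s[b] ^= s[c]; s[b] = rotl32(s[b],  7);
        //   }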
 4503   address generate_chacha20Block_qrpar() {
 4504     Label L_Q_twoRounds, L_Q_cc20_const;
 4505     // The constant data is broken into two 128-bit segments to be loaded
 4506     // onto SIMD registers.  The first 128 bits are a counter add overlay
 4507     // that adds +1/+0/+0/+0 to the vectors holding replicated state[12].
 4508     // The second 128 bits are a table constant used for 8-bit left
 4509     // rotations on 32-bit lanes within a SIMD register.
 4510     __ BIND(L_Q_cc20_const);
 4511     __ emit_int64(0x0000000000000001UL);
 4512     __ emit_int64(0x0000000000000000UL);
 4513     __ emit_int64(0x0605040702010003UL);
 4514     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4515 
 4516     __ align(CodeEntryAlignment);
 4517     StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
 4518     StubCodeMark mark(this, stub_id);
 4519     address start = __ pc();
 4520     __ enter();
 4521 
 4522     const Register state = c_rarg0;
 4523     const Register keystream = c_rarg1;
 4524     const Register loopCtr = r10;
 4525     const Register tmpAddr = r11;
 4526 
 4527     const FloatRegister aState = v0;
 4528     const FloatRegister bState = v1;
 4529     const FloatRegister cState = v2;
 4530     const FloatRegister dState = v3;
 4531     const FloatRegister a1Vec = v4;
 4532     const FloatRegister b1Vec = v5;
 4533     const FloatRegister c1Vec = v6;
 4534     const FloatRegister d1Vec = v7;
 4535     // Skip the callee-saved registers v8 - v15
 4536     const FloatRegister a2Vec = v16;
 4537     const FloatRegister b2Vec = v17;
 4538     const FloatRegister c2Vec = v18;
 4539     const FloatRegister d2Vec = v19;
 4540     const FloatRegister a3Vec = v20;
 4541     const FloatRegister b3Vec = v21;
 4542     const FloatRegister c3Vec = v22;
 4543     const FloatRegister d3Vec = v23;
 4544     const FloatRegister a4Vec = v24;
 4545     const FloatRegister b4Vec = v25;
 4546     const FloatRegister c4Vec = v26;
 4547     const FloatRegister d4Vec = v27;
 4548     const FloatRegister scratch = v28;
 4549     const FloatRegister addMask = v29;
 4550     const FloatRegister lrot8Tbl = v30;
 4551 
 4552     // Load the initial state in the first 4 quadword registers,
 4553     // then copy the initial state into the next 4 quadword registers
 4554     // that will be used for the working state.
 4555     __ ld1(aState, bState, cState, dState, __ T16B, Address(state));
 4556 
 4557     // Load the index register for 2 constant 128-bit data fields.
 4558     // The first represents the +1/+0/+0/+0 add mask.  The second is
 4559     // the 8-bit left rotation.
 4560     __ adr(tmpAddr, L_Q_cc20_const);
 4561     __ ldpq(addMask, lrot8Tbl, Address(tmpAddr));
 4562 
 4563     __ mov(a1Vec, __ T16B, aState);
 4564     __ mov(b1Vec, __ T16B, bState);
 4565     __ mov(c1Vec, __ T16B, cState);
 4566     __ mov(d1Vec, __ T16B, dState);
 4567 
 4568     __ mov(a2Vec, __ T16B, aState);
 4569     __ mov(b2Vec, __ T16B, bState);
 4570     __ mov(c2Vec, __ T16B, cState);
 4571     __ addv(d2Vec, __ T4S, d1Vec, addMask);
 4572 
 4573     __ mov(a3Vec, __ T16B, aState);
 4574     __ mov(b3Vec, __ T16B, bState);
 4575     __ mov(c3Vec, __ T16B, cState);
 4576     __ addv(d3Vec, __ T4S, d2Vec, addMask);
 4577 
 4578     __ mov(a4Vec, __ T16B, aState);
 4579     __ mov(b4Vec, __ T16B, bState);
 4580     __ mov(c4Vec, __ T16B, cState);
 4581     __ addv(d4Vec, __ T4S, d3Vec, addMask);
 4582 
 4583     // Set up the 10 iteration loop
 4584     __ mov(loopCtr, 10);
 4585     __ BIND(L_Q_twoRounds);
 4586 
 4587     // The first set of operations on the vectors covers the first 4 quarter
 4588     // round operations:
 4589     //  Qround(state, 0, 4, 8,12)
 4590     //  Qround(state, 1, 5, 9,13)
 4591     //  Qround(state, 2, 6,10,14)
 4592     //  Qround(state, 3, 7,11,15)
 4593     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
 4594     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
 4595     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
 4596     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
 4597 
 4598     // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to
 4599     // diagonals. The a1Vec does not need to change orientation.
 4600     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true);
 4601     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true);
 4602     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true);
 4603     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true);
 4604 
 4605     // The second set of operations on the vectors covers the second 4 quarter
 4606     // round operations, now acting on the diagonals:
 4607     //  Qround(state, 0, 5,10,15)
 4608     //  Qround(state, 1, 6,11,12)
 4609     //  Qround(state, 2, 7, 8,13)
 4610     //  Qround(state, 3, 4, 9,14)
 4611     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
 4612     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
 4613     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
 4614     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
 4615 
 4616     // Before we start the next iteration, we need to perform shuffles
 4617     // on the b/c/d vectors to move them back to columnar organizations
 4618     // from their current diagonal orientation.
 4619     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false);
 4620     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false);
 4621     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false);
 4622     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false);
 4623 
 4624     // Decrement and iterate
 4625     __ sub(loopCtr, loopCtr, 1);
 4626     __ cbnz(loopCtr, L_Q_twoRounds);
 4627 
 4628     // Once the counter reaches zero, we fall out of the loop
 4629     // and need to add the initial state back into the working state
 4630     // represented by the a/b/c/d1Vec registers.  This is destructive
 4631     // on the dState register but we no longer will need it.
 4632     __ addv(a1Vec, __ T4S, a1Vec, aState);
 4633     __ addv(b1Vec, __ T4S, b1Vec, bState);
 4634     __ addv(c1Vec, __ T4S, c1Vec, cState);
 4635     __ addv(d1Vec, __ T4S, d1Vec, dState);
 4636 
 4637     __ addv(a2Vec, __ T4S, a2Vec, aState);
 4638     __ addv(b2Vec, __ T4S, b2Vec, bState);
 4639     __ addv(c2Vec, __ T4S, c2Vec, cState);
 4640     __ addv(dState, __ T4S, dState, addMask);
 4641     __ addv(d2Vec, __ T4S, d2Vec, dState);
 4642 
 4643     __ addv(a3Vec, __ T4S, a3Vec, aState);
 4644     __ addv(b3Vec, __ T4S, b3Vec, bState);
 4645     __ addv(c3Vec, __ T4S, c3Vec, cState);
 4646     __ addv(dState, __ T4S, dState, addMask);
 4647     __ addv(d3Vec, __ T4S, d3Vec, dState);
 4648 
 4649     __ addv(a4Vec, __ T4S, a4Vec, aState);
 4650     __ addv(b4Vec, __ T4S, b4Vec, bState);
 4651     __ addv(c4Vec, __ T4S, c4Vec, cState);
 4652     __ addv(dState, __ T4S, dState, addMask);
 4653     __ addv(d4Vec, __ T4S, d4Vec, dState);
 4654 
 4655     // Write the final state back to the result buffer
 4656     __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64));
 4657     __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64));
 4658     __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64));
 4659     __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64));
 4660 
 4661     __ mov(r0, 256);             // Return length of output keystream
 4662     __ leave();
 4663     __ ret(lr);
 4664 
 4665     return start;
 4666   }
 4667 
 4668   void dilithium_load16zetas(int o0, Register zetas) {
 4669     __ ldpq(as_FloatRegister(o0), as_FloatRegister(o0 + 1), __ post (zetas, 32));
 4670     __ ldpq(as_FloatRegister(o0 + 2), as_FloatRegister(o0 + 3), __ post (zetas, 32));
 4672   }
 4673 
 4674   void dilithium_load32zetas(Register zetas) {
 4675     dilithium_load16zetas(16, zetas);
 4676     dilithium_load16zetas(20, zetas);
 4677   }
 4678 
 4679   // 2x16 32-bit Montgomery multiplications in parallel
 4680   // See the montMul() method of the sun.security.provider.ML_DSA class.
 4681   // Here MONT_R_BITS is 32, so the right shift by it is implicit.
 4682   // The constants qInv = MONT_Q_INV_MOD_R and q = MONT_Q are loaded in
 4683   // (all 32-bit chunks of) vector registers v30 and v31, resp.
 4684   // The inputs are b[i]s in v0-v7 and c[i]s in v16-v23, and the results
 4685   // are a[i]s in v16-v23, four 32-bit values in each register;
 4686   // for each i we compute a_i = b_i * c_i * 2^-32 mod MONT_Q.
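        //
        // For reference, the scalar Montgomery step computed per 32-bit lane
        // (a sketch with R = 2^32, assuming MONT_Q_INV_MOD_R == MONT_Q^-1 mod 2^32
        // as loaded into v30; the helper name mont_mul is illustrative):
        //
        //   static int32_t mont_mul(int32_t b, int32_t c) {
        //     int64_t bc = (int64_t) b * c;                               // full product
        //     int32_t m  = (int32_t) ((uint32_t) bc * (uint32_t) MONT_Q_INV_MOD_R);
        //     return (int32_t) ((bc - (int64_t) m * MONT_Q) >> 32);
        //   }
        //
        // The sqdmulh/mulv/shsubv sequence below computes the same value from
        // aHigh = hi32(2*b*c), m = lo32(b*c)*qInv, n = hi32(2*m*q) and a
        // halving subtract (aHigh - n) >> 1.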
 4687   void dilithium_montmul32(bool by_constant) {
 4688     FloatRegister vr0 = by_constant ? v29 : v0;
 4689     FloatRegister vr1 = by_constant ? v29 : v1;
 4690     FloatRegister vr2 = by_constant ? v29 : v2;
 4691     FloatRegister vr3 = by_constant ? v29 : v3;
 4692     FloatRegister vr4 = by_constant ? v29 : v4;
 4693     FloatRegister vr5 = by_constant ? v29 : v5;
 4694     FloatRegister vr6 = by_constant ? v29 : v6;
 4695     FloatRegister vr7 = by_constant ? v29 : v7;
 4696 
 4697     __ sqdmulh(v24, __ T4S, vr0, v16); // aHigh = hi32(2 * b * c)
 4698     __ mulv(v16, __ T4S, vr0, v16);    // aLow = lo32(b * c)
 4699     __ sqdmulh(v25, __ T4S, vr1, v17);
 4700     __ mulv(v17, __ T4S, vr1, v17);
 4701     __ sqdmulh(v26, __ T4S, vr2, v18);
 4702     __ mulv(v18, __ T4S, vr2, v18);
 4703     __ sqdmulh(v27, __ T4S, vr3, v19);
 4704     __ mulv(v19, __ T4S, vr3, v19);
 4705 
 4706     __ mulv(v16, __ T4S, v16, v30);     // m = aLow * qinv
 4707     __ mulv(v17, __ T4S, v17, v30);
 4708     __ mulv(v18, __ T4S, v18, v30);
 4709     __ mulv(v19, __ T4S, v19, v30);
 4710 
 4711     __ sqdmulh(v16, __ T4S, v16, v31);  // n = hi32(2 * m * q)
 4712     __ sqdmulh(v17, __ T4S, v17, v31);
 4713     __ sqdmulh(v18, __ T4S, v18, v31);
 4714     __ sqdmulh(v19, __ T4S, v19, v31);
 4715 
 4716     __ shsubv(v16, __ T4S, v24, v16);   // a = (aHigh - n) / 2
 4717     __ shsubv(v17, __ T4S, v25, v17);
 4718     __ shsubv(v18, __ T4S, v26, v18);
 4719     __ shsubv(v19, __ T4S, v27, v19);
 4720 
 4721     __ sqdmulh(v24, __ T4S, vr4, v20);
 4722     __ mulv(v20, __ T4S, vr4, v20);
 4723     __ sqdmulh(v25, __ T4S, vr5, v21);
 4724     __ mulv(v21, __ T4S, vr5, v21);
 4725     __ sqdmulh(v26, __ T4S, vr6, v22);
 4726     __ mulv(v22, __ T4S, vr6, v22);
 4727     __ sqdmulh(v27, __ T4S, vr7, v23);
 4728     __ mulv(v23, __ T4S, vr7, v23);
 4729 
 4730     __ mulv(v20, __ T4S, v20, v30);
 4731     __ mulv(v21, __ T4S, v21, v30);
 4732     __ mulv(v22, __ T4S, v22, v30);
 4733     __ mulv(v23, __ T4S, v23, v30);
 4734 
 4735     __ sqdmulh(v20, __ T4S, v20, v31);
 4736     __ sqdmulh(v21, __ T4S, v21, v31);
 4737     __ sqdmulh(v22, __ T4S, v22, v31);
 4738     __ sqdmulh(v23, __ T4S, v23, v31);
 4739 
 4740     __ shsubv(v20, __ T4S, v24, v20);
 4741     __ shsubv(v21, __ T4S, v25, v21);
 4742     __ shsubv(v22, __ T4S, v26, v22);
 4743     __ shsubv(v23, __ T4S, v27, v23);
 4744   }
 4745 
 4746   // Do the addition and subtraction done in the NTT algorithm.
 4747   // See sun.security.provider.ML_DSA.implDilithiumAlmostNttJava()
 4748   void dilithium_add_sub32() {
 4749     __ addv(v24, __ T4S, v0, v16); // coeffs[j] = coeffs[j] + tmp;
 4750     __ addv(v25, __ T4S, v1, v17);
 4751     __ addv(v26, __ T4S, v2, v18);
 4752     __ addv(v27, __ T4S, v3, v19);
 4753     __ addv(v28, __ T4S, v4, v20);
 4754     __ addv(v29, __ T4S, v5, v21);
 4755     __ addv(v30, __ T4S, v6, v22);
 4756     __ addv(v31, __ T4S, v7, v23);
 4757 
 4758     __ subv(v0, __ T4S, v0, v16);  // coeffs[j + l] = coeffs[j] - tmp;
 4759     __ subv(v1, __ T4S, v1, v17);
 4760     __ subv(v2, __ T4S, v2, v18);
 4761     __ subv(v3, __ T4S, v3, v19);
 4762     __ subv(v4, __ T4S, v4, v20);
 4763     __ subv(v5, __ T4S, v5, v21);
 4764     __ subv(v6, __ T4S, v6, v22);
 4765     __ subv(v7, __ T4S, v7, v23);
 4766   }
 4767 
 4768   // Do the same computation that
 4769   // dilithium_montmul32() and dilithium_add_sub32() do,
 4770   // except for only 4x4 32-bit vector elements and with
 4771   // different register usage.
 4772   void dilithium_montmul_sub_add16() {
 4773     __ sqdmulh(v24, __ T4S, v1, v16);
 4774     __ mulv(v16, __ T4S, v1, v16);
 4775     __ sqdmulh(v25, __ T4S, v3, v17);
 4776     __ mulv(v17, __ T4S, v3, v17);
 4777     __ sqdmulh(v26, __ T4S, v5, v18);
 4778     __ mulv(v18, __ T4S, v5, v18);
 4779     __ sqdmulh(v27, __ T4S, v7, v19);
 4780     __ mulv(v19, __ T4S, v7, v19);
 4781 
 4782     __ mulv(v16, __ T4S, v16, v30);
 4783     __ mulv(v17, __ T4S, v17, v30);
 4784     __ mulv(v18, __ T4S, v18, v30);
 4785     __ mulv(v19, __ T4S, v19, v30);
 4786 
 4787     __ sqdmulh(v16, __ T4S, v16, v31);
 4788     __ sqdmulh(v17, __ T4S, v17, v31);
 4789     __ sqdmulh(v18, __ T4S, v18, v31);
 4790     __ sqdmulh(v19, __ T4S, v19, v31);
 4791 
 4792     __ shsubv(v16, __ T4S, v24, v16);
 4793     __ shsubv(v17, __ T4S, v25, v17);
 4794     __ shsubv(v18, __ T4S, v26, v18);
 4795     __ shsubv(v19, __ T4S, v27, v19);
 4796 
 4797     __ subv(v1, __ T4S, v0, v16);
 4798     __ subv(v3, __ T4S, v2, v17);
 4799     __ subv(v5, __ T4S, v4, v18);
 4800     __ subv(v7, __ T4S, v6, v19);
 4801 
 4802     __ addv(v0, __ T4S, v0, v16);
 4803     __ addv(v2, __ T4S, v2, v17);
 4804     __ addv(v4, __ T4S, v4, v18);
 4805     __ addv(v6, __ T4S, v6, v19);
 4806   }
 4807 
 4808   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 4809   // in the Java implementation come in sequences of at least 8, so we
 4810   // can use ldpq to collect the corresponding data into pairs of vector
 4811   // registers.
 4812   // We collect the coefficients corresponding to the 'j+l' indexes into
 4813   // the vector registers v0-v7, the zetas into the vector registers v16-v23,
 4814   // then we do the (Montgomery) multiplications by the zetas in parallel
 4815   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 4816   // v0-v7, then do the additions into v24-v31 and the subtractions into
 4817   // v0-v7 and finally save the results back to the coeffs array.
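        //
        // In scalar terms, for each 'j' handled here this is the butterfly
        // from the step comments in dilithium_montmul32() and
        // dilithium_add_sub32() (zeta array name as in the inverse-NTT
        // comment further below):
        //   int tmp = montMul(coeffs[j + l], MONT_ZETAS_FOR_NTT[m]);
        //   coeffs[j + l] = coeffs[j] - tmp;     // ends up in v0-v7
        //   coeffs[j]     = coeffs[j] + tmp;     // ends up in v24-v31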
 4818   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 4819     const Register coeffs, const Register zetas) {
 4820     int c1 = 0;
 4821     int c2 = 512;
 4822     int startIncr;
 4823     int incr1 = 32;
 4824     int incr2 = 64;
 4825     int incr3 = 96;
 4826 
 4827     for (int level = 0; level < 5; level++) {
 4828       int c1Start = c1;
 4829       int c2Start = c2;
 4830       if (level == 3) {
 4831         incr1 = 32;
 4832         incr2 = 128;
 4833         incr3 = 160;
 4834       } else if (level == 4) {
 4835         incr1 = 64;
 4836         incr2 = 128;
 4837         incr3 = 192;
 4838       }
 4839 
 4840       for (int i = 0; i < 4; i++) {
 4841         __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q
 4842         __ ldpq(v0, v1, Address(coeffs, c2Start));
 4843         __ ldpq(v2, v3, Address(coeffs, c2Start + incr1));
 4844         __ ldpq(v4, v5, Address(coeffs, c2Start + incr2));
 4845         __ ldpq(v6, v7, Address(coeffs, c2Start + incr3));
 4846         dilithium_load32zetas(zetas);
 4847         dilithium_montmul32(false);
 4848         __ ldpq(v0, v1, Address(coeffs, c1Start));
 4849         __ ldpq(v2, v3, Address(coeffs, c1Start + incr1));
 4850         __ ldpq(v4, v5, Address(coeffs, c1Start + incr2));
 4851         __ ldpq(v6, v7, Address(coeffs, c1Start + incr3));
 4852         dilithium_add_sub32();
 4853         __ stpq(v24, v25, Address(coeffs, c1Start));
 4854         __ stpq(v26, v27, Address(coeffs, c1Start + incr1));
 4855         __ stpq(v28, v29, Address(coeffs, c1Start + incr2));
 4856         __ stpq(v30, v31, Address(coeffs, c1Start + incr3));
 4857         __ stpq(v0, v1, Address(coeffs, c2Start));
 4858         __ stpq(v2, v3, Address(coeffs, c2Start + incr1));
 4859         __ stpq(v4, v5, Address(coeffs, c2Start + incr2));
 4860         __ stpq(v6, v7, Address(coeffs, c2Start + incr3));
 4861 
 4862         int k = 4 * level + i;
 4863 
 4864         if (k > 7) {
 4865           startIncr = 256;
 4866         } else if (k == 5) {
 4867           startIncr = 384;
 4868         } else {
 4869           startIncr = 128;
 4870         }
 4871 
 4872         c1Start += startIncr;
 4873         c2Start += startIncr;
 4874       }
 4875 
 4876       c2 /= 2;
 4877     }
 4878   }
 4879 
 4880   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 4881   // Implements the method
 4882   // static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
 4883   // of the Java class sun.security.provider.ML_DSA.
 4884   //
 4885   // coeffs (int[256]) = c_rarg0
 4886   // zetas (int[256]) = c_rarg1
 4887   address generate_dilithiumAlmostNtt() {
 4888 
 4889     __ align(CodeEntryAlignment);
 4890     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
 4891     StubCodeMark mark(this, stub_id);
 4892     address start = __ pc();
 4893     __ enter();
 4894 
 4895     const Register coeffs = c_rarg0;
 4896     const Register zetas = c_rarg1;
 4897 
 4898     const Register tmpAddr = r9;
 4899     const Register dilithiumConsts = r10;
 4900     const Register result = r11;
 4901 
 4902     __ add(result, coeffs, 0);
 4903     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 4904 
 4905     // Each level represents one iteration of the outer for loop of the Java version
 4906 
 4907     // level 0-4
 4908     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 4909 
 4910     // level 5
 4911     for (int i = 0; i < 1024; i += 256) {
 4912       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 4913       __ ldr(v0, __ Q, Address(coeffs, i + 16));
 4914       __ ldr(v1, __ Q, Address(coeffs, i + 48));
 4915       __ ldr(v2, __ Q, Address(coeffs, i + 80));
 4916       __ ldr(v3, __ Q, Address(coeffs, i + 112));
 4917       __ ldr(v4, __ Q, Address(coeffs, i + 144));
 4918       __ ldr(v5, __ Q, Address(coeffs, i + 176));
 4919       __ ldr(v6, __ Q, Address(coeffs, i + 208));
 4920       __ ldr(v7, __ Q, Address(coeffs, i + 240));
 4921       dilithium_load32zetas(zetas);
 4922       dilithium_montmul32(false);
 4923       __ ldr(v0, __ Q, Address(coeffs, i));
 4924       __ ldr(v1, __ Q, Address(coeffs, i + 32));
 4925       __ ldr(v2, __ Q, Address(coeffs, i + 64));
 4926       __ ldr(v3, __ Q, Address(coeffs, i + 96));
 4927       __ ldr(v4, __ Q, Address(coeffs, i + 128));
 4928       __ ldr(v5, __ Q, Address(coeffs, i + 160));
 4929       __ ldr(v6, __ Q, Address(coeffs, i + 192));
 4930       __ ldr(v7, __ Q, Address(coeffs, i + 224));
 4931       dilithium_add_sub32();
 4932       __ str(v24, __ Q, Address(coeffs, i));
 4933       __ str(v25, __ Q, Address(coeffs, i + 32));
 4934       __ str(v26, __ Q, Address(coeffs, i + 64));
 4935       __ str(v27, __ Q, Address(coeffs, i + 96));
 4936       __ str(v28, __ Q, Address(coeffs, i + 128));
 4937       __ str(v29, __ Q, Address(coeffs, i + 160));
 4938       __ str(v30, __ Q, Address(coeffs, i + 192));
 4939       __ str(v31, __ Q, Address(coeffs, i + 224));
 4940       __ str(v0, __ Q, Address(coeffs, i + 16));
 4941       __ str(v1, __ Q, Address(coeffs, i + 48));
 4942       __ str(v2, __ Q, Address(coeffs, i + 80));
 4943       __ str(v3, __ Q, Address(coeffs, i + 112));
 4944       __ str(v4, __ Q, Address(coeffs, i + 144));
 4945       __ str(v5, __ Q, Address(coeffs, i + 176));
 4946       __ str(v6, __ Q, Address(coeffs, i + 208));
 4947       __ str(v7, __ Q, Address(coeffs, i + 240));
 4948     }
 4949 
 4950     // level 6
 4951     for (int i = 0; i < 1024; i += 128) {
 4952       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 4953       __ add(tmpAddr, coeffs, i);
 4954       __ ld2(v0, v1, __ T2D, tmpAddr);
 4955       __ add(tmpAddr, coeffs, i + 32);
 4956       __ ld2(v2, v3, __ T2D, tmpAddr);
 4957       __ add(tmpAddr, coeffs, i + 64);
 4958       __ ld2(v4, v5, __ T2D, tmpAddr);
 4959       __ add(tmpAddr, coeffs, i + 96);
 4960       __ ld2(v6, v7, __ T2D, tmpAddr);
 4961       dilithium_load16zetas(16, zetas);
 4962       dilithium_montmul_sub_add16();
 4963       __ add(tmpAddr, coeffs, i);
 4964       __ st2(v0, v1, __ T2D, tmpAddr);
 4965       __ add(tmpAddr, coeffs, i + 32);
 4966       __ st2(v2, v3, __ T2D, tmpAddr);
 4967       __ add(tmpAddr, coeffs, i + 64);
 4968       __ st2(v4, v5, __ T2D, tmpAddr);
 4969       __ add(tmpAddr, coeffs, i + 96);
 4970       __ st2(v6, v7, __ T2D, tmpAddr);
 4971     }
 4972 
 4973     // level 7
 4974     for (int i = 0; i < 1024; i += 128) {
 4975       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 4976       __ add(tmpAddr, coeffs, i);
 4977       __ ld2(v0, v1, __ T4S, tmpAddr);
 4978       __ add(tmpAddr, coeffs, i + 32);
 4979       __ ld2(v2, v3, __ T4S, tmpAddr);
 4980       __ add(tmpAddr, coeffs, i + 64);
 4981       __ ld2(v4, v5, __ T4S, tmpAddr);
 4982       __ add(tmpAddr, coeffs, i + 96);
 4983       __ ld2(v6, v7, __ T4S, tmpAddr);
 4984       dilithium_load16zetas(16, zetas);
 4985       dilithium_montmul_sub_add16();
 4986       __ add(tmpAddr, coeffs, i);
 4987       __ st2(v0, v1, __ T4S, tmpAddr);
 4988       __ add(tmpAddr, coeffs, i + 32);
 4989       __ st2(v2, v3, __ T4S, tmpAddr);
 4990       __ add(tmpAddr, coeffs, i + 64);
 4991       __ st2(v4, v5, __ T4S, tmpAddr);
 4992       __ add(tmpAddr, coeffs, i + 96);
 4993       __ st2(v6, v7, __ T4S, tmpAddr);
 4994     }
 4995     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4996     __ mov(r0, zr); // return 0
 4997     __ ret(lr);
 4998 
 4999     return start;
 5000 
 5001   }
 5002 
 5003   // Do the computations that can be found in the body of the loop in
 5004   // sun.security.provider.ML_DSA.implDilithiumAlmostInverseNttJava()
 5005   // for 16 coefficients in parallel:
 5006   // tmp = coeffs[j];
 5007   // coeffs[j] = (tmp + coeffs[j + l]);
 5008   // coeffs[j + l] = montMul(tmp - coeffs[j + l], -MONT_ZETAS_FOR_NTT[m]);
 5009   // coeffs[j]s are loaded in v0, v2, v4 and v6,
 5010   // coeffs[j + l]s in v1, v3, v5 and v7,
 5011   // the corresponding zetas in v16, v17, v18 and v19.
 5012   void dilithium_sub_add_montmul16() {
 5013     __ subv(v20, __ T4S, v0, v1);
 5014     __ subv(v21, __ T4S, v2, v3);
 5015     __ subv(v22, __ T4S, v4, v5);
 5016     __ subv(v23, __ T4S, v6, v7);
 5017 
 5018     __ addv(v0, __ T4S, v0, v1);
 5019     __ addv(v2, __ T4S, v2, v3);
 5020     __ addv(v4, __ T4S, v4, v5);
 5021     __ addv(v6, __ T4S, v6, v7);
 5022 
 5023     __ sqdmulh(v24, __ T4S, v20, v16); // aHigh = hi32(2 * b * c)
 5024     __ mulv(v1, __ T4S, v20, v16);     // aLow = lo32(b * c)
 5025     __ sqdmulh(v25, __ T4S, v21, v17);
 5026     __ mulv(v3, __ T4S, v21, v17);
 5027     __ sqdmulh(v26, __ T4S, v22, v18);
 5028     __ mulv(v5, __ T4S, v22, v18);
 5029     __ sqdmulh(v27, __ T4S, v23, v19);
 5030     __ mulv(v7, __ T4S, v23, v19);
 5031 
 5032     __ mulv(v1, __ T4S, v1, v30);      // m = aLow * qInv
 5033     __ mulv(v3, __ T4S, v3, v30);
 5034     __ mulv(v5, __ T4S, v5, v30);
 5035     __ mulv(v7, __ T4S, v7, v30);
 5036 
 5037     __ sqdmulh(v1, __ T4S, v1, v31);  // n = hi32(2 * m * q)
 5038     __ sqdmulh(v3, __ T4S, v3, v31);
 5039     __ sqdmulh(v5, __ T4S, v5, v31);
 5040     __ sqdmulh(v7, __ T4S, v7, v31);
 5041 
 5042     __ shsubv(v1, __ T4S, v24, v1);  // a = (aHigh  - n) / 2
 5043     __ shsubv(v3, __ T4S, v25, v3);
 5044     __ shsubv(v5, __ T4S, v26, v5);
 5045     __ shsubv(v7, __ T4S, v27, v7);
 5046   }
 5047 
 5048   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 5049   // in the Java implementation come in sequences of at least 8, so we
 5050   // can use ldpq to collect the corresponding data into pairs of vector
 5051   // registers.
 5052   // We collect the coefficients that correspond to the 'j's into v0-v7 and
 5053   // the coefficients that correspond to the 'j+l's into v16-v23, then
 5054   // do the additions into v24-v31 and the subtractions into v0-v7, then
 5055   // save the results of the additions, load the zetas into v16-v23,
 5056   // do the (Montgomery) multiplications by the zetas in parallel into v16-v23,
 5057   // and finally save the results back to the coeffs array.
 5058   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 5059     const Register coeffs, const Register zetas) {
 5060     int c1 = 0;
 5061     int c2 = 32;
 5062     int startIncr;
 5063     int incr1;
 5064     int incr2;
 5065     int incr3;
 5066 
 5067     for (int level = 3; level < 8; level++) {
 5068       int c1Start = c1;
 5069       int c2Start = c2;
 5070       if (level == 3) {
 5071         incr1 = 64;
 5072         incr2 = 128;
 5073         incr3 = 192;
 5074       } else if (level == 4) {
 5075         incr1 = 32;
 5076         incr2 = 128;
 5077         incr3 = 160;
 5078       } else {
 5079         incr1 = 32;
 5080         incr2 = 64;
 5081         incr3 = 96;
 5082       }
 5083 
 5084       for (int i = 0; i < 4; i++) {
 5085         __ ldpq(v0, v1, Address(coeffs, c1Start));
 5086         __ ldpq(v2, v3, Address(coeffs, c1Start + incr1));
 5087         __ ldpq(v4, v5, Address(coeffs, c1Start + incr2));
 5088         __ ldpq(v6, v7, Address(coeffs, c1Start + incr3));
 5089         __ ldpq(v16, v17, Address(coeffs, c2Start));
 5090         __ ldpq(v18, v19, Address(coeffs, c2Start + incr1));
 5091         __ ldpq(v20, v21, Address(coeffs, c2Start + incr2));
 5092         __ ldpq(v22, v23, Address(coeffs, c2Start + incr3));
 5093         dilithium_add_sub32();
 5094         __ stpq(v24, v25, Address(coeffs, c1Start));
 5095         __ stpq(v26, v27, Address(coeffs, c1Start + incr1));
 5096         __ stpq(v28, v29, Address(coeffs, c1Start + incr2));
 5097         __ stpq(v30, v31, Address(coeffs, c1Start + incr3));
 5098         __ ldpq(v30, v31, Address(dilithiumConsts, 0));   // qInv, q
 5099         dilithium_load32zetas(zetas);
 5100         dilithium_montmul32(false);
 5101         __ stpq(v16, v17, Address(coeffs, c2Start));
 5102         __ stpq(v18, v19, Address(coeffs, c2Start + incr1));
 5103         __ stpq(v20, v21, Address(coeffs, c2Start + incr2));
 5104         __ stpq(v22, v23, Address(coeffs, c2Start + incr3));
 5105 
 5106         int k = 4 * level + i;
 5107 
 5108         if (k < 24) {
 5109           startIncr = 256;
 5110         } else if (k == 25) {
 5111           startIncr = 384;
 5112         } else {
 5113           startIncr = 128;
 5114         }
 5115 
 5116         c1Start += startIncr;
 5117         c2Start += startIncr;
 5118       }
 5119 
 5120       c2 *= 2;
 5121     }
 5122   }
 5123 
 5124   // Dilithium Inverse NTT function except for the final mod Q division by 2^256.
 5125   // Implements the method
 5126   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 5127   // the sun.security.provider.ML_DSA class.
 5128   //
 5129   // coeffs (int[256]) = c_rarg0
 5130   // zetas (int[256]) = c_rarg1
 5131   address generate_dilithiumAlmostInverseNtt() {
 5132 
 5133     __ align(CodeEntryAlignment);
 5134     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id;
 5135     StubCodeMark mark(this, stub_id);
 5136     address start = __ pc();
 5137     __ enter();
 5138 
 5139     const Register coeffs = c_rarg0;
 5140     const Register zetas = c_rarg1;
 5141 
 5142     const Register tmpAddr = r9;
 5143     const Register dilithiumConsts = r10;
 5144     const Register result = r11;
 5145 
 5146     __ add(result, coeffs, 0);
 5147     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5148 
 5149     // Each level represents one iteration of the outer for loop of the Java version
 5150     // level0
 5151     for (int i = 0; i < 1024; i += 128) {
 5152       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 5153       __ add(tmpAddr, coeffs, i);
 5154       __ ld2(v0, v1, __ T4S, tmpAddr);
 5155       __ add(tmpAddr, coeffs, i + 32);
 5156       __ ld2(v2, v3, __ T4S, tmpAddr);
 5157       __ add(tmpAddr, coeffs, i + 64);
 5158       __ ld2(v4, v5, __ T4S, tmpAddr);
 5159       __ add(tmpAddr, coeffs, i + 96);
 5160       __ ld2(v6, v7, __ T4S, tmpAddr);
 5161       dilithium_load16zetas(16, zetas);
 5162       dilithium_sub_add_montmul16();
 5163       __ add(tmpAddr, coeffs, i);
 5164       __ st2(v0, v1, __ T4S, tmpAddr);
 5165       __ add(tmpAddr, coeffs, i + 32);
 5166       __ st2(v2, v3, __ T4S, tmpAddr);
 5167       __ add(tmpAddr, coeffs, i + 64);
 5168       __ st2(v4, v5, __ T4S, tmpAddr);
 5169       __ add(tmpAddr, coeffs, i + 96);
 5170       __ st2(v6, v7, __ T4S, tmpAddr);
 5171     }
 5172 
 5173     // level 1
 5174     for (int i = 0; i < 1024; i += 128) {
 5175       __ add(tmpAddr, coeffs, i);
 5176       __ ld2(v0, v1, __ T2D, tmpAddr);
 5177       __ add(tmpAddr, coeffs, i + 32);
 5178       __ ld2(v2, v3, __ T2D, tmpAddr);
 5179       __ add(tmpAddr, coeffs, i + 64);
 5180       __ ld2(v4, v5, __ T2D, tmpAddr);
 5181       __ add(tmpAddr, coeffs, i + 96);
 5182       __ ld2(v6, v7, __ T2D, tmpAddr);
 5183       dilithium_load16zetas(16, zetas);
 5184       dilithium_sub_add_montmul16();
 5185       __ add(tmpAddr, coeffs, i);
 5186       __ st2(v0, v1, __ T2D, tmpAddr);
 5187       __ add(tmpAddr, coeffs, i + 32);
 5188       __ st2(v2, v3, __ T2D, tmpAddr);
 5189       __ add(tmpAddr, coeffs, i + 64);
 5190       __ st2(v4, v5, __ T2D, tmpAddr);
 5191       __ add(tmpAddr, coeffs, i + 96);
 5192       __ st2(v6, v7, __ T2D, tmpAddr);
 5193     }
 5194 
 5195     //level 2
 5196     for (int i = 0; i < 1024; i += 256) {
 5197       __ ldr(v0, __ Q, Address(coeffs, i));
 5198       __ ldr(v1, __ Q, Address(coeffs, i + 32));
 5199       __ ldr(v2, __ Q, Address(coeffs, i + 64));
 5200       __ ldr(v3, __ Q, Address(coeffs, i + 96));
 5201       __ ldr(v4, __ Q, Address(coeffs, i + 128));
 5202       __ ldr(v5, __ Q, Address(coeffs, i + 160));
 5203       __ ldr(v6, __ Q, Address(coeffs, i + 192));
 5204       __ ldr(v7, __ Q, Address(coeffs, i + 224));
 5205       __ ldr(v16, __ Q, Address(coeffs, i + 16));
 5206       __ ldr(v17, __ Q, Address(coeffs, i + 48));
 5207       __ ldr(v18, __ Q, Address(coeffs, i + 80));
 5208       __ ldr(v19, __ Q, Address(coeffs, i + 112));
 5209       __ ldr(v20, __ Q, Address(coeffs, i + 144));
 5210       __ ldr(v21, __ Q, Address(coeffs, i + 176));
 5211       __ ldr(v22, __ Q, Address(coeffs, i + 208));
 5212       __ ldr(v23, __ Q, Address(coeffs, i + 240));
 5213       dilithium_add_sub32();
 5214       __ str(v24, __ Q, Address(coeffs, i));
 5215       __ str(v25, __ Q, Address(coeffs, i + 32));
 5216       __ str(v26, __ Q, Address(coeffs, i + 64));
 5217       __ str(v27, __ Q, Address(coeffs, i + 96));
 5218       __ str(v28, __ Q, Address(coeffs, i + 128));
 5219       __ str(v29, __ Q, Address(coeffs, i + 160));
 5220       __ str(v30, __ Q, Address(coeffs, i + 192));
 5221       __ str(v31, __ Q, Address(coeffs, i + 224));
 5222       dilithium_load32zetas(zetas);
 5223       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 5224       dilithium_montmul32(false);
 5225       __ str(v16, __ Q, Address(coeffs, i + 16));
 5226       __ str(v17, __ Q, Address(coeffs, i + 48));
 5227       __ str(v18, __ Q, Address(coeffs, i + 80));
 5228       __ str(v19, __ Q, Address(coeffs, i + 112));
 5229       __ str(v20, __ Q, Address(coeffs, i + 144));
 5230       __ str(v21, __ Q, Address(coeffs, i + 176));
 5231       __ str(v22, __ Q, Address(coeffs, i + 208));
 5232       __ str(v23, __ Q, Address(coeffs, i + 240));
 5233     }
 5234 
 5235     // level 3-7
 5236     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 5237 
 5238     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5239     __ mov(r0, zr); // return 0
 5240     __ ret(lr);
 5241 
 5242     return start;
 5243 
 5244   }
 5245 
 5246   // Dilithium multiply polynomials in the NTT domain.
 5247   // Straightforward implementation of the method
 5248   // static int implDilithiumNttMult(
 5249   //              int[] result, int[] ntta, int[] nttb) {} of
 5250   // the sun.security.provider.ML_DSA class.
 5251   //
 5252   // result (int[256]) = c_rarg0
 5253   // poly1 (int[256]) = c_rarg1
 5254   // poly2 (int[256]) = c_rarg2
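        //
        // In scalar terms, for each of the 256 coefficients this computes
        //   result[i] = montMul(montMul(poly1[i], poly2[i]), rSquare)
        // where rSquare is the Montgomery constant loaded into v29 below and
        // montMul is as sketched above dilithium_montmul32().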
 5255   address generate_dilithiumNttMult() {
 5256 
 5257     __ align(CodeEntryAlignment);
 5258     StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id;
 5259     StubCodeMark mark(this, stub_id);
 5260     address start = __ pc();
 5261     __ enter();
 5262 
 5263     Label L_loop;
 5264 
 5265     const Register result = c_rarg0;
 5266     const Register poly1 = c_rarg1;
 5267     const Register poly2 = c_rarg2;
 5268 
 5269     const Register dilithiumConsts = r10;
 5270     const Register len = r11;
 5271 
 5272     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5273 
 5274     __ ldpq(v30, v31, Address(dilithiumConsts, 0));   // qInv, q
 5275     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 5276 
 5277     __ mov(len, zr);
 5278     __ add(len, len, 1024);
 5279 
 5280     __ BIND(L_loop);
 5281 
 5282     __ ldpq(v0, v1, __ post(poly1, 32));
 5283     __ ldpq(v2, v3, __ post(poly1, 32));
 5284     __ ldpq(v4, v5, __ post(poly1, 32));
 5285     __ ldpq(v6, v7, __ post(poly1, 32));
 5286     __ ldpq(v16, v17, __ post(poly2, 32));
 5287     __ ldpq(v18, v19, __ post(poly2, 32));
 5288     __ ldpq(v20, v21, __ post(poly2, 32));
 5289     __ ldpq(v22, v23, __ post(poly2, 32));
 5290     dilithium_montmul32(false);
 5291     dilithium_montmul32(true);
 5292     __ stpq(v16, v17, __ post(result, 32));
 5293     __ stpq(v18, v19, __ post(result, 32));
 5294     __ stpq(v20, v21, __ post(result, 32));
 5295     __ stpq(v22, v23, __ post(result, 32));
 5296 
 5297     __ sub(len, len, 128);
 5298     __ cmp(len, (u1)128);
 5299     __ br(Assembler::GE, L_loop);
 5300 
 5301     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5302     __ mov(r0, zr); // return 0
 5303     __ ret(lr);
 5304 
 5305     return start;
 5306 
 5307   }
 5308 
 5309   // Dilithium Montgomery multiply an array by a constant.
 5310   // A straightforward implementation of the method
 5311   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
 5312   // of the sun.security.provider.ML_DSA class
 5313   //
 5314   // coeffs (int[256]) = c_rarg0
 5315   // constant (int) = c_rarg1
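        //
        // In scalar terms, for each of the 256 coefficients this computes
        //   coeffs[i] = montMul(coeffs[i], constant)
        // with the constant replicated into all four lanes of v29 and
        // montMul as sketched above dilithium_montmul32().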
 5316   address generate_dilithiumMontMulByConstant() {
 5317 
 5318     __ align(CodeEntryAlignment);
 5319     StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id;
 5320     StubCodeMark mark(this, stub_id);
 5321     address start = __ pc();
 5322     __ enter();
 5323 
 5324     Label L_loop;
 5325 
 5326     const Register coeffs = c_rarg0;
 5327     const Register constant = c_rarg1;
 5328 
 5329     const Register dilithiumConsts = r10;
 5330     const Register result = r11;
 5331     const Register len = r12;
 5332 
 5333     __ add(result, coeffs, 0);
 5334     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5335 
 5336     __ ldpq(v30, v31, Address(dilithiumConsts, 0));   // qInv, q
 5337     __ dup(v29, __ T4S, constant);
 5338     __ mov(len, zr);
 5339     __ add(len, len, 1024);
 5340 
 5341     __ BIND(L_loop);
 5342 
 5343     __ ldpq(v16, v17, __ post(coeffs, 32));
 5344     __ ldpq(v18, v19, __ post(coeffs, 32));
 5345     __ ldpq(v20, v21, __ post(coeffs, 32));
 5346     __ ldpq(v22, v23, __ post(coeffs, 32));
 5347     dilithium_montmul32(true);
 5348     __ stpq(v16, v17, __ post(result, 32));
 5349     __ stpq(v18, v19, __ post(result, 32));
 5350     __ stpq(v20, v21, __ post(result, 32));
 5351     __ stpq(v22, v23, __ post(result, 32));
 5352 
 5353     __ sub(len, len, 128);
 5354     __ cmp(len, (u1)128);
 5355     __ br(Assembler::GE, L_loop);
 5356 
 5357     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5358     __ mov(r0, zr); // return 0
 5359     __ ret(lr);
 5360 
 5361     return start;
 5362   }
 5363 
 5364   // Dilithium decompose poly.
 5365   // Implements the method
 5366   // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {}
 5367   // of the sun.security.provider.ML_DSA class
 5368   //
 5369   // input (int[256]) = c_rarg0
 5370   // lowPart (int[256]) = c_rarg1
 5371   // highPart (int[256]) = c_rarg2
 5372   // twoGamma2  (int) = c_rarg3
 5373   // multiplier (int) = c_rarg4
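        //
        // For reference, the per-coefficient computation, assembled from the
        // step comments inside the loop below (a sketch; dilithium_q is the
        // ML-DSA modulus held in v30):
        //   int rplus = input[i];
        //   rplus -= ((rplus + 5373807) >> 23) * dilithium_q;
        //   rplus += (rplus >> 31) & dilithium_q;
        //   int quotient = (rplus * multiplier) >> 22;
        //   int r0 = rplus - quotient * twoGamma2;
        //   int mask = (twoGamma2 - r0) >> 22;
        //   r0 -= mask & twoGamma2;
        //   quotient += mask & 1;
        //   mask = (twoGamma2 / 2 - r0) >> 31;
        //   r0 -= mask & twoGamma2;
        //   quotient += mask & 1;
        //   int r1 = rplus - r0 - (dilithium_q - 1);
        //   r1 = (r1 | -r1) >> 31;  // 0 if rplus - r0 == q - 1, -1 otherwise
        // The remainder of the loop derives lowPart[i] and highPart[i] from
        // r0, r1 and quotient.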
 5374   address generate_dilithiumDecomposePoly() {
 5375 
 5376     __ align(CodeEntryAlignment);
 5377     StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id;
 5378     StubCodeMark mark(this, stub_id);
 5379     address start = __ pc();
 5380     __ enter();
 5381 
 5382     Label L_loop;
 5383 
 5384     const Register input = c_rarg0;
 5385     const Register lowPart = c_rarg1;
 5386     const Register highPart = c_rarg2;
 5387     const Register twoGamma2 = c_rarg3;
 5388     const Register multiplier = c_rarg4;
 5389 
 5390     const Register len = r9;
 5391     const Register dilithiumConsts = r10;
 5392     const Register tmp = r11;
 5393 
 5394     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5395 
 5396     // save callee-saved registers
 5397     __ stpd(v8, v9, __ pre(sp, -64));
 5398     __ stpd(v10, v11, Address(sp, 16));
 5399     __ stpd(v12, v13, Address(sp, 32));
 5400     __ stpd(v14, v15, Address(sp, 48));
 5401
 5403     __ mov(tmp, zr);
 5404     __ add(tmp, tmp, 1);
 5405     __ dup(v25, __ T4S, tmp); // 1
 5406     __ ldr(v30, __ Q, Address(dilithiumConsts, 16)); // q
 5407     __ ldr(v31, __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 5408     __ dup(v28, __ T4S, twoGamma2); // 2 * gamma2
 5409     __ dup(v29, __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 5410     __ subv(v26, __ T4S, v30, v25); // q - 1
 5411     __ sshr(v27, __ T4S, v28, 1); // gamma2
 5412 
 5413     __ mov(len, zr);
 5414     __ add(len, len, 1024);
 5415 
 5416     __ BIND(L_loop);
 5417 
 5418     __ ld4(v0, v1, v2, v3, __ T4S, __ post(input, 64));
 5419 
 5420     // rplus in v0
 5421     //  rplus = rplus - ((rplus + 5373807) >> 23) * dilithium_q;
 5422     __ addv(v4, __ T4S, v0, v31);
 5423     __ addv(v5, __ T4S, v1, v31);
 5424     __ addv(v6, __ T4S, v2, v31);
 5425     __ addv(v7, __ T4S, v3, v31);
 5426 
 5427     __ sshr(v4, __ T4S, v4, 23);
 5428     __ sshr(v5, __ T4S, v5, 23);
 5429     __ sshr(v6, __ T4S, v6, 23);
 5430     __ sshr(v7, __ T4S, v7, 23);
 5431 
 5432     __ mulv(v4, __ T4S, v4, v30);
 5433     __ mulv(v5, __ T4S, v5, v30);
 5434     __ mulv(v6, __ T4S, v6, v30);
 5435     __ mulv(v7, __ T4S, v7, v30);
 5436 
 5437     __ subv(v0, __ T4S, v0, v4);
 5438     __ subv(v1, __ T4S, v1, v5);
 5439     __ subv(v2, __ T4S, v2, v6);
 5440     __ subv(v3, __ T4S, v3, v7);
 5441 
 5442     // rplus in v0
 5443     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 5444     __ sshr(v4, __ T4S, v0, 31);
 5445     __ sshr(v5, __ T4S, v1, 31);
 5446     __ sshr(v6, __ T4S, v2, 31);
 5447     __ sshr(v7, __ T4S, v3, 31);
 5448 
 5449     __ andr(v4, __ T16B, v4, v30);
 5450     __ andr(v5, __ T16B, v5, v30);
 5451     __ andr(v6, __ T16B, v6, v30);
 5452     __ andr(v7, __ T16B, v7, v30);
 5453 
 5454     __ addv(v0, __ T4S, v0, v4);
 5455     __ addv(v1, __ T4S, v1, v5);
 5456     __ addv(v2, __ T4S, v2, v6);
 5457     __ addv(v3, __ T4S, v3, v7);
 5458 
 5459     // rplus in v0
 5460     // int quotient = (rplus * multiplier) >> 22;
 5461     __ mulv(v4, __ T4S, v0, v29);
 5462     __ mulv(v5, __ T4S, v1, v29);
 5463     __ mulv(v6, __ T4S, v2, v29);
 5464     __ mulv(v7, __ T4S, v3, v29);
 5465 
 5466     __ sshr(v4, __ T4S, v4, 22);
 5467     __ sshr(v5, __ T4S, v5, 22);
 5468     __ sshr(v6, __ T4S, v6, 22);
 5469     __ sshr(v7, __ T4S, v7, 22);
 5470 
 5471     // quotient in v4
 5472     // int r0 = rplus - quotient * twoGamma2;
 5473     __ mulv(v8, __ T4S, v4, v28);
 5474     __ mulv(v9, __ T4S, v5, v28);
 5475     __ mulv(v10, __ T4S, v6, v28);
 5476     __ mulv(v11, __ T4S, v7, v28);
 5477 
 5478     __ subv(v8, __ T4S, v0, v8);
 5479     __ subv(v9, __ T4S, v1, v9);
 5480     __ subv(v10, __ T4S, v2, v10);
 5481     __ subv(v11, __ T4S, v3, v11);
 5482 
 5483     // r0 in v8
 5484     // int mask = (twoGamma2 - r0) >> 22;
 5485     __ subv(v12, __ T4S, v28, v8);
 5486     __ subv(v13, __ T4S, v28, v9);
 5487     __ subv(v14, __ T4S, v28, v10);
 5488     __ subv(v15, __ T4S, v28, v11);
 5489 
 5490     __ sshr(v12, __ T4S, v12, 22);
 5491     __ sshr(v13, __ T4S, v13, 22);
 5492     __ sshr(v14, __ T4S, v14, 22);
 5493     __ sshr(v15, __ T4S, v15, 22);
 5494 
 5495     // mask in v12
 5496     // r0 -= (mask & twoGamma2);
 5497     __ andr(v16, __ T16B, v12, v28);
 5498     __ andr(v17, __ T16B, v13, v28);
 5499     __ andr(v18, __ T16B, v14, v28);
 5500     __ andr(v19, __ T16B, v15, v28);
 5501 
 5502     __ subv(v8, __ T4S, v8, v16);
 5503     __ subv(v9, __ T4S, v9, v17);
 5504     __ subv(v10, __ T4S, v10, v18);
 5505     __ subv(v11, __ T4S, v11, v19);
 5506 
 5507     // r0 in v8
 5508     //  quotient += (mask & 1);
 5509     __ andr(v16, __ T16B, v12, v25);
 5510     __ andr(v17, __ T16B, v13, v25);
 5511     __ andr(v18, __ T16B, v14, v25);
 5512     __ andr(v19, __ T16B, v15, v25);
 5513 
 5514     __ addv(v4, __ T4S, v4, v16);
 5515     __ addv(v5, __ T4S, v5, v17);
 5516     __ addv(v6, __ T4S, v6, v18);
 5517     __ addv(v7, __ T4S, v7, v19);
 5518 
 5519     // mask = (twoGamma2 / 2 - r0) >> 31;
 5520     __ subv(v12, __ T4S, v27, v8);
 5521     __ subv(v13, __ T4S, v27, v9);
 5522     __ subv(v14, __ T4S, v27, v10);
 5523     __ subv(v15, __ T4S, v27, v11);
 5524 
 5525     __ sshr(v12, __ T4S, v12, 31);
 5526     __ sshr(v13, __ T4S, v13, 31);
 5527     __ sshr(v14, __ T4S, v14, 31);
 5528     __ sshr(v15, __ T4S, v15, 31);
 5529 
 5530     // r0 -= (mask & twoGamma2);
 5531     __ andr(v16, __ T16B, v12, v28);
 5532     __ andr(v17, __ T16B, v13, v28);
 5533     __ andr(v18, __ T16B, v14, v28);
 5534     __ andr(v19, __ T16B, v15, v28);
 5535 
 5536     __ subv(v8, __ T4S, v8, v16);
 5537     __ subv(v9, __ T4S, v9, v17);
 5538     __ subv(v10, __ T4S, v10, v18);
 5539     __ subv(v11, __ T4S, v11, v19);
 5540 
 5541     // quotient += (mask & 1);
 5542     __ andr(v16, __ T16B, v12, v25);
 5543     __ andr(v17, __ T16B, v13, v25);
 5544     __ andr(v18, __ T16B, v14, v25);
 5545     __ andr(v19, __ T16B, v15, v25);
 5546 
 5547     __ addv(v4, __ T4S, v4, v16);
 5548     __ addv(v5, __ T4S, v5, v17);
 5549     __ addv(v6, __ T4S, v6, v18);
 5550     __ addv(v7, __ T4S, v7, v19);
 5551 
 5552     // int r1 = rplus - r0 - (dilithium_q - 1);
 5553     __ subv(v16, __ T4S, v0, v8);
 5554     __ subv(v17, __ T4S, v1, v9);
 5555     __ subv(v18, __ T4S, v2, v10);
 5556     __ subv(v19, __ T4S, v3, v11);
 5557 
 5558     __ subv(v16, __ T4S, v16, v26);
 5559     __ subv(v17, __ T4S, v17, v26);
 5560     __ subv(v18, __ T4S, v18, v26);
 5561     __ subv(v19, __ T4S, v19, v26);
 5562 
 5563     // r1 in v16
 5564     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 5565     __ negr(v20, __ T4S, v16);
 5566     __ negr(v21, __ T4S, v17);
 5567     __ negr(v22, __ T4S, v18);
 5568     __ negr(v23, __ T4S, v19);
 5569 
 5570     __ orr(v16, __ T16B, v16, v20);
 5571     __ orr(v17, __ T16B, v17, v21);
 5572     __ orr(v18, __ T16B, v18, v22);
 5573     __ orr(v19, __ T16B, v19, v23);
 5574 
 5575     __ sshr(v0, __ T4S, v16, 31);
 5576     __ sshr(v1, __ T4S, v17, 31);
 5577     __ sshr(v2, __ T4S, v18, 31);
 5578     __ sshr(v3, __ T4S, v19, 31);
 5579 
 5580     // r1 in v0
 5581     // r0 += ~r1;
 5582     __ notr(v20, __ T16B, v0);
 5583     __ notr(v21, __ T16B, v1);
 5584     __ notr(v22, __ T16B, v2);
 5585     __ notr(v23, __ T16B, v3);
 5586 
 5587     __ addv(v8, __ T4S, v8, v20);
 5588     __ addv(v9, __ T4S, v9, v21);
 5589     __ addv(v10, __ T4S, v10, v22);
 5590     __ addv(v11, __ T4S, v11, v23);
 5591 
 5592     // r0 in v8
 5593     // r1 = r1 & quotient;
 5594     __ andr(v0, __ T16B, v4, v0);
 5595     __ andr(v1, __ T16B, v5, v1);
 5596     __ andr(v2, __ T16B, v6, v2);
 5597     __ andr(v3, __ T16B, v7, v3);
 5598 
 5599     // r1 in v0
 5600     // lowPart[m] = r0;
 5601     // highPart[m] = r1;
 5602     __ st4(v8, v9, v10, v11, __ T4S, __ post(lowPart, 64));
 5603     __ st4(v0, v1, v2, v3, __ T4S, __ post(highPart, 64));
 5604 
 5605 
 5606     __ sub(len, len, 64);
 5607     __ cmp(len, (u1)64);
 5608     __ br(Assembler::GE, L_loop);
 5609 
 5610     // restore callee-saved vector registers
 5611     __ ldpd(v14, v15, Address(sp, 48));
 5612     __ ldpd(v12, v13, Address(sp, 32));
 5613     __ ldpd(v10, v11, Address(sp, 16));
 5614     __ ldpd(v8, v9, __ post(sp, 64));
 5615 
 5616     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5617     __ mov(r0, zr); // return 0
 5618     __ ret(lr);
 5619 
 5620     return start;
 5621   }
 5622 
 5623   /**
 5624    *  Arguments:
 5625    *
 5626    * Inputs:
 5627    *   c_rarg0   - int crc
 5628    *   c_rarg1   - byte* buf
 5629    *   c_rarg2   - int length
 5630    *   c_rarg3   - int* table
 5631    *
 5632    * Output:
 5633    *       r0   - int crc result
 5634    */
 5635   address generate_updateBytesCRC32C() {
 5636     assert(UseCRC32CIntrinsics, "what are we doing here?");
 5637 
 5638     __ align(CodeEntryAlignment);
 5639     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id;
 5640     StubCodeMark mark(this, stub_id);
 5641 
 5642     address start = __ pc();
 5643 
 5644     const Register crc   = c_rarg0;  // crc
 5645     const Register buf   = c_rarg1;  // source java byte array address
 5646     const Register len   = c_rarg2;  // length
 5647     const Register table0 = c_rarg3; // crc_table address
 5648     const Register table1 = c_rarg4;
 5649     const Register table2 = c_rarg5;
 5650     const Register table3 = c_rarg6;
 5651     const Register tmp3 = c_rarg7;
 5652 
 5653     BLOCK_COMMENT("Entry:");
 5654     __ enter(); // required for proper stackwalking of RuntimeStub frame
 5655 
 5656     __ kernel_crc32c(crc, buf, len,
 5657               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 5658 
 5659     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5660     __ ret(lr);
 5661 
 5662     return start;
 5663   }
 5664 
 5665   /**
 5666    *  Arguments:
 5667    *
 5668    *  Inputs:
 5669    *   c_rarg0   - int   adler
 5670    *   c_rarg1   - byte* buff
 5671    *   c_rarg2   - int   len
 5672    *
 5673    * Output:
 5674    *   c_rarg0   - int adler result
 5675    */
 5676   address generate_updateBytesAdler32() {
 5677     __ align(CodeEntryAlignment);
 5678     StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
 5679     StubCodeMark mark(this, stub_id);
 5680     address start = __ pc();
 5681 
 5682     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 5683 
 5684     // Aliases
 5685     Register adler  = c_rarg0;
 5686     Register s1     = c_rarg0;
 5687     Register s2     = c_rarg3;
 5688     Register buff   = c_rarg1;
 5689     Register len    = c_rarg2;
 5690     Register nmax  = r4;
 5691     Register base  = r5;
 5692     Register count = r6;
 5693     Register temp0 = rscratch1;
 5694     Register temp1 = rscratch2;
 5695     FloatRegister vbytes = v0;
 5696     FloatRegister vs1acc = v1;
 5697     FloatRegister vs2acc = v2;
 5698     FloatRegister vtable = v3;
 5699 
 5700     // Max number of bytes we can process before having to take the mod
 5701     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 5702     uint64_t BASE = 0xfff1;
 5703     uint64_t NMAX = 0x15B0;
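          // BASE == 65521 is the Adler-32 modulus (the largest prime below 2^16).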
 5704 
 5705     __ mov(base, BASE);
 5706     __ mov(nmax, NMAX);
 5707 
 5708     // Load accumulation coefficients for the upper 16 bits
 5709     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 5710     __ ld1(vtable, __ T16B, Address(temp0));
 5711 
 5712     // s1 is initialized to the lower 16 bits of adler
 5713     // s2 is initialized to the upper 16 bits of adler
 5714     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 5715     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 5716 
 5717     // The pipelined loop needs at least 16 elements for one iteration.
 5718     // It checks this itself, but it is more efficient to skip straight to the cleanup loop.
 5719     __ cmp(len, (u1)16);
 5720     __ br(Assembler::HS, L_nmax);
 5721     __ cbz(len, L_combine);
 5722 
 5723     __ bind(L_simple_by1_loop);
 5724     __ ldrb(temp0, Address(__ post(buff, 1)));
 5725     __ add(s1, s1, temp0);
 5726     __ add(s2, s2, s1);
 5727     __ subs(len, len, 1);
 5728     __ br(Assembler::HI, L_simple_by1_loop);
 5729 
 5730     // s1 = s1 % BASE
 5731     __ subs(temp0, s1, base);
 5732     __ csel(s1, temp0, s1, Assembler::HS);
 5733 
 5734     // s2 = s2 % BASE
 5735     __ lsr(temp0, s2, 16);
 5736     __ lsl(temp1, temp0, 4);
 5737     __ sub(temp1, temp1, temp0);
 5738     __ add(s2, temp1, s2, ext::uxth);
 5739 
 5740     __ subs(temp0, s2, base);
 5741     __ csel(s2, temp0, s2, Assembler::HS);
 5742 
 5743     __ b(L_combine);
 5744 
 5745     __ bind(L_nmax);
 5746     __ subs(len, len, nmax);
 5747     __ sub(count, nmax, 16);
 5748     __ br(Assembler::LO, L_by16);
 5749 
 5750     __ bind(L_nmax_loop);
 5751 
 5752     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 5753                                       vbytes, vs1acc, vs2acc, vtable);
 5754 
 5755     __ subs(count, count, 16);
 5756     __ br(Assembler::HS, L_nmax_loop);
 5757 
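          // Reduction modulo BASE is done by folding: since 2^16 == 15 (mod BASE),
          // s == (s >> 16) * 65536 + (s & 0xffff) can be replaced by
          // (s >> 16) * 15 + (s & 0xffff). Two folds bring the accumulators (bounded
          // by the NMAX choice above) below 2 * BASE, so one conditional subtract of
          // BASE finishes the reduction. A scalar sketch of a single fold:
          //   s = (s >> 16) * 15 + (s & 0xffff);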
 5758     // s1 = s1 % BASE
 5759     __ lsr(temp0, s1, 16);
 5760     __ lsl(temp1, temp0, 4);
 5761     __ sub(temp1, temp1, temp0);
 5762     __ add(temp1, temp1, s1, ext::uxth);
 5763 
 5764     __ lsr(temp0, temp1, 16);
 5765     __ lsl(s1, temp0, 4);
 5766     __ sub(s1, s1, temp0);
 5767     __ add(s1, s1, temp1, ext::uxth);
 5768 
 5769     __ subs(temp0, s1, base);
 5770     __ csel(s1, temp0, s1, Assembler::HS);
 5771 
 5772     // s2 = s2 % BASE
 5773     __ lsr(temp0, s2, 16);
 5774     __ lsl(temp1, temp0, 4);
 5775     __ sub(temp1, temp1, temp0);
 5776     __ add(temp1, temp1, s2, ext::uxth);
 5777 
 5778     __ lsr(temp0, temp1, 16);
 5779     __ lsl(s2, temp0, 4);
 5780     __ sub(s2, s2, temp0);
 5781     __ add(s2, s2, temp1, ext::uxth);
 5782 
 5783     __ subs(temp0, s2, base);
 5784     __ csel(s2, temp0, s2, Assembler::HS);
 5785 
 5786     __ subs(len, len, nmax);
 5787     __ sub(count, nmax, 16);
 5788     __ br(Assembler::HS, L_nmax_loop);
 5789 
 5790     __ bind(L_by16);
 5791     __ adds(len, len, count);
 5792     __ br(Assembler::LO, L_by1);
 5793 
 5794     __ bind(L_by16_loop);
 5795 
 5796     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 5797                                       vbytes, vs1acc, vs2acc, vtable);
 5798 
 5799     __ subs(len, len, 16);
 5800     __ br(Assembler::HS, L_by16_loop);
 5801 
 5802     __ bind(L_by1);
 5803     __ adds(len, len, 15);
 5804     __ br(Assembler::LO, L_do_mod);
 5805 
 5806     __ bind(L_by1_loop);
 5807     __ ldrb(temp0, Address(__ post(buff, 1)));
 5808     __ add(s1, temp0, s1);
 5809     __ add(s2, s2, s1);
 5810     __ subs(len, len, 1);
 5811     __ br(Assembler::HS, L_by1_loop);
 5812 
 5813     __ bind(L_do_mod);
 5814     // s1 = s1 % BASE
 5815     __ lsr(temp0, s1, 16);
 5816     __ lsl(temp1, temp0, 4);
 5817     __ sub(temp1, temp1, temp0);
 5818     __ add(temp1, temp1, s1, ext::uxth);
 5819 
 5820     __ lsr(temp0, temp1, 16);
 5821     __ lsl(s1, temp0, 4);
 5822     __ sub(s1, s1, temp0);
 5823     __ add(s1, s1, temp1, ext::uxth);
 5824 
 5825     __ subs(temp0, s1, base);
 5826     __ csel(s1, temp0, s1, Assembler::HS);
 5827 
 5828     // s2 = s2 % BASE
 5829     __ lsr(temp0, s2, 16);
 5830     __ lsl(temp1, temp0, 4);
 5831     __ sub(temp1, temp1, temp0);
 5832     __ add(temp1, temp1, s2, ext::uxth);
 5833 
 5834     __ lsr(temp0, temp1, 16);
 5835     __ lsl(s2, temp0, 4);
 5836     __ sub(s2, s2, temp0);
 5837     __ add(s2, s2, temp1, ext::uxth);
 5838 
 5839     __ subs(temp0, s2, base);
 5840     __ csel(s2, temp0, s2, Assembler::HS);
 5841 
 5842     // Combine lower bits and higher bits
 5843     __ bind(L_combine);
 5844     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 5845 
 5846     __ ret(lr);
 5847 
 5848     return start;
 5849   }
 5850 
 5851   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 5852           Register temp0, Register temp1, FloatRegister vbytes,
 5853           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 5854     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 5855     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 5856     // In non-vectorized code, we update s1 and s2 as:
 5857     //   s1 <- s1 + b1
 5858     //   s2 <- s2 + s1
 5859     //   s1 <- s1 + b2
 5860     //   s2 <- s2 + s1
 5861     //   ...
 5862     //   s1 <- s1 + b16
 5863     //   s2 <- s2 + s1
 5864     // Putting above assignments together, we have:
 5865     //   s1_new = s1 + b1 + b2 + ... + b16
 5866     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 5867     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 5868     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 5869     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 5870 
 5871     // s2 = s2 + s1 * 16
 5872     __ add(s2, s2, s1, Assembler::LSL, 4);
 5873 
 5874     // vs1acc = b1 + b2 + b3 + ... + b16
 5875     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 5876     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 5877     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 5878     __ uaddlv(vs1acc, __ T16B, vbytes);
 5879     __ uaddlv(vs2acc, __ T8H, vs2acc);
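          // Roughly: umullv forms the 8 halfword products for the low 8 bytes, umlalv
          // (operating on the upper half of the 16B arrangement) accumulates the 8
          // products for the high bytes lane-wise, and the two uaddlv instructions sum
          // across lanes, leaving b1 + ... + b16 in vs1acc and the weighted dot
          // product in vs2acc.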
 5880 
 5881     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 5882     __ fmovd(temp0, vs1acc);
 5883     __ fmovd(temp1, vs2acc);
 5884     __ add(s1, s1, temp0);
 5885     __ add(s2, s2, temp1);
 5886   }
 5887 
 5888   /**
 5889    *  Arguments:
 5890    *
 5891    *  Input:
 5892    *    c_rarg0   - x address
 5893    *    c_rarg1   - x length
 5894    *    c_rarg2   - y address
 5895    *    c_rarg3   - y length
 5896    *    c_rarg4   - z address
 5897    */
 5898   address generate_multiplyToLen() {
 5899     __ align(CodeEntryAlignment);
 5900     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
 5901     StubCodeMark mark(this, stub_id);
 5902 
 5903     address start = __ pc();
 5904     const Register x     = r0;
 5905     const Register xlen  = r1;
 5906     const Register y     = r2;
 5907     const Register ylen  = r3;
 5908     const Register z     = r4;
 5909 
 5910     const Register tmp0  = r5;
 5911     const Register tmp1  = r10;
 5912     const Register tmp2  = r11;
 5913     const Register tmp3  = r12;
 5914     const Register tmp4  = r13;
 5915     const Register tmp5  = r14;
 5916     const Register tmp6  = r15;
 5917     const Register tmp7  = r16;
 5918 
 5919     BLOCK_COMMENT("Entry:");
 5920     __ enter(); // required for proper stackwalking of RuntimeStub frame
 5921     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 5922     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5923     __ ret(lr);
 5924 
 5925     return start;
 5926   }
 5927 
 5928   address generate_squareToLen() {
 5929     // The squareToLen algorithm for sizes 1..127 described in the Java code is
 5930     // faster than multiply_to_len on some CPUs and slower on others, but
 5931     // multiply_to_len shows slightly better results overall.
 5932     __ align(CodeEntryAlignment);
 5933     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
 5934     StubCodeMark mark(this, stub_id);
 5935     address start = __ pc();
 5936 
 5937     const Register x     = r0;
 5938     const Register xlen  = r1;
 5939     const Register z     = r2;
 5940     const Register y     = r4; // == x
 5941     const Register ylen  = r5; // == xlen
 5942 
 5943     const Register tmp0  = r3;
 5944     const Register tmp1  = r10;
 5945     const Register tmp2  = r11;
 5946     const Register tmp3  = r12;
 5947     const Register tmp4  = r13;
 5948     const Register tmp5  = r14;
 5949     const Register tmp6  = r15;
 5950     const Register tmp7  = r16;
 5951 
 5952     RegSet spilled_regs = RegSet::of(y, ylen);
 5953     BLOCK_COMMENT("Entry:");
 5954     __ enter();
 5955     __ push(spilled_regs, sp);
 5956     __ mov(y, x);
 5957     __ mov(ylen, xlen);
 5958     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 5959     __ pop(spilled_regs, sp);
 5960     __ leave();
 5961     __ ret(lr);
 5962     return start;
 5963   }
 5964 
 5965   address generate_mulAdd() {
 5966     __ align(CodeEntryAlignment);
 5967     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
 5968     StubCodeMark mark(this, stub_id);
 5969 
 5970     address start = __ pc();
 5971 
 5972     const Register out     = r0;
 5973     const Register in      = r1;
 5974     const Register offset  = r2;
 5975     const Register len     = r3;
 5976     const Register k       = r4;
 5977 
 5978     BLOCK_COMMENT("Entry:");
 5979     __ enter();
 5980     __ mul_add(out, in, offset, len, k);
 5981     __ leave();
 5982     __ ret(lr);
 5983 
 5984     return start;
 5985   }
 5986 
 5987   // Arguments:
 5988   //
 5989   // Input:
 5990   //   c_rarg0   - newArr address
 5991   //   c_rarg1   - oldArr address
 5992   //   c_rarg2   - newIdx
 5993   //   c_rarg3   - shiftCount
 5994   //   c_rarg4   - numIter
 5995   //
 5996   address generate_bigIntegerRightShift() {
 5997     __ align(CodeEntryAlignment);
 5998     StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
 5999     StubCodeMark mark(this, stub_id);
 6000     address start = __ pc();
 6001 
 6002     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 6003 
 6004     Register newArr        = c_rarg0;
 6005     Register oldArr        = c_rarg1;
 6006     Register newIdx        = c_rarg2;
 6007     Register shiftCount    = c_rarg3;
 6008     Register numIter       = c_rarg4;
 6009     Register idx           = numIter;
 6010 
 6011     Register newArrCur     = rscratch1;
 6012     Register shiftRevCount = rscratch2;
 6013     Register oldArrCur     = r13;
 6014     Register oldArrNext    = r14;
 6015 
 6016     FloatRegister oldElem0        = v0;
 6017     FloatRegister oldElem1        = v1;
 6018     FloatRegister newElem         = v2;
 6019     FloatRegister shiftVCount     = v3;
 6020     FloatRegister shiftVRevCount  = v4;
 6021 
 6022     __ cbz(idx, Exit);
 6023 
 6024     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 6025 
 6026     // left shift count
 6027     __ movw(shiftRevCount, 32);
 6028     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 6029 
 6030     // numIter too small to allow a 4-word SIMD loop; fall back to the scalar tail
 6031     __ cmp(numIter, (u1)4);
 6032     __ br(Assembler::LT, ShiftThree);
 6033 
 6034     __ dup(shiftVCount,    __ T4S, shiftCount);
 6035     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 6036     __ negr(shiftVCount,   __ T4S, shiftVCount);
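          // Each destination word combines two adjacent source words, as in the scalar
          // tail below:
          //   newArr[i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
          // In the SIMD loop, ushl with the negated per-lane count performs the logical
          // right shift (ushl shifts right for negative shift amounts).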
 6037 
 6038     __ BIND(ShiftSIMDLoop);
 6039 
 6040     // Calculate the load addresses
 6041     __ sub(idx, idx, 4);
 6042     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 6043     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 6044     __ add(oldArrCur,  oldArrNext, 4);
 6045 
 6046     // Load 4 words and process
 6047     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 6048     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 6049     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 6050     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 6051     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 6052     __ st1(newElem,   __ T4S,  Address(newArrCur));
 6053 
 6054     __ cmp(idx, (u1)4);
 6055     __ br(Assembler::LT, ShiftTwoLoop);
 6056     __ b(ShiftSIMDLoop);
 6057 
 6058     __ BIND(ShiftTwoLoop);
 6059     __ cbz(idx, Exit);
 6060     __ cmp(idx, (u1)1);
 6061     __ br(Assembler::EQ, ShiftOne);
 6062 
 6063     // Calculate the load addresses
 6064     __ sub(idx, idx, 2);
 6065     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 6066     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 6067     __ add(oldArrCur,  oldArrNext, 4);
 6068 
 6069     // Load 2 words and process
 6070     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 6071     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 6072     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 6073     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 6074     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 6075     __ st1(newElem,   __ T2S, Address(newArrCur));
 6076     __ b(ShiftTwoLoop);
 6077 
 6078     __ BIND(ShiftThree);
 6079     __ tbz(idx, 1, ShiftOne);
 6080     __ tbz(idx, 0, ShiftTwo);
 6081     __ ldrw(r10,  Address(oldArr, 12));
 6082     __ ldrw(r11,  Address(oldArr, 8));
 6083     __ lsrvw(r10, r10, shiftCount);
 6084     __ lslvw(r11, r11, shiftRevCount);
 6085     __ orrw(r12,  r10, r11);
 6086     __ strw(r12,  Address(newArr, 8));
 6087 
 6088     __ BIND(ShiftTwo);
 6089     __ ldrw(r10,  Address(oldArr, 8));
 6090     __ ldrw(r11,  Address(oldArr, 4));
 6091     __ lsrvw(r10, r10, shiftCount);
 6092     __ lslvw(r11, r11, shiftRevCount);
 6093     __ orrw(r12,  r10, r11);
 6094     __ strw(r12,  Address(newArr, 4));
 6095 
 6096     __ BIND(ShiftOne);
 6097     __ ldrw(r10,  Address(oldArr, 4));
 6098     __ ldrw(r11,  Address(oldArr));
 6099     __ lsrvw(r10, r10, shiftCount);
 6100     __ lslvw(r11, r11, shiftRevCount);
 6101     __ orrw(r12,  r10, r11);
 6102     __ strw(r12,  Address(newArr));
 6103 
 6104     __ BIND(Exit);
 6105     __ ret(lr);
 6106 
 6107     return start;
 6108   }
 6109 
 6110   // Arguments:
 6111   //
 6112   // Input:
 6113   //   c_rarg0   - newArr address
 6114   //   c_rarg1   - oldArr address
 6115   //   c_rarg2   - newIdx
 6116   //   c_rarg3   - shiftCount
 6117   //   c_rarg4   - numIter
 6118   //
 6119   address generate_bigIntegerLeftShift() {
 6120     __ align(CodeEntryAlignment);
 6121     StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
 6122     StubCodeMark mark(this, stub_id);
 6123     address start = __ pc();
 6124 
 6125     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 6126 
 6127     Register newArr        = c_rarg0;
 6128     Register oldArr        = c_rarg1;
 6129     Register newIdx        = c_rarg2;
 6130     Register shiftCount    = c_rarg3;
 6131     Register numIter       = c_rarg4;
 6132 
 6133     Register shiftRevCount = rscratch1;
 6134     Register oldArrNext    = rscratch2;
 6135 
 6136     FloatRegister oldElem0        = v0;
 6137     FloatRegister oldElem1        = v1;
 6138     FloatRegister newElem         = v2;
 6139     FloatRegister shiftVCount     = v3;
 6140     FloatRegister shiftVRevCount  = v4;
 6141 
 6142     __ cbz(numIter, Exit);
 6143 
 6144     __ add(oldArrNext, oldArr, 4);
 6145     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 6146 
 6147     // right shift count
 6148     __ movw(shiftRevCount, 32);
 6149     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 6150 
 6151     // numIter too small to allow a 4-word SIMD loop; fall back to the scalar tail
 6152     __ cmp(numIter, (u1)4);
 6153     __ br(Assembler::LT, ShiftThree);
 6154 
 6155     __ dup(shiftVCount,     __ T4S, shiftCount);
 6156     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 6157     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
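          // Each destination word combines two adjacent source words, as in the scalar
          // tail below:
          //   newArr[i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
          // In the SIMD loop, ushl with the negated reverse count performs the logical
          // right shift of the second operand.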
 6158 
 6159     __ BIND(ShiftSIMDLoop);
 6160 
 6161     // load 4 words and process
 6162     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 6163     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 6164     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 6165     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 6166     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 6167     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 6168     __ sub(numIter,   numIter, 4);
 6169 
 6170     __ cmp(numIter, (u1)4);
 6171     __ br(Assembler::LT, ShiftTwoLoop);
 6172     __ b(ShiftSIMDLoop);
 6173 
 6174     __ BIND(ShiftTwoLoop);
 6175     __ cbz(numIter, Exit);
 6176     __ cmp(numIter, (u1)1);
 6177     __ br(Assembler::EQ, ShiftOne);
 6178 
 6179     // load 2 words and process
 6180     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 6181     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 6182     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 6183     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 6184     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 6185     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 6186     __ sub(numIter,   numIter, 2);
 6187     __ b(ShiftTwoLoop);
 6188 
 6189     __ BIND(ShiftThree);
 6190     __ ldrw(r10,  __ post(oldArr, 4));
 6191     __ ldrw(r11,  __ post(oldArrNext, 4));
 6192     __ lslvw(r10, r10, shiftCount);
 6193     __ lsrvw(r11, r11, shiftRevCount);
 6194     __ orrw(r12,  r10, r11);
 6195     __ strw(r12,  __ post(newArr, 4));
 6196     __ tbz(numIter, 1, Exit);
 6197     __ tbz(numIter, 0, ShiftOne);
 6198 
 6199     __ BIND(ShiftTwo);
 6200     __ ldrw(r10,  __ post(oldArr, 4));
 6201     __ ldrw(r11,  __ post(oldArrNext, 4));
 6202     __ lslvw(r10, r10, shiftCount);
 6203     __ lsrvw(r11, r11, shiftRevCount);
 6204     __ orrw(r12,  r10, r11);
 6205     __ strw(r12,  __ post(newArr, 4));
 6206 
 6207     __ BIND(ShiftOne);
 6208     __ ldrw(r10,  Address(oldArr));
 6209     __ ldrw(r11,  Address(oldArrNext));
 6210     __ lslvw(r10, r10, shiftCount);
 6211     __ lsrvw(r11, r11, shiftRevCount);
 6212     __ orrw(r12,  r10, r11);
 6213     __ strw(r12,  Address(newArr));
 6214 
 6215     __ BIND(Exit);
 6216     __ ret(lr);
 6217 
 6218     return start;
 6219   }
 6220 
 6221   address generate_count_positives(address &count_positives_long) {
 6222     const u1 large_loop_size = 64;
 6223     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
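          // UPPER_BIT_MASK has the sign bit of every byte set: OR-ing loaded words
          // together and testing against this mask detects whether any byte in the
          // block is negative.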
 6224     int dcache_line = VM_Version::dcache_line_size();
 6225 
 6226     Register ary1 = r1, len = r2, result = r0;
 6227 
 6228     __ align(CodeEntryAlignment);
 6229 
 6230     StubGenStubId stub_id = StubGenStubId::count_positives_id;
 6231     StubCodeMark mark(this, stub_id);
 6232 
 6233     address entry = __ pc();
 6234 
 6235     __ enter();
 6236     // precondition: a copy of len is already in result
 6237     // __ mov(result, len);
 6238 
 6239   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 6240         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 6241 
 6242   __ cmp(len, (u1)15);
 6243   __ br(Assembler::GT, LEN_OVER_15);
 6244   // The only case when execution falls into this code is when the pointer is near
 6245   // the end of a memory page and we have to avoid reading the next page.
 6246   __ add(ary1, ary1, len);
 6247   __ subs(len, len, 8);
 6248   __ br(Assembler::GT, LEN_OVER_8);
 6249   __ ldr(rscratch2, Address(ary1, -8));
 6250   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 6251   __ lsrv(rscratch2, rscratch2, rscratch1);
 6252   __ tst(rscratch2, UPPER_BIT_MASK);
 6253   __ csel(result, zr, result, Assembler::NE);
 6254   __ leave();
 6255   __ ret(lr);
 6256   __ bind(LEN_OVER_8);
 6257   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 6258   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
 6259   __ tst(rscratch2, UPPER_BIT_MASK);
 6260   __ br(Assembler::NE, RET_NO_POP);
 6261   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 6262   __ lsrv(rscratch1, rscratch1, rscratch2);
 6263   __ tst(rscratch1, UPPER_BIT_MASK);
 6264   __ bind(RET_NO_POP);
 6265   __ csel(result, zr, result, Assembler::NE);
 6266   __ leave();
 6267   __ ret(lr);
 6268 
 6269   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 6270   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 6271 
 6272   count_positives_long = __ pc(); // 2nd entry point
 6273 
 6274   __ enter();
 6275 
 6276   __ bind(LEN_OVER_15);
 6277     __ push(spilled_regs, sp);
 6278     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 6279     __ cbz(rscratch2, ALIGNED);
 6280     __ ldp(tmp6, tmp1, Address(ary1));
 6281     __ mov(tmp5, 16);
 6282     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 6283     __ add(ary1, ary1, rscratch1);
 6284     __ orr(tmp6, tmp6, tmp1);
 6285     __ tst(tmp6, UPPER_BIT_MASK);
 6286     __ br(Assembler::NE, RET_ADJUST);
 6287     __ sub(len, len, rscratch1);
 6288 
 6289   __ bind(ALIGNED);
 6290     __ cmp(len, large_loop_size);
 6291     __ br(Assembler::LT, CHECK_16);
 6292     // Perform a 16-byte load as an early return in the pre-loop to handle the case
 6293     // where an initially aligned large array has negative values in its starting
 6294     // bytes, so LARGE_LOOP would do 4 reads instead of 1 in the worst case, which is
 6295     // slower. Cases with negative bytes further ahead won't be affected much.
 6296     // In fact, they'll be faster due to early loads, fewer instructions and
 6297     // fewer branches in LARGE_LOOP.
 6298     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 6299     __ sub(len, len, 16);
 6300     __ orr(tmp6, tmp6, tmp1);
 6301     __ tst(tmp6, UPPER_BIT_MASK);
 6302     __ br(Assembler::NE, RET_ADJUST_16);
 6303     __ cmp(len, large_loop_size);
 6304     __ br(Assembler::LT, CHECK_16);
 6305 
 6306     if (SoftwarePrefetchHintDistance >= 0
 6307         && SoftwarePrefetchHintDistance >= dcache_line) {
 6308       // initial prefetch
 6309       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 6310     }
 6311   __ bind(LARGE_LOOP);
 6312     if (SoftwarePrefetchHintDistance >= 0) {
 6313       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 6314     }
 6315     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
 6316     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
 6317     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
 6318     // instructions per iteration and has fewer branches, but this approach disables
 6319     // early return, so all 64 bytes are loaded and checked every time.
 6320     __ ldp(tmp2, tmp3, Address(ary1));
 6321     __ ldp(tmp4, tmp5, Address(ary1, 16));
 6322     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 6323     __ ldp(tmp6, tmp1, Address(ary1, 48));
 6324     __ add(ary1, ary1, large_loop_size);
 6325     __ sub(len, len, large_loop_size);
 6326     __ orr(tmp2, tmp2, tmp3);
 6327     __ orr(tmp4, tmp4, tmp5);
 6328     __ orr(rscratch1, rscratch1, rscratch2);
 6329     __ orr(tmp6, tmp6, tmp1);
 6330     __ orr(tmp2, tmp2, tmp4);
 6331     __ orr(rscratch1, rscratch1, tmp6);
 6332     __ orr(tmp2, tmp2, rscratch1);
 6333     __ tst(tmp2, UPPER_BIT_MASK);
 6334     __ br(Assembler::NE, RET_ADJUST_LONG);
 6335     __ cmp(len, large_loop_size);
 6336     __ br(Assembler::GE, LARGE_LOOP);
 6337 
 6338   __ bind(CHECK_16); // small 16-byte load pre-loop
 6339     __ cmp(len, (u1)16);
 6340     __ br(Assembler::LT, POST_LOOP16);
 6341 
 6342   __ bind(LOOP16); // small 16-byte load loop
 6343     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 6344     __ sub(len, len, 16);
 6345     __ orr(tmp2, tmp2, tmp3);
 6346     __ tst(tmp2, UPPER_BIT_MASK);
 6347     __ br(Assembler::NE, RET_ADJUST_16);
 6348     __ cmp(len, (u1)16);
 6349     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 6350 
 6351   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 6352     __ cmp(len, (u1)8);
 6353     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 6354     __ ldr(tmp3, Address(__ post(ary1, 8)));
 6355     __ tst(tmp3, UPPER_BIT_MASK);
 6356     __ br(Assembler::NE, RET_ADJUST);
 6357     __ sub(len, len, 8);
 6358 
 6359   __ bind(POST_LOOP16_LOAD_TAIL);
 6360     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 6361     __ ldr(tmp1, Address(ary1));
 6362     __ mov(tmp2, 64);
 6363     __ sub(tmp4, tmp2, len, __ LSL, 3);
 6364     __ lslv(tmp1, tmp1, tmp4);
 6365     __ tst(tmp1, UPPER_BIT_MASK);
 6366     __ br(Assembler::NE, RET_ADJUST);
 6367     // Fallthrough
 6368 
 6369   __ bind(RET_LEN);
 6370     __ pop(spilled_regs, sp);
 6371     __ leave();
 6372     __ ret(lr);
 6373 
 6374     // The difference result - len is the count of bytes guaranteed
 6375     // to be positive.
 6376 
 6377   __ bind(RET_ADJUST_LONG);
 6378     __ add(len, len, (u1)(large_loop_size - 16));
 6379   __ bind(RET_ADJUST_16);
 6380     __ add(len, len, 16);
 6381   __ bind(RET_ADJUST);
 6382     __ pop(spilled_regs, sp);
 6383     __ leave();
 6384     __ sub(result, result, len);
 6385     __ ret(lr);
 6386 
 6387     return entry;
 6388   }
 6389 
 6390   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 6391         bool usePrefetch, Label &NOT_EQUAL) {
 6392     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6393         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 6394         tmp7 = r12, tmp8 = r13;
 6395     Label LOOP;
 6396 
 6397     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6398     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6399     __ bind(LOOP);
 6400     if (usePrefetch) {
 6401       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 6402       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 6403     }
 6404     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 6405     __ eor(tmp1, tmp1, tmp2);
 6406     __ eor(tmp3, tmp3, tmp4);
 6407     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 6408     __ orr(tmp1, tmp1, tmp3);
 6409     __ cbnz(tmp1, NOT_EQUAL);
 6410     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6411     __ eor(tmp5, tmp5, tmp6);
 6412     __ eor(tmp7, tmp7, tmp8);
 6413     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6414     __ orr(tmp5, tmp5, tmp7);
 6415     __ cbnz(tmp5, NOT_EQUAL);
 6416     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 6417     __ eor(tmp1, tmp1, tmp2);
 6418     __ eor(tmp3, tmp3, tmp4);
 6419     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 6420     __ orr(tmp1, tmp1, tmp3);
 6421     __ cbnz(tmp1, NOT_EQUAL);
 6422     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6423     __ eor(tmp5, tmp5, tmp6);
 6424     __ sub(cnt1, cnt1, 8 * wordSize);
 6425     __ eor(tmp7, tmp7, tmp8);
 6426     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6427     // tmp6 is not used. MacroAssembler::subs is used here (rather than
 6428     // cmp) because subs allows an unlimited range of immediate operands.
 6429     __ subs(tmp6, cnt1, loopThreshold);
 6430     __ orr(tmp5, tmp5, tmp7);
 6431     __ cbnz(tmp5, NOT_EQUAL);
 6432     __ br(__ GE, LOOP);
 6433     // post-loop
 6434     __ eor(tmp1, tmp1, tmp2);
 6435     __ eor(tmp3, tmp3, tmp4);
 6436     __ orr(tmp1, tmp1, tmp3);
 6437     __ sub(cnt1, cnt1, 2 * wordSize);
 6438     __ cbnz(tmp1, NOT_EQUAL);
 6439   }
 6440 
 6441   void generate_large_array_equals_loop_simd(int loopThreshold,
 6442         bool usePrefetch, Label &NOT_EQUAL) {
 6443     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6444         tmp2 = rscratch2;
 6445     Label LOOP;
 6446 
 6447     __ bind(LOOP);
 6448     if (usePrefetch) {
 6449       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 6450       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 6451     }
 6452     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 6453     __ sub(cnt1, cnt1, 8 * wordSize);
 6454     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 6455     __ subs(tmp1, cnt1, loopThreshold);
 6456     __ eor(v0, __ T16B, v0, v4);
 6457     __ eor(v1, __ T16B, v1, v5);
 6458     __ eor(v2, __ T16B, v2, v6);
 6459     __ eor(v3, __ T16B, v3, v7);
 6460     __ orr(v0, __ T16B, v0, v1);
 6461     __ orr(v1, __ T16B, v2, v3);
 6462     __ orr(v0, __ T16B, v0, v1);
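          // v0 now holds the OR of the lane-wise differences of this 64-byte block;
          // the following umov/orr/cbnz collapse it to a scalar that is nonzero iff
          // the blocks differ.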
 6463     __ umov(tmp1, v0, __ D, 0);
 6464     __ umov(tmp2, v0, __ D, 1);
 6465     __ orr(tmp1, tmp1, tmp2);
 6466     __ cbnz(tmp1, NOT_EQUAL);
 6467     __ br(__ GE, LOOP);
 6468   }
 6469 
 6470   // a1 = r1 - array1 address
 6471   // a2 = r2 - array2 address
 6472   // result = r0 - return value. Already contains "false"
 6473   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 6474   // r3-r5 are reserved temporary registers
 6475   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
 6476   address generate_large_array_equals() {
 6477     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6478         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 6479         tmp7 = r12, tmp8 = r13;
 6480     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 6481         SMALL_LOOP, POST_LOOP;
 6482     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 6483     // calculate if at least 32 prefetched bytes are used
 6484     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 6485     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 6486     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 6487     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 6488         tmp5, tmp6, tmp7, tmp8);
 6489 
 6490     __ align(CodeEntryAlignment);
 6491 
 6492     StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
 6493     StubCodeMark mark(this, stub_id);
 6494 
 6495     address entry = __ pc();
 6496     __ enter();
 6497     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 6498     // also advance pointers to use post-increment instead of pre-increment
 6499     __ add(a1, a1, wordSize);
 6500     __ add(a2, a2, wordSize);
 6501     if (AvoidUnalignedAccesses) {
 6502       // Both implementations (SIMD/non-SIMD) use relatively large load
 6503       // instructions (ld1/ldp), which carry a huge penalty (up to 2x exec time)
 6504       // on some CPUs when the address is not at least 16-byte aligned.
 6505       // Arrays are currently 8-byte aligned, so, if needed, we do an additional
 6506       // 8-byte load to make at least the first address 16-byte aligned.
 6507       Label ALIGNED16;
 6508       __ tbz(a1, 3, ALIGNED16);
 6509       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 6510       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 6511       __ sub(cnt1, cnt1, wordSize);
 6512       __ eor(tmp1, tmp1, tmp2);
 6513       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 6514       __ bind(ALIGNED16);
 6515     }
 6516     if (UseSIMDForArrayEquals) {
 6517       if (SoftwarePrefetchHintDistance >= 0) {
 6518         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 6519         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 6520         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 6521             /* prfm = */ true, NOT_EQUAL);
 6522         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 6523         __ br(__ LT, TAIL);
 6524       }
 6525       __ bind(NO_PREFETCH_LARGE_LOOP);
 6526       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 6527           /* prfm = */ false, NOT_EQUAL);
 6528     } else {
 6529       __ push(spilled_regs, sp);
 6530       if (SoftwarePrefetchHintDistance >= 0) {
 6531         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 6532         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 6533         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 6534             /* prfm = */ true, NOT_EQUAL);
 6535         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 6536         __ br(__ LT, TAIL);
 6537       }
 6538       __ bind(NO_PREFETCH_LARGE_LOOP);
 6539       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 6540           /* prfm = */ false, NOT_EQUAL);
 6541     }
 6542     __ bind(TAIL);
 6543       __ cbz(cnt1, EQUAL);
 6544       __ subs(cnt1, cnt1, wordSize);
 6545       __ br(__ LE, POST_LOOP);
 6546     __ bind(SMALL_LOOP);
 6547       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 6548       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 6549       __ subs(cnt1, cnt1, wordSize);
 6550       __ eor(tmp1, tmp1, tmp2);
 6551       __ cbnz(tmp1, NOT_EQUAL);
 6552       __ br(__ GT, SMALL_LOOP);
 6553     __ bind(POST_LOOP);
 6554       __ ldr(tmp1, Address(a1, cnt1));
 6555       __ ldr(tmp2, Address(a2, cnt1));
 6556       __ eor(tmp1, tmp1, tmp2);
 6557       __ cbnz(tmp1, NOT_EQUAL);
 6558     __ bind(EQUAL);
 6559       __ mov(result, true);
 6560     __ bind(NOT_EQUAL);
 6561       if (!UseSIMDForArrayEquals) {
 6562         __ pop(spilled_regs, sp);
 6563       }
 6564     __ bind(NOT_EQUAL_NO_POP);
 6565     __ leave();
 6566     __ ret(lr);
 6567     return entry;
 6568   }
 6569 
 6570   // result = r0 - return value. Contains initial hashcode value on entry.
 6571   // ary = r1 - array address
 6572   // cnt = r2 - elements count
 6573   // Clobbers: v0-v13, rscratch1, rscratch2
 6574   address generate_large_arrays_hashcode(BasicType eltype) {
 6575     const Register result = r0, ary = r1, cnt = r2;
 6576     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 6577     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 6578     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 6579     const FloatRegister vpowm = v13;
 6580 
 6581     ARRAYS_HASHCODE_REGISTERS;
 6582 
 6583     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 6584 
 6585     unsigned int vf; // vectorization factor
 6586     bool multiply_by_halves;
 6587     Assembler::SIMD_Arrangement load_arrangement;
 6588     switch (eltype) {
 6589     case T_BOOLEAN:
 6590     case T_BYTE:
 6591       load_arrangement = Assembler::T8B;
 6592       multiply_by_halves = true;
 6593       vf = 8;
 6594       break;
 6595     case T_CHAR:
 6596     case T_SHORT:
 6597       load_arrangement = Assembler::T8H;
 6598       multiply_by_halves = true;
 6599       vf = 8;
 6600       break;
 6601     case T_INT:
 6602       load_arrangement = Assembler::T4S;
 6603       multiply_by_halves = false;
 6604       vf = 4;
 6605       break;
 6606     default:
 6607       ShouldNotReachHere();
 6608     }
 6609 
 6610     // Unroll factor
 6611     const unsigned uf = 4;
 6612 
 6613     // Effective vectorization factor
 6614     const unsigned evf = vf * uf;
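          // With uf == 4 this gives evf == 32 elements per LARGE_LOOP iteration for the
          // subword types (vf == 8) and evf == 16 for T_INT (vf == 4).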
 6615 
 6616     __ align(CodeEntryAlignment);
 6617 
 6618     StubGenStubId stub_id;
 6619     switch (eltype) {
 6620     case T_BOOLEAN:
 6621       stub_id = StubGenStubId::large_arrays_hashcode_boolean_id;
 6622       break;
 6623     case T_BYTE:
 6624       stub_id = StubGenStubId::large_arrays_hashcode_byte_id;
 6625       break;
 6626     case T_CHAR:
 6627       stub_id = StubGenStubId::large_arrays_hashcode_char_id;
 6628       break;
 6629     case T_SHORT:
 6630       stub_id = StubGenStubId::large_arrays_hashcode_short_id;
 6631       break;
 6632     case T_INT:
 6633       stub_id = StubGenStubId::large_arrays_hashcode_int_id;
 6634       break;
 6635     default:
 6636       stub_id = StubGenStubId::NO_STUBID;
 6637       ShouldNotReachHere();
 6638     };
 6639 
 6640     StubCodeMark mark(this, stub_id);
 6641 
 6642     address entry = __ pc();
 6643     __ enter();
 6644 
 6645     // Put the 0th through 3rd powers of 31 together into a single SIMD register. The register is
 6646     // used in the SMALL and LARGE loops' epilogues. The initialization is hoisted here because the
 6647     // register's value doesn't change throughout either loop.
 6648     __ movw(rscratch1, intpow(31U, 3));
 6649     __ mov(vpow, Assembler::S, 0, rscratch1);
 6650     __ movw(rscratch1, intpow(31U, 2));
 6651     __ mov(vpow, Assembler::S, 1, rscratch1);
 6652     __ movw(rscratch1, intpow(31U, 1));
 6653     __ mov(vpow, Assembler::S, 2, rscratch1);
 6654     __ movw(rscratch1, intpow(31U, 0));
 6655     __ mov(vpow, Assembler::S, 3, rscratch1);
 6656 
 6657     __ mov(vmul0, Assembler::T16B, 0);
 6658     __ mov(vmul0, Assembler::S, 3, result);
 6659 
 6660     __ andr(rscratch2, cnt, (uf - 1) * vf);
 6661     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 6662 
 6663     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 6664     __ mov(vpowm, Assembler::S, 0, rscratch1);
 6665 
 6666     // SMALL LOOP
 6667     __ bind(SMALL_LOOP);
 6668 
 6669     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 6670     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6671     __ subsw(rscratch2, rscratch2, vf);
 6672 
 6673     if (load_arrangement == Assembler::T8B) {
 6674       // Extend 8B to 8H to be able to use vector multiply
 6675       // instructions
 6676       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 6677       if (is_signed_subword_type(eltype)) {
 6678         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6679       } else {
 6680         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6681       }
 6682     }
 6683 
 6684     switch (load_arrangement) {
 6685     case Assembler::T4S:
 6686       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 6687       break;
 6688     case Assembler::T8B:
 6689     case Assembler::T8H:
 6690       assert(is_subword_type(eltype), "subword type expected");
 6691       if (is_signed_subword_type(eltype)) {
 6692         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6693       } else {
 6694         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6695       }
 6696       break;
 6697     default:
 6698       __ should_not_reach_here();
 6699     }
 6700 
 6701     // Process the upper half of a vector
 6702     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 6703       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6704       if (is_signed_subword_type(eltype)) {
 6705         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6706       } else {
 6707         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6708       }
 6709     }
 6710 
 6711     __ br(Assembler::HI, SMALL_LOOP);
 6712 
 6713     // SMALL LOOP'S EPILOGUE
 6714     __ lsr(rscratch2, cnt, exact_log2(evf));
 6715     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 6716 
 6717     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 6718     __ addv(vmul0, Assembler::T4S, vmul0);
 6719     __ umov(result, vmul0, Assembler::S, 0);
 6720 
 6721     // TAIL
 6722     __ bind(TAIL);
 6723 
 6724     // The andr computes cnt % vf. The subtract, shifted by 3, offsets the branch target past
 6725     // vf - 1 - (cnt % vf) pairs of load + madd insns, i.e. only cnt % vf load + madd pairs execute.
 6726     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 6727     __ andr(rscratch2, cnt, vf - 1);
 6728     __ bind(TAIL_SHORTCUT);
 6729     __ adr(rscratch1, BR_BASE);
 6730     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
 6731     __ movw(rscratch2, 0x1f);
 6732     __ br(rscratch1);
 6733 
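          // Each load + madd pair below is expected to encode as two 4-byte
          // instructions, matching the shift by 3 (i.e. 8 bytes per pair) used to
          // compute the branch target above; rscratch2 holds 31, the hashcode
          // multiplier used by maddw.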
 6734     for (size_t i = 0; i < vf - 1; ++i) {
 6735       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 6736                                    eltype);
 6737       __ maddw(result, result, rscratch2, rscratch1);
 6738     }
 6739     __ bind(BR_BASE);
 6740 
 6741     __ leave();
 6742     __ ret(lr);
 6743 
 6744     // LARGE LOOP
 6745     __ bind(LARGE_LOOP_PREHEADER);
 6746 
 6747     __ lsr(rscratch2, cnt, exact_log2(evf));
 6748 
 6749     if (multiply_by_halves) {
 6750       // 31^4 - multiplier between lower and upper parts of a register
 6751       __ movw(rscratch1, intpow(31U, vf / 2));
 6752       __ mov(vpowm, Assembler::S, 1, rscratch1);
 6753       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 6754       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 6755       __ mov(vpowm, Assembler::S, 0, rscratch1);
 6756     } else {
 6757       // 31^16
 6758       __ movw(rscratch1, intpow(31U, evf));
 6759       __ mov(vpowm, Assembler::S, 0, rscratch1);
 6760     }
 6761 
 6762     __ mov(vmul3, Assembler::T16B, 0);
 6763     __ mov(vmul2, Assembler::T16B, 0);
 6764     __ mov(vmul1, Assembler::T16B, 0);
 6765 
 6766     __ bind(LARGE_LOOP);
 6767 
 6768     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 6769     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 6770     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 6771     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6772 
 6773     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 6774            Address(__ post(ary, evf * type2aelembytes(eltype))));
 6775 
 6776     if (load_arrangement == Assembler::T8B) {
 6777       // Extend 8B to 8H to be able to use vector multiply
 6778       // instructions
 6779       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 6780       if (is_signed_subword_type(eltype)) {
 6781         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 6782         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 6783         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 6784         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6785       } else {
 6786         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 6787         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 6788         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 6789         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6790       }
 6791     }
 6792 
 6793     switch (load_arrangement) {
 6794     case Assembler::T4S:
 6795       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 6796       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 6797       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 6798       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 6799       break;
 6800     case Assembler::T8B:
 6801     case Assembler::T8H:
 6802       assert(is_subword_type(eltype), "subword type expected");
 6803       if (is_signed_subword_type(eltype)) {
 6804         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 6805         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 6806         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 6807         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6808       } else {
 6809         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 6810         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 6811         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 6812         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6813       }
 6814       break;
 6815     default:
 6816       __ should_not_reach_here();
 6817     }
 6818 
 6819     // Process the upper half of a vector
 6820     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 6821       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 6822       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 6823       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 6824       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 6825       if (is_signed_subword_type(eltype)) {
 6826         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 6827         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 6828         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 6829         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6830       } else {
 6831         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 6832         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 6833         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 6834         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6835       }
 6836     }
 6837 
 6838     __ subsw(rscratch2, rscratch2, 1);
 6839     __ br(Assembler::HI, LARGE_LOOP);
 6840 
 6841     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 6842     __ addv(vmul3, Assembler::T4S, vmul3);
 6843     __ umov(result, vmul3, Assembler::S, 0);
 6844 
 6845     __ mov(rscratch2, intpow(31U, vf));
 6846 
 6847     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 6848     __ addv(vmul2, Assembler::T4S, vmul2);
 6849     __ umov(rscratch1, vmul2, Assembler::S, 0);
 6850     __ maddw(result, result, rscratch2, rscratch1);
 6851 
 6852     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 6853     __ addv(vmul1, Assembler::T4S, vmul1);
 6854     __ umov(rscratch1, vmul1, Assembler::S, 0);
 6855     __ maddw(result, result, rscratch2, rscratch1);
 6856 
 6857     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 6858     __ addv(vmul0, Assembler::T4S, vmul0);
 6859     __ umov(rscratch1, vmul0, Assembler::S, 0);
 6860     __ maddw(result, result, rscratch2, rscratch1);
 6861 
 6862     __ andr(rscratch2, cnt, vf - 1);
 6863     __ cbnz(rscratch2, TAIL_SHORTCUT);
 6864 
 6865     __ leave();
 6866     __ ret(lr);
 6867 
 6868     return entry;
 6869   }
 6870 
 6871   address generate_dsin_dcos(bool isCos) {
 6872     __ align(CodeEntryAlignment);
 6873     StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id);
 6874     StubCodeMark mark(this, stub_id);
 6875     address start = __ pc();
 6876     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 6877         (address)StubRoutines::aarch64::_two_over_pi,
 6878         (address)StubRoutines::aarch64::_pio2,
 6879         (address)StubRoutines::aarch64::_dsin_coef,
 6880         (address)StubRoutines::aarch64::_dcos_coef);
 6881     return start;
 6882   }
 6883 
 6884   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
 6885   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 6886       Label &DIFF2) {
 6887     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 6888     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 6889 
 6890     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 6891     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 6892     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 6893     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
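          // zip1/zip2 interleave the 16 Latin1 bytes with the zero bytes of vtmpZ,
          // inflating them to 16-bit code units; each resulting 8-byte half is then
          // xor-compared against 8 bytes of the UTF-16 string loaded via cnt1.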
 6894 
 6895     __ fmovd(tmpL, vtmp3);
 6896     __ eor(rscratch2, tmp3, tmpL);
 6897     __ cbnz(rscratch2, DIFF2);
 6898 
 6899     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6900     __ umov(tmpL, vtmp3, __ D, 1);
 6901     __ eor(rscratch2, tmpU, tmpL);
 6902     __ cbnz(rscratch2, DIFF1);
 6903 
 6904     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 6905     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 6906     __ fmovd(tmpL, vtmp);
 6907     __ eor(rscratch2, tmp3, tmpL);
 6908     __ cbnz(rscratch2, DIFF2);
 6909 
 6910     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6911     __ umov(tmpL, vtmp, __ D, 1);
 6912     __ eor(rscratch2, tmpU, tmpL);
 6913     __ cbnz(rscratch2, DIFF1);
 6914   }
 6915 
 6916   // r0  = result
 6917   // r1  = str1
 6918   // r2  = cnt1
 6919   // r3  = str2
 6920   // r4  = cnt2
 6921   // r10 = tmp1
 6922   // r11 = tmp2
 6923   address generate_compare_long_string_different_encoding(bool isLU) {
 6924     __ align(CodeEntryAlignment);
 6925     StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id);
 6926     StubCodeMark mark(this, stub_id);
 6927     address entry = __ pc();
 6928     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 6929         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 6930         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 6931     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 6932         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 6933     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 6934     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 6935 
 6936     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 6937 
 6938     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
 6939     // cnt2 == number of characters left to compare
 6940     // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
 6941     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 6942     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 6943     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 6944     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 6945     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols; the last 4 are a special case.
 6946     __ eor(rscratch2, tmp1, tmp2);
 6947     __ mov(rscratch1, tmp2);
 6948     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 6949     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 6950              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 6951     __ push(spilled_regs, sp);
 6952     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 6953     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 6954 
 6955     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6956 
 6957     if (SoftwarePrefetchHintDistance >= 0) {
 6958       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 6959       __ br(__ LT, NO_PREFETCH);
 6960       __ bind(LARGE_LOOP_PREFETCH);
 6961         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 6962         __ mov(tmp4, 2);
 6963         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 6964         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 6965           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6966           __ subs(tmp4, tmp4, 1);
 6967           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 6968           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 6969           __ mov(tmp4, 2);
 6970         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 6971           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6972           __ subs(tmp4, tmp4, 1);
 6973           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 6974           __ sub(cnt2, cnt2, 64);
 6975           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 6976           __ br(__ GE, LARGE_LOOP_PREFETCH);
 6977     }
 6978     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 6979     __ bind(NO_PREFETCH);
 6980     __ subs(cnt2, cnt2, 16);
 6981     __ br(__ LT, TAIL);
 6982     __ align(OptoLoopAlignment);
 6983     __ bind(SMALL_LOOP); // smaller loop
 6984       __ subs(cnt2, cnt2, 16);
 6985       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6986       __ br(__ GE, SMALL_LOOP);
 6987       __ cmn(cnt2, (u1)16);
 6988       __ br(__ EQ, LOAD_LAST);
 6989     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 6990       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 6991       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 6992       __ ldr(tmp3, Address(cnt1, -8));
 6993       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 6994       __ b(LOAD_LAST);
 6995     __ bind(DIFF2);
 6996       __ mov(tmpU, tmp3);
 6997     __ bind(DIFF1);
 6998       __ pop(spilled_regs, sp);
 6999       __ b(CALCULATE_DIFFERENCE);
 7000     __ bind(LOAD_LAST);
 7001       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
 7002       // No need to load them again
 7003       __ mov(tmpU, tmp3);
 7004       __ pop(spilled_regs, sp);
 7005 
 7006       // tmp2 points to the address of the last 4 Latin1 characters right now
 7007       __ ldrs(vtmp, Address(tmp2));
 7008       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 7009       __ fmovd(tmpL, vtmp);
 7010 
 7011       __ eor(rscratch2, tmpU, tmpL);
 7012       __ cbz(rscratch2, DONE);
 7013 
 7014     // Find the first different characters in the longwords and
 7015     // compute their difference.
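          // rscratch2 holds the XOR of the two chunks being compared; rev + clz
          // locate the lowest-addressed differing byte, andr rounds the bit index
          // down to a 16-bit character boundary, and the shifts and zero-extensions
          // then isolate the two differing characters.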
 7016     __ bind(CALCULATE_DIFFERENCE);
 7017       __ rev(rscratch2, rscratch2);
 7018       __ clz(rscratch2, rscratch2);
 7019       __ andr(rscratch2, rscratch2, -16);
 7020       __ lsrv(tmp1, tmp1, rscratch2);
 7021       __ uxthw(tmp1, tmp1);
 7022       __ lsrv(rscratch1, rscratch1, rscratch2);
 7023       __ uxthw(rscratch1, rscratch1);
 7024       __ subw(result, tmp1, rscratch1);
 7025     __ bind(DONE);
 7026       __ ret(lr);
 7027     return entry;
 7028   }
 7029 
 7030   // r0 = input (float16)
 7031   // v0 = result (float)
 7032   // v1 = temporary float register
 7033   address generate_float16ToFloat() {
 7034     __ align(CodeEntryAlignment);
 7035     StubGenStubId stub_id = StubGenStubId::hf2f_id;
 7036     StubCodeMark mark(this, stub_id);
 7037     address entry = __ pc();
 7038     BLOCK_COMMENT("Entry:");
 7039     __ flt16_to_flt(v0, r0, v1);
 7040     __ ret(lr);
 7041     return entry;
 7042   }
 7043 
 7044   // v0 = input (float)
 7045   // r0 = result (float16)
 7046   // v1 = temporary float register
 7047   address generate_floatToFloat16() {
 7048     __ align(CodeEntryAlignment);
 7049     StubGenStubId stub_id = StubGenStubId::f2hf_id;
 7050     StubCodeMark mark(this, stub_id);
 7051     address entry = __ pc();
 7052     BLOCK_COMMENT("Entry:");
 7053     __ flt_to_flt16(r0, v0, v1);
 7054     __ ret(lr);
 7055     return entry;
 7056   }
 7057 
 7058   address generate_method_entry_barrier() {
 7059     __ align(CodeEntryAlignment);
 7060     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
 7061     StubCodeMark mark(this, stub_id);
 7062 
 7063     Label deoptimize_label;
 7064 
 7065     address start = __ pc();
 7066 
 7067     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 7068 
 7069     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 7070       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 7071       // We can get here despite the nmethod being good, if we have not
 7072       // yet applied our cross modification fence (or data fence).
 7073       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 7074       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 7075       __ ldrw(rscratch2, rscratch2);
 7076       __ strw(rscratch2, thread_epoch_addr);
 7077       __ isb();
 7078       __ membar(__ LoadLoad);
 7079     }
 7080 
 7081     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 7082 
 7083     __ enter();
 7084     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 7085 
 7086     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 7087 
 7088     __ push_call_clobbered_registers();
 7089 
 7090     __ mov(c_rarg0, rscratch2);
 7091     __ call_VM_leaf
 7092          (CAST_FROM_FN_PTR
 7093           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 7094 
 7095     __ reset_last_Java_frame(true);
 7096 
 7097     __ mov(rscratch1, r0);
 7098 
 7099     __ pop_call_clobbered_registers();
 7100 
 7101     __ cbnz(rscratch1, deoptimize_label);
 7102 
 7103     __ leave();
 7104     __ ret(lr);
 7105 
 7106     __ BIND(deoptimize_label);
 7107 
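          // The runtime call above filled the four words reserved on entry with the
          // {sp, fp, lr, pc} to continue at; install them and jump there.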
 7108     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 7109     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 7110 
 7111     __ mov(sp, rscratch1);
 7112     __ br(rscratch2);
 7113 
 7114     return start;
 7115   }
 7116 
 7117   // r0  = result
 7118   // r1  = str1
 7119   // r2  = cnt1
 7120   // r3  = str2
 7121   // r4  = cnt2
 7122   // r10 = tmp1
 7123   // r11 = tmp2
 7124   address generate_compare_long_string_same_encoding(bool isLL) {
 7125     __ align(CodeEntryAlignment);
 7126     StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id);
 7127     StubCodeMark mark(this, stub_id);
 7128     address entry = __ pc();
 7129     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 7130         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 7131 
 7132     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 7133 
 7134     // Exit the large loop when fewer than 64 bytes are left to read or when we
 7135     // are about to prefetch memory past the array border.
 7136     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 7137 
 7138     // The caller has already loaded the first 8 bytes of each string (tmp1, tmp2), so compare them directly
 7139     __ eor(rscratch2, tmp1, tmp2);
 7140     __ cbnz(rscratch2, CAL_DIFFERENCE);
 7141 
 7142     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 7143     // update pointers, because of previous read
 7144     __ add(str1, str1, wordSize);
 7145     __ add(str2, str2, wordSize);
 7146     if (SoftwarePrefetchHintDistance >= 0) {
 7147       __ align(OptoLoopAlignment);
 7148       __ bind(LARGE_LOOP_PREFETCH);
 7149         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 7150         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 7151 
 7152         for (int i = 0; i < 4; i++) {
 7153           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 7154           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 7155           __ cmp(tmp1, tmp2);
 7156           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7157           __ br(Assembler::NE, DIFF);
 7158         }
 7159         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 7160         __ add(str1, str1, 64);
 7161         __ add(str2, str2, 64);
 7162         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 7163         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 7164         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 7165     }
 7166 
 7167     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 7168     __ br(Assembler::LE, LESS16);
 7169     __ align(OptoLoopAlignment);
 7170     __ bind(LOOP_COMPARE16);
 7171       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 7172       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 7173       __ cmp(tmp1, tmp2);
 7174       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7175       __ br(Assembler::NE, DIFF);
 7176       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 7177       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 7178       __ br(Assembler::LT, LESS16);
 7179 
 7180       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 7181       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 7182       __ cmp(tmp1, tmp2);
 7183       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7184       __ br(Assembler::NE, DIFF);
 7185       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 7186       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 7187       __ br(Assembler::GE, LOOP_COMPARE16);
 7188       __ cbz(cnt2, LENGTH_DIFF);
 7189 
 7190     __ bind(LESS16);
 7191       // compare the next 8 bytes if more than 8 bytes remain
 7192       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 7193       __ br(Assembler::LE, LESS8);
 7194       __ ldr(tmp1, Address(__ post(str1, 8)));
 7195       __ ldr(tmp2, Address(__ post(str2, 8)));
 7196       __ eor(rscratch2, tmp1, tmp2);
 7197       __ cbnz(rscratch2, CAL_DIFFERENCE);
 7198       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 7199 
 7200     __ bind(LESS8); // directly load last 8 bytes
 7201       if (!isLL) {
 7202         __ add(cnt2, cnt2, cnt2);
 7203       }
 7204       __ ldr(tmp1, Address(str1, cnt2));
 7205       __ ldr(tmp2, Address(str2, cnt2));
 7206       __ eor(rscratch2, tmp1, tmp2);
 7207       __ cbz(rscratch2, LENGTH_DIFF);
 7208       __ b(CAL_DIFFERENCE);
 7209 
 7210     __ bind(DIFF);
 7211       __ cmp(tmp1, tmp2);
 7212       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 7213       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 7214       // reuse rscratch2 register for the result of eor instruction
 7215       __ eor(rscratch2, tmp1, tmp2);
 7216 
 7217     __ bind(CAL_DIFFERENCE);
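            // rscratch2 holds the XOR of the two differing 8-byte chunks; rev + clz
            // locate the lowest-addressed differing byte, andr rounds the bit index
            // down to a character boundary (8 bits for Latin1, 16 for UTF-16), and
            // the shifts and zero-extensions then isolate the two differing chars.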
 7218       __ rev(rscratch2, rscratch2);
 7219       __ clz(rscratch2, rscratch2);
 7220       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 7221       __ lsrv(tmp1, tmp1, rscratch2);
 7222       __ lsrv(tmp2, tmp2, rscratch2);
 7223       if (isLL) {
 7224         __ uxtbw(tmp1, tmp1);
 7225         __ uxtbw(tmp2, tmp2);
 7226       } else {
 7227         __ uxthw(tmp1, tmp1);
 7228         __ uxthw(tmp2, tmp2);
 7229       }
 7230       __ subw(result, tmp1, tmp2);
 7231 
 7232     __ bind(LENGTH_DIFF);
 7233       __ ret(lr);
 7234     return entry;
 7235   }
 7236 
 7237   enum string_compare_mode {
 7238     LL,
 7239     LU,
 7240     UL,
 7241     UU,
 7242   };
 7243 
 7244   // The following registers are declared in aarch64.ad
 7245   // r0  = result
 7246   // r1  = str1
 7247   // r2  = cnt1
 7248   // r3  = str2
 7249   // r4  = cnt2
 7250   // r10 = tmp1
 7251   // r11 = tmp2
 7252   // z0  = ztmp1
 7253   // z1  = ztmp2
 7254   // p0  = pgtmp1
 7255   // p1  = pgtmp2
 7256   address generate_compare_long_string_sve(string_compare_mode mode) {
 7257     StubGenStubId stub_id;
 7258     switch (mode) {
 7259       case LL: stub_id = StubGenStubId::compare_long_string_LL_id;  break;
 7260       case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break;
 7261       case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break;
 7262       case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break;
 7263       default: ShouldNotReachHere();
 7264     }
 7265 
 7266     __ align(CodeEntryAlignment);
 7267     address entry = __ pc();
 7268     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 7269              tmp1 = r10, tmp2 = r11;
 7270 
 7271     Label LOOP, DONE, MISMATCH;
 7272     Register vec_len = tmp1;
 7273     Register idx = tmp2;
 7274     // The minimum of the string lengths has been stored in cnt2.
 7275     Register cnt = cnt2;
 7276     FloatRegister ztmp1 = z0, ztmp2 = z1;
 7277     PRegister pgtmp1 = p0, pgtmp2 = p1;
 7278 
 7279 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 7280     switch (mode) {                                                            \
 7281       case LL:                                                                 \
 7282         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 7283         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 7284         break;                                                                 \
 7285       case LU:                                                                 \
 7286         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 7287         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 7288         break;                                                                 \
 7289       case UL:                                                                 \
 7290         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 7291         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 7292         break;                                                                 \
 7293       case UU:                                                                 \
 7294         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 7295         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 7296         break;                                                                 \
 7297       default:                                                                 \
 7298         ShouldNotReachHere();                                                  \
 7299     }
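          // Note: the macro body reads str1 and str2 directly; the src1/src2
          // parameters are not referenced.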
 7300 
 7301     StubCodeMark mark(this, stub_id);
 7302 
 7303     __ mov(idx, 0);
 7304     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 7305 
 7306     if (mode == LL) {
 7307       __ sve_cntb(vec_len);
 7308     } else {
 7309       __ sve_cnth(vec_len);
 7310     }
 7311 
 7312     __ sub(rscratch1, cnt, vec_len);
 7313 
 7314     __ bind(LOOP);
 7315 
 7316       // main loop
 7317       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 7318       __ add(idx, idx, vec_len);
 7319       // Compare strings.
 7320       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 7321       __ br(__ NE, MISMATCH);
 7322       __ cmp(idx, rscratch1);
 7323       __ br(__ LT, LOOP);
 7324 
 7325     // post loop, last iteration
 7326     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 7327 
 7328     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 7329     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 7330     __ br(__ EQ, DONE);
 7331 
 7332     __ bind(MISMATCH);
 7333 
 7334     // Build a predicate covering the lanes before the first mismatching element.
 7335     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 7336     // Extract the first different characters of each string.
 7337     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 7338     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 7339 
 7340     // Compute the difference of the first different characters.
 7341     __ sub(result, rscratch1, rscratch2);
 7342 
 7343     __ bind(DONE);
 7344     __ ret(lr);
 7345 #undef LOAD_PAIR
 7346     return entry;
 7347   }
 7348 
 7349   void generate_compare_long_strings() {
 7350     if (UseSVE == 0) {
 7351       StubRoutines::aarch64::_compare_long_string_LL
 7352           = generate_compare_long_string_same_encoding(true);
 7353       StubRoutines::aarch64::_compare_long_string_UU
 7354           = generate_compare_long_string_same_encoding(false);
 7355       StubRoutines::aarch64::_compare_long_string_LU
 7356           = generate_compare_long_string_different_encoding(true);
 7357       StubRoutines::aarch64::_compare_long_string_UL
 7358           = generate_compare_long_string_different_encoding(false);
 7359     } else {
 7360       StubRoutines::aarch64::_compare_long_string_LL
 7361           = generate_compare_long_string_sve(LL);
 7362       StubRoutines::aarch64::_compare_long_string_UU
 7363           = generate_compare_long_string_sve(UU);
 7364       StubRoutines::aarch64::_compare_long_string_LU
 7365           = generate_compare_long_string_sve(LU);
 7366       StubRoutines::aarch64::_compare_long_string_UL
 7367           = generate_compare_long_string_sve(UL);
 7368     }
 7369   }
 7370 
 7371   // R0 = result
 7372   // R1 = str2
 7373   // R2 = cnt1
 7374   // R3 = str1
 7375   // R4 = cnt2
 7376   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 7377   //
 7378   // This generic linear code uses a few additional ideas that make it faster:
 7379   // 1) we can safely keep at least the first register of the pattern (since
 7380   // length >= 8), in order to skip the initial load (helps on systems with a
 7381   // single load pipeline)
 7382   // 2) we can use a "fast" algorithm for finding the first pattern character
 7383   // with fewer branches (one branch per loaded register instead of one per
 7384   // symbol); this is where constants like 0x0101...01, 0x00010001...0001,
 7385   // 0x7f7f...7f and 0x7fff7fff...7fff come from (see the sketch below)
 7386   // 3) after loading and analyzing the first register of the source string,
 7387   // it can be used to search for every occurrence of the first character,
 7388   // saving a few loads compared with a "simpler-but-slower" implementation
 7389   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
 7390   // re-initializes and compresses register values, which makes the code larger and
 7391   // a bit less readable; however, most of the extra operations are issued during loads or branches, so the penalty is minimal
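        //
        // A sketch of the "fast" first-character search in (2), assuming Latin1
        // data (the UTF-16 variants of the constants work the same way per
        // 16-bit lane):
        //   x    = chunk ^ (first_char replicated into every lane);
        //   mask = (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f);
        // mask is non-zero iff some lane of x is zero, i.e. iff the chunk
        // contains the first pattern character, and its lowest set bit marks the
        // lowest-addressed match (located later with rbit + clz). The bics
        // instruction below computes the AND-NOT and sets the flags in one step.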
 7392   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 7393     StubGenStubId stub_id;
 7394     if (str1_isL) {
 7395       if (str2_isL) {
 7396         stub_id = StubGenStubId::string_indexof_linear_ll_id;
 7397       } else {
 7398         stub_id = StubGenStubId::string_indexof_linear_ul_id;
 7399       }
 7400     } else {
 7401       if (str2_isL) {
 7402         ShouldNotReachHere();
 7403       } else {
 7404         stub_id = StubGenStubId::string_indexof_linear_uu_id;
 7405       }
 7406     }
 7407     __ align(CodeEntryAlignment);
 7408     StubCodeMark mark(this, stub_id);
 7409     address entry = __ pc();
 7410 
 7411     int str1_chr_size = str1_isL ? 1 : 2;
 7412     int str2_chr_size = str2_isL ? 1 : 2;
 7413     int str1_chr_shift = str1_isL ? 0 : 1;
 7414     int str2_chr_shift = str2_isL ? 0 : 1;
 7415     bool isL = str1_isL && str2_isL;
 7416     // parameters
 7417     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 7418     // temporary registers
 7419     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 7420     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 7421     // redefinitions
 7422     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 7423 
 7424     __ push(spilled_regs, sp);
 7425     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 7426         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 7427         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 7428         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 7429         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 7430         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 7431     // Read whole register from str1. It is safe, because length >=8 here
 7432     __ ldr(ch1, Address(str1));
 7433     // Read whole register from str2. It is safe, because length >=8 here
 7434     __ ldr(ch2, Address(str2));
 7435     __ sub(cnt2, cnt2, cnt1);
 7436     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 7437     if (str1_isL != str2_isL) {
 7438       __ eor(v0, __ T16B, v0, v0);
 7439     }
 7440     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 7441     __ mul(first, first, tmp1);
 7442     // check if we have less than one register's worth of characters left to check
 7443     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 7444     if (str1_isL != str2_isL) {
 7445       __ fmovd(v1, ch1);
 7446     }
 7447     __ br(__ LE, L_SMALL);
 7448     __ eor(ch2, first, ch2);
 7449     if (str1_isL != str2_isL) {
 7450       __ zip1(v1, __ T16B, v1, v0);
 7451     }
 7452     __ sub(tmp2, ch2, tmp1);
 7453     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7454     __ bics(tmp2, tmp2, ch2);
 7455     if (str1_isL != str2_isL) {
 7456       __ fmovd(ch1, v1);
 7457     }
 7458     __ br(__ NE, L_HAS_ZERO);
 7459     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 7460     __ add(result, result, wordSize/str2_chr_size);
 7461     __ add(str2, str2, wordSize);
 7462     __ br(__ LT, L_POST_LOOP);
 7463     __ BIND(L_LOOP);
 7464       __ ldr(ch2, Address(str2));
 7465       __ eor(ch2, first, ch2);
 7466       __ sub(tmp2, ch2, tmp1);
 7467       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7468       __ bics(tmp2, tmp2, ch2);
 7469       __ br(__ NE, L_HAS_ZERO);
 7470     __ BIND(L_LOOP_PROCEED);
 7471       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 7472       __ add(str2, str2, wordSize);
 7473       __ add(result, result, wordSize/str2_chr_size);
 7474       __ br(__ GE, L_LOOP);
 7475     __ BIND(L_POST_LOOP);
 7476       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 7477       __ br(__ LE, NOMATCH);
 7478       __ ldr(ch2, Address(str2));
 7479       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 7480       __ eor(ch2, first, ch2);
 7481       __ sub(tmp2, ch2, tmp1);
 7482       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7483       __ mov(tmp4, -1); // all bits set
 7484       __ b(L_SMALL_PROCEED);
 7485     __ align(OptoLoopAlignment);
 7486     __ BIND(L_SMALL);
 7487       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 7488       __ eor(ch2, first, ch2);
 7489       if (str1_isL != str2_isL) {
 7490         __ zip1(v1, __ T16B, v1, v0);
 7491       }
 7492       __ sub(tmp2, ch2, tmp1);
 7493       __ mov(tmp4, -1); // all bits set
 7494       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7495       if (str1_isL != str2_isL) {
 7496         __ fmovd(ch1, v1); // move converted 4 symbols
 7497       }
 7498     __ BIND(L_SMALL_PROCEED);
 7499       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 7500       __ bic(tmp2, tmp2, ch2);
 7501       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 7502       __ rbit(tmp2, tmp2);
 7503       __ br(__ EQ, NOMATCH);
 7504     __ BIND(L_SMALL_HAS_ZERO_LOOP);
 7505       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 7506       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 7507       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 7508       if (str2_isL) { // LL
 7509         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 7510         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 7511         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 7512         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 7513         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7514       } else {
 7515         __ mov(ch2, 0xE); // all bits in byte set except last one
 7516         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7517         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7518         __ lslv(tmp2, tmp2, tmp4);
 7519         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7520         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7521         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7522         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7523       }
 7524       __ cmp(ch1, ch2);
 7525       __ mov(tmp4, wordSize/str2_chr_size);
 7526       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7527     __ BIND(L_SMALL_CMP_LOOP);
 7528       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 7529                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 7530       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 7531                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 7532       __ add(tmp4, tmp4, 1);
 7533       __ cmp(tmp4, cnt1);
 7534       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 7535       __ cmp(first, ch2);
 7536       __ br(__ EQ, L_SMALL_CMP_LOOP);
 7537     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 7538       __ cbz(tmp2, NOMATCH); // no more matches. exit
 7539       __ clz(tmp4, tmp2);
 7540       __ add(result, result, 1); // advance index
 7541       __ add(str2, str2, str2_chr_size); // advance pointer
 7542       __ b(L_SMALL_HAS_ZERO_LOOP);
 7543     __ align(OptoLoopAlignment);
 7544     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 7545       __ cmp(first, ch2);
 7546       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7547       __ b(DONE);
 7548     __ align(OptoLoopAlignment);
 7549     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 7550       if (str2_isL) { // LL
 7551         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 7552         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 7553         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 7554         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 7555         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7556       } else {
 7557         __ mov(ch2, 0xE); // all bits in byte set except last one
 7558         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7559         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7560         __ lslv(tmp2, tmp2, tmp4);
 7561         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7562         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7563         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7564         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7565       }
 7566       __ cmp(ch1, ch2);
 7567       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7568       __ b(DONE);
 7569     __ align(OptoLoopAlignment);
 7570     __ BIND(L_HAS_ZERO);
 7571       __ rbit(tmp2, tmp2);
 7572       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 7573       // Now, compress the counters (cnt2 and cnt1) into one register.
 7574       // This is fine because both counters are 32-bit and are not changed in
 7575       // this loop; they are restored on exit, so cnt1 can be re-used meanwhile.
 7576       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 7577       __ sub(result, result, 1);
 7578     __ BIND(L_HAS_ZERO_LOOP);
 7579       __ mov(cnt1, wordSize/str2_chr_size);
 7580       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 7581       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 7582       if (str2_isL) {
 7583         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 7584         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7585         __ lslv(tmp2, tmp2, tmp4);
 7586         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7587         __ add(tmp4, tmp4, 1);
 7588         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7589         __ lsl(tmp2, tmp2, 1);
 7590         __ mov(tmp4, wordSize/str2_chr_size);
 7591       } else {
 7592         __ mov(ch2, 0xE);
 7593         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7594         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7595         __ lslv(tmp2, tmp2, tmp4);
 7596         __ add(tmp4, tmp4, 1);
 7597         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7598         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 7599         __ lsl(tmp2, tmp2, 1);
 7600         __ mov(tmp4, wordSize/str2_chr_size);
 7601         __ sub(str2, str2, str2_chr_size);
 7602       }
 7603       __ cmp(ch1, ch2);
 7604       __ mov(tmp4, wordSize/str2_chr_size);
 7605       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7606     __ BIND(L_CMP_LOOP);
 7607       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 7608                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 7609       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 7610                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 7611       __ add(tmp4, tmp4, 1);
 7612       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 7613       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 7614       __ cmp(cnt1, ch2);
 7615       __ br(__ EQ, L_CMP_LOOP);
 7616     __ BIND(L_CMP_LOOP_NOMATCH);
 7617       // we did not match here
 7618       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 7619       __ clz(tmp4, tmp2);
 7620       __ add(str2, str2, str2_chr_size); // advance pointer
 7621       __ b(L_HAS_ZERO_LOOP);
 7622     __ align(OptoLoopAlignment);
 7623     __ BIND(L_CMP_LOOP_LAST_CMP);
 7624       __ cmp(cnt1, ch2);
 7625       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7626       __ b(DONE);
 7627     __ align(OptoLoopAlignment);
 7628     __ BIND(L_CMP_LOOP_LAST_CMP2);
 7629       if (str2_isL) {
 7630         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 7631         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7632         __ lslv(tmp2, tmp2, tmp4);
 7633         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7634         __ add(tmp4, tmp4, 1);
 7635         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7636         __ lsl(tmp2, tmp2, 1);
 7637       } else {
 7638         __ mov(ch2, 0xE);
 7639         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7640         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7641         __ lslv(tmp2, tmp2, tmp4);
 7642         __ add(tmp4, tmp4, 1);
 7643         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7644         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 7645         __ lsl(tmp2, tmp2, 1);
 7646         __ sub(str2, str2, str2_chr_size);
 7647       }
 7648       __ cmp(ch1, ch2);
 7649       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7650       __ b(DONE);
 7651     __ align(OptoLoopAlignment);
 7652     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
 7653       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
 7654       // until the L_HAS_ZERO block. The byte octet was analyzed in
 7655       // L_HAS_ZERO_LOOP, so result was increased by at most
 7656       // wordSize/str2_chr_size - 1 and the respective higher bits were not
 7657       // changed. L_LOOP_PROCEED will increase result by the number of analyzed
 7658       // characters, so we can simply reset the lower bits of result here:
 7659       // clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
 7660       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
 7661       // 3) Advance str2 to the next octet. result & 7 (LL) / 3 (UU/UL) is the
 7662       // index of the last analyzed substring inside the current octet, so str2 points at its start and must be advanced to the next octet.
 7663       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 7664       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 7665       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 7666       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 7667       __ movw(cnt2, cnt2);
 7668       __ b(L_LOOP_PROCEED);
 7669     __ align(OptoLoopAlignment);
 7670     __ BIND(NOMATCH);
 7671       __ mov(result, -1);
 7672     __ BIND(DONE);
 7673       __ pop(spilled_regs, sp);
 7674       __ ret(lr);
 7675     return entry;
 7676   }
 7677 
 7678   void generate_string_indexof_stubs() {
 7679     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 7680     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 7681     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 7682   }
 7683 
 7684   void inflate_and_store_2_fp_registers(bool generatePrfm,
 7685       FloatRegister src1, FloatRegister src2) {
 7686     Register dst = r1;
 7687     __ zip1(v1, __ T16B, src1, v0);
 7688     __ zip2(v2, __ T16B, src1, v0);
 7689     if (generatePrfm) {
 7690       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 7691     }
 7692     __ zip1(v3, __ T16B, src2, v0);
 7693     __ zip2(v4, __ T16B, src2, v0);
 7694     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 7695   }
 7696 
 7697   // R0 = src
 7698   // R1 = dst
 7699   // R2 = len
 7700   // R3 = len >> 3
 7701   // V0 = 0
 7702   // v1 = loaded 8 bytes
 7703   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 7704   address generate_large_byte_array_inflate() {
 7705     __ align(CodeEntryAlignment);
 7706     StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id;
 7707     StubCodeMark mark(this, stub_id);
 7708     address entry = __ pc();
 7709     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 7710     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 7711     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 7712 
 7713     // Do one more 8-byte read so that the address is 16-byte aligned in most
 7714     // cases, and so that a single store instruction can be used.
 7715     __ ldrd(v2, __ post(src, 8));
 7716     __ sub(octetCounter, octetCounter, 2);
 7717     __ zip1(v1, __ T16B, v1, v0);
 7718     __ zip1(v2, __ T16B, v2, v0);
 7719     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 7720     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7721     __ subs(rscratch1, octetCounter, large_loop_threshold);
 7722     __ br(__ LE, LOOP_START);
 7723     __ b(LOOP_PRFM_START);
 7724     __ bind(LOOP_PRFM);
 7725       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7726     __ bind(LOOP_PRFM_START);
 7727       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 7728       __ sub(octetCounter, octetCounter, 8);
 7729       __ subs(rscratch1, octetCounter, large_loop_threshold);
 7730       inflate_and_store_2_fp_registers(true, v3, v4);
 7731       inflate_and_store_2_fp_registers(true, v5, v6);
 7732       __ br(__ GT, LOOP_PRFM);
 7733       __ cmp(octetCounter, (u1)8);
 7734       __ br(__ LT, DONE);
 7735     __ bind(LOOP);
 7736       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7737       __ bind(LOOP_START);
 7738       __ sub(octetCounter, octetCounter, 8);
 7739       __ cmp(octetCounter, (u1)8);
 7740       inflate_and_store_2_fp_registers(false, v3, v4);
 7741       inflate_and_store_2_fp_registers(false, v5, v6);
 7742       __ br(__ GE, LOOP);
 7743     __ bind(DONE);
 7744       __ ret(lr);
 7745     return entry;
 7746   }
 7747 
 7748   /**
 7749    *  Arguments:
 7750    *
 7751    *  Input:
 7752    *  c_rarg0   - current state address
 7753    *  c_rarg1   - H key address
 7754    *  c_rarg2   - data address
 7755    *  c_rarg3   - number of blocks
 7756    *
 7757    *  Output:
 7758    *  Updated state at c_rarg0
 7759    */
 7760   address generate_ghash_processBlocks() {
 7761     // Bafflingly, GCM uses little-endian for the byte order, but
 7762     // big-endian for the bit order.  For example, the polynomial 1 is
 7763     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 7764     //
 7765     // So, we must either reverse the bytes in each word and do
 7766     // everything big-endian or reverse the bits in each byte and do
 7767     // it little-endian.  On AArch64 it's more idiomatic to reverse
 7768     // the bits in each byte (we have an instruction, RBIT, to do
 7769     // that) and keep the data in little-endian bit order through the
 7770     // calculation, bit-reversing the inputs and outputs.
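          //
          // For each block, the loop below computes state <- (state ^ data) * H
          // modulo the field polynomial, with all values kept in bit-reversed form.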
 7771 
 7772     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
 7773     StubCodeMark mark(this, stub_id);
 7774     __ align(wordSize * 2);
 7775     address p = __ pc();
 7776     __ emit_int64(0x87);  // The low-order bits of the field
 7777                           // polynomial (i.e. p = z^7+z^2+z+1)
 7778                           // repeated in the low and high parts of a
 7779                           // 128-bit vector
 7780     __ emit_int64(0x87);
 7781 
 7782     __ align(CodeEntryAlignment);
 7783     address start = __ pc();
 7784 
 7785     Register state   = c_rarg0;
 7786     Register subkeyH = c_rarg1;
 7787     Register data    = c_rarg2;
 7788     Register blocks  = c_rarg3;
 7789 
 7790     FloatRegister vzr = v30;
 7791     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 7792 
 7793     __ ldrq(v24, p);    // The field polynomial
 7794 
 7795     __ ldrq(v0, Address(state));
 7796     __ ldrq(v1, Address(subkeyH));
 7797 
 7798     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 7799     __ rbit(v0, __ T16B, v0);
 7800     __ rev64(v1, __ T16B, v1);
 7801     __ rbit(v1, __ T16B, v1);
 7802 
 7803     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 7804     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 7805 
 7806     {
 7807       Label L_ghash_loop;
 7808       __ bind(L_ghash_loop);
 7809 
 7810       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 7811                                                  // reversing each byte
 7812       __ rbit(v2, __ T16B, v2);
 7813       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 7814 
 7815       // Multiply state in v2 by subkey in v1
 7816       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 7817                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 7818                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 7819       // Reduce v7:v5 by the field polynomial
 7820       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 7821 
 7822       __ sub(blocks, blocks, 1);
 7823       __ cbnz(blocks, L_ghash_loop);
 7824     }
 7825 
 7826     // The bit-reversed result is at this point in v0
 7827     __ rev64(v0, __ T16B, v0);
 7828     __ rbit(v0, __ T16B, v0);
 7829 
 7830     __ st1(v0, __ T16B, state);
 7831     __ ret(lr);
 7832 
 7833     return start;
 7834   }
 7835 
 7836   address generate_ghash_processBlocks_wide() {
 7837     address small = generate_ghash_processBlocks();
 7838 
 7839     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id;
 7840     StubCodeMark mark(this, stub_id);
 7841     __ align(wordSize * 2);
 7842     address p = __ pc();
 7843     __ emit_int64(0x87);  // The low-order bits of the field
 7844                           // polynomial (i.e. p = z^7+z^2+z+1)
 7845                           // repeated in the low and high parts of a
 7846                           // 128-bit vector
 7847     __ emit_int64(0x87);
 7848 
 7849     __ align(CodeEntryAlignment);
 7850     address start = __ pc();
 7851 
 7852     Register state   = c_rarg0;
 7853     Register subkeyH = c_rarg1;
 7854     Register data    = c_rarg2;
 7855     Register blocks  = c_rarg3;
 7856 
 7857     const int unroll = 4;
 7858 
 7859     __ cmp(blocks, (unsigned char)(unroll * 2));
 7860     __ br(__ LT, small);
 7861 
 7862     if (unroll > 1) {
 7863       // Save the callee-saved SIMD registers (v8-v15) before entering the routine
 7864       __ sub(sp, sp, 4 * 16);
 7865       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 7866       __ sub(sp, sp, 4 * 16);
 7867       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 7868     }
 7869 
 7870     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
 7871 
 7872     if (unroll > 1) {
 7873       // And restore state
 7874       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 7875       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 7876     }
 7877 
 7878     __ cmp(blocks, (unsigned char)0);
 7879     __ br(__ GT, small);
 7880 
 7881     __ ret(lr);
 7882 
 7883     return start;
 7884   }
 7885 
 7886   void generate_base64_encode_simdround(Register src, Register dst,
 7887         FloatRegister codec, u8 size) {
 7888 
 7889     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 7890     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 7891     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 7892 
 7893     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 7894 
 7895     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 7896 
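          // Split each 3-byte group (b0, b1, b2) into four 6-bit codec indices:
          //   ind0 = b0 >> 2
          //   ind1 = ((b0 & 0x3) << 4) | (b1 >> 4)
          //   ind2 = ((b1 & 0xf) << 2) | (b2 >> 6)
          //   ind3 = b2 & 0x3f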
 7897     __ ushr(ind0, arrangement, in0,  2);
 7898 
 7899     __ ushr(ind1, arrangement, in1,  2);
 7900     __ shl(in0,   arrangement, in0,  6);
 7901     __ orr(ind1,  arrangement, ind1, in0);
 7902     __ ushr(ind1, arrangement, ind1, 2);
 7903 
 7904     __ ushr(ind2, arrangement, in2,  4);
 7905     __ shl(in1,   arrangement, in1,  4);
 7906     __ orr(ind2,  arrangement, in1,  ind2);
 7907     __ ushr(ind2, arrangement, ind2, 2);
 7908 
 7909     __ shl(ind3,  arrangement, in2,  2);
 7910     __ ushr(ind3, arrangement, ind3, 2);
 7911 
 7912     __ tbl(out0,  arrangement, codec,  4, ind0);
 7913     __ tbl(out1,  arrangement, codec,  4, ind1);
 7914     __ tbl(out2,  arrangement, codec,  4, ind2);
 7915     __ tbl(out3,  arrangement, codec,  4, ind3);
 7916 
 7917     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 7918   }
 7919 
 7920   /**
 7921    *  Arguments:
 7922    *
 7923    *  Input:
 7924    *  c_rarg0   - src_start
 7925    *  c_rarg1   - src_offset
 7926    *  c_rarg2   - src_length
 7927    *  c_rarg3   - dest_start
 7928    *  c_rarg4   - dest_offset
 7929    *  c_rarg5   - isURL
 7930    *
 7931    */
 7932   address generate_base64_encodeBlock() {
 7933 
 7934     static const char toBase64[64] = {
 7935       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 7936       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 7937       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 7938       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 7939       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 7940     };
 7941 
 7942     static const char toBase64URL[64] = {
 7943       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 7944       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 7945       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 7946       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 7947       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 7948     };
 7949 
 7950     __ align(CodeEntryAlignment);
 7951     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
 7952     StubCodeMark mark(this, stub_id);
 7953     address start = __ pc();
 7954 
 7955     Register src   = c_rarg0;  // source array
 7956     Register soff  = c_rarg1;  // source start offset
 7957     Register send  = c_rarg2;  // source end offset
 7958     Register dst   = c_rarg3;  // dest array
 7959     Register doff  = c_rarg4;  // position for writing to dest array
 7960     Register isURL = c_rarg5;  // Base64 or URL character set
 7961 
 7962     // c_rarg6 and c_rarg7 are free to use as temps
 7963     Register codec  = c_rarg6;
 7964     Register length = c_rarg7;
 7965 
 7966     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 7967 
 7968     __ add(src, src, soff);
 7969     __ add(dst, dst, doff);
 7970     __ sub(length, send, soff);
 7971 
 7972     // load the codec base address
 7973     __ lea(codec, ExternalAddress((address) toBase64));
 7974     __ cbz(isURL, ProcessData);
 7975     __ lea(codec, ExternalAddress((address) toBase64URL));
 7976 
 7977     __ BIND(ProcessData);
 7978 
 7979     // too short to set up a SIMD loop; fall back to the 3-byte scalar loop
 7980     __ cmp(length, (u1)24);
 7981     __ br(Assembler::LT, Process3B);
 7982 
 7983     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 7984 
 7985     __ BIND(Process48B);
 7986     __ cmp(length, (u1)48);
 7987     __ br(Assembler::LT, Process24B);
 7988     generate_base64_encode_simdround(src, dst, v0, 16);
 7989     __ sub(length, length, 48);
 7990     __ b(Process48B);
 7991 
 7992     __ BIND(Process24B);
 7993     __ cmp(length, (u1)24);
 7994     __ br(Assembler::LT, SIMDExit);
 7995     generate_base64_encode_simdround(src, dst, v0, 8);
 7996     __ sub(length, length, 24);
 7997 
 7998     __ BIND(SIMDExit);
 7999     __ cbz(length, Exit);
 8000 
 8001     __ BIND(Process3B);
 8002     //  3 src bytes, 24 bits
 8003     __ ldrb(r10, __ post(src, 1));
 8004     __ ldrb(r11, __ post(src, 1));
 8005     __ ldrb(r12, __ post(src, 1));
 8006     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 8007     __ orrw(r12, r12, r11, Assembler::LSL, 8);
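          // r12 now holds the 24-bit group: (first byte << 16) | (second byte << 8) | third byte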
 8008     // codec index
 8009     __ ubfmw(r15, r12, 18, 23);
 8010     __ ubfmw(r14, r12, 12, 17);
 8011     __ ubfmw(r13, r12, 6,  11);
 8012     __ andw(r12,  r12, 63);
 8013     // get the code based on the codec
 8014     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 8015     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 8016     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 8017     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 8018     __ strb(r15, __ post(dst, 1));
 8019     __ strb(r14, __ post(dst, 1));
 8020     __ strb(r13, __ post(dst, 1));
 8021     __ strb(r12, __ post(dst, 1));
 8022     __ sub(length, length, 3);
 8023     __ cbnz(length, Process3B);
 8024 
 8025     __ BIND(Exit);
 8026     __ ret(lr);
 8027 
 8028     return start;
 8029   }
 8030 
 8031   void generate_base64_decode_simdround(Register src, Register dst,
 8032         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 8033 
 8034     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 8035     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 8036 
 8037     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 8038     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 8039 
 8040     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 8041 
 8042     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 8043 
 8044     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 8045 
 8046     // We need an unsigned saturating subtract to make sure all input values
 8047     // in the range [0, 63] map to 0 in the higher-half lookup.
 8048     __ uqsubv(decH0, __ T16B, in0, v27);
 8049     __ uqsubv(decH1, __ T16B, in1, v27);
 8050     __ uqsubv(decH2, __ T16B, in2, v27);
 8051     __ uqsubv(decH3, __ T16B, in3, v27);
 8052 
 8053     // lower half lookup
 8054     __ tbl(decL0, arrangement, codecL, 4, in0);
 8055     __ tbl(decL1, arrangement, codecL, 4, in1);
 8056     __ tbl(decL2, arrangement, codecL, 4, in2);
 8057     __ tbl(decL3, arrangement, codecL, 4, in3);
 8058 
 8059     // higher half lookup
 8060     __ tbx(decH0, arrangement, codecH, 4, decH0);
 8061     __ tbx(decH1, arrangement, codecH, 4, decH1);
 8062     __ tbx(decH2, arrangement, codecH, 4, decH2);
 8063     __ tbx(decH3, arrangement, codecH, 4, decH3);
 8064 
 8065     // combine lower and higher
 8066     __ orr(decL0, arrangement, decL0, decH0);
 8067     __ orr(decL1, arrangement, decL1, decH1);
 8068     __ orr(decL2, arrangement, decL2, decH2);
 8069     __ orr(decL3, arrangement, decL3, decH3);
 8070 
 8071     // check for illegal inputs: any value larger than 63 (the maximum for 6 bits)
 8072     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 8073     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 8074     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 8075     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 8076     __ orr(in0, arrangement, decH0, decH1);
 8077     __ orr(in1, arrangement, decH2, decH3);
 8078     __ orr(in2, arrangement, in0,   in1);
 8079     __ umaxv(in3, arrangement, in2);
 8080     __ umov(rscratch2, in3, __ B, 0);
 8081 
 8082     // get the data to output
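          // Pack the four 6-bit values back into three bytes:
          //   out0 = (decL0 << 2) | (decL1 >> 4)
          //   out1 = (decL1 << 4) | (decL2 >> 2)
          //   out2 = (decL2 << 6) | decL3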
 8083     __ shl(out0,  arrangement, decL0, 2);
 8084     __ ushr(out1, arrangement, decL1, 4);
 8085     __ orr(out0,  arrangement, out0,  out1);
 8086     __ shl(out1,  arrangement, decL1, 4);
 8087     __ ushr(out2, arrangement, decL2, 2);
 8088     __ orr(out1,  arrangement, out1,  out2);
 8089     __ shl(out2,  arrangement, decL2, 6);
 8090     __ orr(out2,  arrangement, out2,  decL3);
 8091 
 8092     __ cbz(rscratch2, NoIllegalData);
 8093 
 8094     // handle illegal input
 8095     __ umov(r10, in2, __ D, 0);
 8096     if (size == 16) {
 8097       __ cbnz(r10, ErrorInLowerHalf);
 8098 
 8099       // illegal input is in higher half, store the lower half now.
 8100       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 8101 
 8102       __ umov(r10, in2,  __ D, 1);
 8103       __ umov(r11, out0, __ D, 1);
 8104       __ umov(r12, out1, __ D, 1);
 8105       __ umov(r13, out2, __ D, 1);
 8106       __ b(StoreLegalData);
 8107 
 8108       __ BIND(ErrorInLowerHalf);
 8109     }
 8110     __ umov(r11, out0, __ D, 0);
 8111     __ umov(r12, out1, __ D, 0);
 8112     __ umov(r13, out2, __ D, 0);
 8113 
 8114     __ BIND(StoreLegalData);
 8115     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
 8116     __ strb(r11, __ post(dst, 1));
 8117     __ strb(r12, __ post(dst, 1));
 8118     __ strb(r13, __ post(dst, 1));
 8119     __ lsr(r10, r10, 8);
 8120     __ lsr(r11, r11, 8);
 8121     __ lsr(r12, r12, 8);
 8122     __ lsr(r13, r13, 8);
 8123     __ b(StoreLegalData);
 8124 
 8125     __ BIND(NoIllegalData);
 8126     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
 8127   }
 8128 
 8129 
 8130   /**
 8131    *  Arguments:
 8132    *
 8133    *  Input:
 8134    *  c_rarg0   - src_start
 8135    *  c_rarg1   - src_offset
 8136    *  c_rarg2   - src_length
 8137    *  c_rarg3   - dest_start
 8138    *  c_rarg4   - dest_offset
 8139    *  c_rarg5   - isURL
 8140    *  c_rarg6   - isMIME
 8141    *
 8142    */
 8143   address generate_base64_decodeBlock() {
 8144 
 8145     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
 8146     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the section
 8147     // titled "Base64 decoding".
 8148 
 8149     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
 8150     // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That
 8151     // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
 8152     static const uint8_t fromBase64ForNoSIMD[256] = {
 8153       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8154       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8155       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 8156        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8157       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 8158        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
 8159       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 8160        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 8161       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8162       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8163       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8164       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8165       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8166       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8167       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8168       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8169     };
 8170 
 8171     static const uint8_t fromBase64URLForNoSIMD[256] = {
 8172       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8173       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8174       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 8175        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8176       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 8177        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
 8178       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 8179        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 8180       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8181       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8182       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8183       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8184       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8185       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8186       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8187       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8188     };
 8189 
 8190     // A legal base64 code value is in the range [0, 127].  We need two lookups
 8191     // with tbl/tbx and combine them to get the decoded data. The 1st table vector
 8192     // lookup uses tbl: out-of-range indices are set to 0 in the destination. The
 8193     // 2nd table vector lookup uses tbx: out-of-range indices are left unchanged
 8194     // in the destination. Input [64..126] is mapped to index [65, 127] in the
 8195     // second lookup. The value at index 64 is set to 0, so that we know we have
 8196     // already obtained the decoded data with the 1st lookup.
 8197     static const uint8_t fromBase64ForSIMD[128] = {
 8198       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8199       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8200       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 8201        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8202         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 8203        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 8204       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 8205        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 8206     };
 8207 
 8208     static const uint8_t fromBase64URLForSIMD[128] = {
 8209       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8210       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8211       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 8212        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8213         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 8214        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 8215        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 8216        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 8217     };
 8218 
 8219     __ align(CodeEntryAlignment);
 8220     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
 8221     StubCodeMark mark(this, stub_id);
 8222     address start = __ pc();
 8223 
 8224     Register src    = c_rarg0;  // source array
 8225     Register soff   = c_rarg1;  // source start offset
 8226     Register send   = c_rarg2;  // source end offset
 8227     Register dst    = c_rarg3;  // dest array
 8228     Register doff   = c_rarg4;  // position for writing to dest array
 8229     Register isURL  = c_rarg5;  // Base64 or URL character set
 8230     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
 8231 
 8232     Register length = send;    // reuse send as length of source data to process
 8233 
 8234     Register simd_codec   = c_rarg6;
 8235     Register nosimd_codec = c_rarg7;
 8236 
 8237     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
 8238 
 8239     __ enter();
 8240 
 8241     __ add(src, src, soff);
 8242     __ add(dst, dst, doff);
 8243 
 8244     __ mov(doff, dst);
 8245 
 8246     __ sub(length, send, soff);
  8247     __ bfm(length, zr, 0, 1);  // clear the low two bits: round length down to a multiple of 4
 8248 
 8249     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
 8250     __ cbz(isURL, ProcessData);
 8251     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
 8252 
 8253     __ BIND(ProcessData);
 8254     __ mov(rscratch1, length);
 8255     __ cmp(length, (u1)144); // 144 = 80 + 64
 8256     __ br(Assembler::LT, Process4B);
 8257 
 8258     // In the MIME case, the line length cannot be more than 76
 8259     // bytes (see RFC 2045). This is too short a block for SIMD
 8260     // to be worthwhile, so we use non-SIMD here.
 8261     __ movw(rscratch1, 79);
 8262 
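           // Scalar path: each iteration decodes 4 Base64 characters into 3 output
           // bytes. In C, approximately (an illustrative sketch of the bit-twiddling
           // done with ubfx/bfi/bfm/rev16 below; s/d are the current src/dst bytes):
           //
           //   uint8_t d0 = codec[s[0]], d1 = codec[s[1]], d2 = codec[s[2]], d3 = codec[s[3]];
           //   if ((d0 | d1 | d2 | d3) & 0x80) goto Exit;      // 255u marks illegal input
           //   d[0] = (uint8_t)((d0 << 2) | (d1 >> 4));
           //   d[1] = (uint8_t)((d1 << 4) | (d2 >> 2));
           //   d[2] = (uint8_t)((d2 << 6) | d3);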
 8263     __ BIND(Process4B);
 8264     __ ldrw(r14, __ post(src, 4));
 8265     __ ubfxw(r10, r14, 0,  8);
 8266     __ ubfxw(r11, r14, 8,  8);
 8267     __ ubfxw(r12, r14, 16, 8);
 8268     __ ubfxw(r13, r14, 24, 8);
  8269     // look up the decoded 6-bit values
 8270     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
 8271     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
 8272     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
 8273     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
 8274     // error detection, 255u indicates an illegal input
 8275     __ orrw(r14, r10, r11);
 8276     __ orrw(r15, r12, r13);
 8277     __ orrw(r14, r14, r15);
 8278     __ tbnz(r14, 7, Exit);
 8279     // recover the data
 8280     __ lslw(r14, r10, 10);
 8281     __ bfiw(r14, r11, 4, 6);
 8282     __ bfmw(r14, r12, 2, 5);
 8283     __ rev16w(r14, r14);
 8284     __ bfiw(r13, r12, 6, 2);
 8285     __ strh(r14, __ post(dst, 2));
 8286     __ strb(r13, __ post(dst, 1));
 8287     // non-simd loop
 8288     __ subsw(rscratch1, rscratch1, 4);
 8289     __ br(Assembler::GT, Process4B);
 8290 
  8291     // if exiting from the 80-byte pre-processing above (rscratch1 started
  8292     // at 79), rscratch1 == -1; otherwise, rscratch1 == 0.
 8293     __ cbzw(rscratch1, Exit);
 8294     __ sub(length, length, 80);
 8295 
 8296     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
 8297     __ cbz(isURL, SIMDEnter);
 8298     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
 8299 
 8300     __ BIND(SIMDEnter);
 8301     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
 8302     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
 8303     __ mov(rscratch1, 63);
 8304     __ dup(v27, __ T16B, rscratch1);
 8305 
 8306     __ BIND(Process64B);
 8307     __ cmp(length, (u1)64);
 8308     __ br(Assembler::LT, Process32B);
 8309     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
 8310     __ sub(length, length, 64);
 8311     __ b(Process64B);
 8312 
 8313     __ BIND(Process32B);
 8314     __ cmp(length, (u1)32);
 8315     __ br(Assembler::LT, SIMDExit);
 8316     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
 8317     __ sub(length, length, 32);
 8318     __ b(Process32B);
 8319 
 8320     __ BIND(SIMDExit);
 8321     __ cbz(length, Exit);
 8322     __ movw(rscratch1, length);
 8323     __ b(Process4B);
 8324 
 8325     __ BIND(Exit);
 8326     __ sub(c_rarg0, dst, doff);
 8327 
 8328     __ leave();
 8329     __ ret(lr);
 8330 
 8331     return start;
 8332   }
 8333 
 8334   // Support for spin waits.
 8335   address generate_spin_wait() {
 8336     __ align(CodeEntryAlignment);
 8337     StubGenStubId stub_id = StubGenStubId::spin_wait_id;
 8338     StubCodeMark mark(this, stub_id);
 8339     address start = __ pc();
 8340 
 8341     __ spin_wait();
 8342     __ ret(lr);
 8343 
 8344     return start;
 8345   }
 8346 
 8347   void generate_lookup_secondary_supers_table_stub() {
 8348     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
 8349     StubCodeMark mark(this, stub_id);
 8350 
 8351     const Register
 8352       r_super_klass  = r0,
 8353       r_array_base   = r1,
 8354       r_array_length = r2,
 8355       r_array_index  = r3,
 8356       r_sub_klass    = r4,
 8357       r_bitmap       = rscratch2,
 8358       result         = r5;
 8359     const FloatRegister
 8360       vtemp          = v0;
 8361 
 8362     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
 8363       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
 8364       Label L_success;
 8365       __ enter();
 8366       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
 8367                                              r_array_base, r_array_length, r_array_index,
 8368                                              vtemp, result, slot,
 8369                                              /*stub_is_near*/true);
 8370       __ leave();
 8371       __ ret(lr);
 8372     }
 8373   }
 8374 
 8375   // Slow path implementation for UseSecondarySupersTable.
 8376   address generate_lookup_secondary_supers_table_slow_path_stub() {
 8377     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
 8378     StubCodeMark mark(this, stub_id);
 8379 
 8380     address start = __ pc();
 8381     const Register
 8382       r_super_klass  = r0,        // argument
 8383       r_array_base   = r1,        // argument
 8384       temp1          = r2,        // temp
 8385       r_array_index  = r3,        // argument
 8386       r_bitmap       = rscratch2, // argument
 8387       result         = r5;        // argument
 8388 
 8389     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
 8390     __ ret(lr);
 8391 
 8392     return start;
 8393   }
 8394 
 8395 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 8396 
 8397   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
 8398   //
 8399   // If LSE is in use, generate LSE versions of all the stubs. The
 8400   // non-LSE versions are in atomic_aarch64.S.
 8401 
 8402   // class AtomicStubMark records the entry point of a stub and the
 8403   // stub pointer which will point to it. The stub pointer is set to
 8404   // the entry point when ~AtomicStubMark() is called, which must be
 8405   // after ICache::invalidate_range. This ensures safe publication of
 8406   // the generated code.
 8407   class AtomicStubMark {
 8408     address _entry_point;
 8409     aarch64_atomic_stub_t *_stub;
 8410     MacroAssembler *_masm;
 8411   public:
 8412     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
 8413       _masm = masm;
 8414       __ align(32);
 8415       _entry_point = __ pc();
 8416       _stub = stub;
 8417     }
 8418     ~AtomicStubMark() {
 8419       *_stub = (aarch64_atomic_stub_t)_entry_point;
 8420     }
 8421   };
 8422 
 8423   // NB: For memory_order_conservative we need a trailing membar after
 8424   // LSE atomic operations but not a leading membar.
 8425   //
 8426   // We don't need a leading membar because a clause in the Arm ARM
 8427   // says:
 8428   //
 8429   //   Barrier-ordered-before
 8430   //
 8431   //   Barrier instructions order prior Memory effects before subsequent
 8432   //   Memory effects generated by the same Observer. A read or a write
  8433   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  8434   //   Observer if and only if RW1 appears in program order before RW2
  8435   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
 8436   //   instruction with both Acquire and Release semantics.
 8437   //
 8438   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
 8439   // and Release semantics, therefore we don't need a leading
 8440   // barrier. However, there is no corresponding Barrier-ordered-after
 8441   // relationship, therefore we need a trailing membar to prevent a
 8442   // later store or load from being reordered with the store in an
 8443   // atomic instruction.
 8444   //
 8445   // This was checked by using the herd7 consistency model simulator
 8446   // (http://diy.inria.fr/) with this test case:
 8447   //
 8448   // AArch64 LseCas
 8449   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
 8450   // P0 | P1;
 8451   // LDR W4, [X2] | MOV W3, #0;
 8452   // DMB LD       | MOV W4, #1;
 8453   // LDR W3, [X1] | CASAL W3, W4, [X1];
 8454   //              | DMB ISH;
 8455   //              | STR W4, [X2];
 8456   // exists
 8457   // (0:X3=0 /\ 0:X4=1)
 8458   //
 8459   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
 8460   // with the store to x in P1. Without the DMB in P1 this may happen.
 8461   //
 8462   // At the time of writing we don't know of any AArch64 hardware that
 8463   // reorders stores in this way, but the Reference Manual permits it.
 8464 
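         // In C, approximately, each CAS entry below implements (the stub's C-level
         // typedef is declared in atomic_aarch64.hpp; this is only a sketch of the
         // semantics, with the argument names taken from the registers used below):
         //
         //   prev = *ptr;
         //   if (prev == compare_val) *ptr = exchange_val;   // performed atomically by CAS/CASAL
         //   return prev;                                    // returned in r0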
 8465   void gen_cas_entry(Assembler::operand_size size,
 8466                      atomic_memory_order order) {
 8467     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
 8468       exchange_val = c_rarg2;
 8469     bool acquire, release;
 8470     switch (order) {
 8471       case memory_order_relaxed:
 8472         acquire = false;
 8473         release = false;
 8474         break;
 8475       case memory_order_release:
 8476         acquire = false;
 8477         release = true;
 8478         break;
 8479       default:
 8480         acquire = true;
 8481         release = true;
 8482         break;
 8483     }
 8484     __ mov(prev, compare_val);
 8485     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
 8486     if (order == memory_order_conservative) {
 8487       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8488     }
 8489     if (size == Assembler::xword) {
 8490       __ mov(r0, prev);
 8491     } else {
 8492       __ movw(r0, prev);
 8493     }
 8494     __ ret(lr);
 8495   }
 8496 
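         // In C, approximately, the fetch-and-add semantics implemented below:
         //
         //   prev = *addr;
         //   *addr = prev + incr;    // performed atomically by LDADD/LDADDAL
         //   return prev;            // returned in r0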
 8497   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
 8498     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 8499     // If not relaxed, then default to conservative.  Relaxed is the only
 8500     // case we use enough to be worth specializing.
 8501     if (order == memory_order_relaxed) {
 8502       __ ldadd(size, incr, prev, addr);
 8503     } else {
 8504       __ ldaddal(size, incr, prev, addr);
 8505       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8506     }
 8507     if (size == Assembler::xword) {
 8508       __ mov(r0, prev);
 8509     } else {
 8510       __ movw(r0, prev);
 8511     }
 8512     __ ret(lr);
 8513   }
 8514 
 8515   void gen_swpal_entry(Assembler::operand_size size) {
 8516     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 8517     __ swpal(size, incr, prev, addr);
 8518     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8519     if (size == Assembler::xword) {
 8520       __ mov(r0, prev);
 8521     } else {
 8522       __ movw(r0, prev);
 8523     }
 8524     __ ret(lr);
 8525   }
 8526 
 8527   void generate_atomic_entry_points() {
 8528     if (! UseLSE) {
 8529       return;
 8530     }
 8531     __ align(CodeEntryAlignment);
 8532     StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id;
 8533     StubCodeMark mark(this, stub_id);
 8534     address first_entry = __ pc();
 8535 
 8536     // ADD, memory_order_conservative
 8537     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
 8538     gen_ldadd_entry(Assembler::word, memory_order_conservative);
 8539     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
 8540     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
 8541 
 8542     // ADD, memory_order_relaxed
 8543     AtomicStubMark mark_fetch_add_4_relaxed
 8544       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
 8545     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
 8546     AtomicStubMark mark_fetch_add_8_relaxed
 8547       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
 8548     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
 8549 
 8550     // XCHG, memory_order_conservative
 8551     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
 8552     gen_swpal_entry(Assembler::word);
 8553     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
 8554     gen_swpal_entry(Assembler::xword);
 8555 
 8556     // CAS, memory_order_conservative
 8557     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
 8558     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
 8559     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
 8560     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
 8561     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
 8562     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
 8563 
 8564     // CAS, memory_order_relaxed
 8565     AtomicStubMark mark_cmpxchg_1_relaxed
 8566       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
 8567     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
 8568     AtomicStubMark mark_cmpxchg_4_relaxed
 8569       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
 8570     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
 8571     AtomicStubMark mark_cmpxchg_8_relaxed
 8572       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
 8573     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
 8574 
 8575     AtomicStubMark mark_cmpxchg_4_release
 8576       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
 8577     gen_cas_entry(MacroAssembler::word, memory_order_release);
 8578     AtomicStubMark mark_cmpxchg_8_release
 8579       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
 8580     gen_cas_entry(MacroAssembler::xword, memory_order_release);
 8581 
 8582     AtomicStubMark mark_cmpxchg_4_seq_cst
 8583       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
 8584     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
 8585     AtomicStubMark mark_cmpxchg_8_seq_cst
 8586       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
 8587     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
 8588 
 8589     ICache::invalidate_range(first_entry, __ pc() - first_entry);
 8590   }
  8591 #endif // LINUX && !__ARM_FEATURE_ATOMICS
 8592 
 8593   address generate_cont_thaw(Continuation::thaw_kind kind) {
 8594     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
 8595     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
 8596 
 8597     address start = __ pc();
 8598 
 8599     if (return_barrier) {
 8600       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
 8601       __ mov(sp, rscratch1);
 8602     }
 8603     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
 8604 
 8605     if (return_barrier) {
 8606       // preserve possible return value from a method returning to the return barrier
 8607       __ fmovd(rscratch1, v0);
 8608       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
 8609     }
 8610 
 8611     __ movw(c_rarg1, (return_barrier ? 1 : 0));
 8612     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
 8613     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
 8614 
 8615     if (return_barrier) {
 8616       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
 8617       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
 8618       __ fmovd(v0, rscratch1);
 8619     }
 8620     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
 8621 
 8622 
 8623     Label thaw_success;
 8624     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
 8625     __ cbnz(rscratch2, thaw_success);
 8626     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
 8627     __ br(rscratch1);
 8628     __ bind(thaw_success);
 8629 
 8630     // make room for the thawed frames
 8631     __ sub(rscratch1, sp, rscratch2);
 8632     __ andr(rscratch1, rscratch1, -16); // align
 8633     __ mov(sp, rscratch1);
 8634 
 8635     if (return_barrier) {
 8636       // save original return value -- again
 8637       __ fmovd(rscratch1, v0);
 8638       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
 8639     }
 8640 
 8641     // If we want, we can templatize thaw by kind, and have three different entries
 8642     __ movw(c_rarg1, (uint32_t)kind);
 8643 
 8644     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
 8645     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
 8646 
 8647     if (return_barrier) {
 8648       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
 8649       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
 8650       __ fmovd(v0, rscratch1);
 8651     } else {
 8652       __ mov(r0, zr); // return 0 (success) from doYield
 8653     }
 8654 
  8655     // we're now on the yield frame (which is at an address above us because sp has been pushed down)
 8656     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
 8657     __ mov(rfp, sp);
 8658 
 8659     if (return_barrier_exception) {
 8660       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
 8661       __ authenticate_return_address(c_rarg1);
 8662       __ verify_oop(r0);
 8663       // save return value containing the exception oop in callee-saved R19
 8664       __ mov(r19, r0);
 8665 
 8666       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
 8667 
 8668       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
 8669       // __ reinitialize_ptrue();
 8670 
 8671       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
 8672 
 8673       __ mov(r1, r0); // the exception handler
 8674       __ mov(r0, r19); // restore return value containing the exception oop
 8675       __ verify_oop(r0);
 8676 
 8677       __ leave();
 8678       __ mov(r3, lr);
 8679       __ br(r1); // the exception handler
 8680     } else {
 8681       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
 8682       __ leave();
 8683       __ ret(lr);
 8684     }
 8685 
 8686     return start;
 8687   }
 8688 
 8689   address generate_cont_thaw() {
 8690     if (!Continuations::enabled()) return nullptr;
 8691 
 8692     StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
 8693     StubCodeMark mark(this, stub_id);
 8694     address start = __ pc();
 8695     generate_cont_thaw(Continuation::thaw_top);
 8696     return start;
 8697   }
 8698 
 8699   address generate_cont_returnBarrier() {
 8700     if (!Continuations::enabled()) return nullptr;
 8701 
 8702     // TODO: will probably need multiple return barriers depending on return type
 8703     StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
 8704     StubCodeMark mark(this, stub_id);
 8705     address start = __ pc();
 8706 
 8707     generate_cont_thaw(Continuation::thaw_return_barrier);
 8708 
 8709     return start;
 8710   }
 8711 
 8712   address generate_cont_returnBarrier_exception() {
 8713     if (!Continuations::enabled()) return nullptr;
 8714 
 8715     StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
 8716     StubCodeMark mark(this, stub_id);
 8717     address start = __ pc();
 8718 
 8719     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
 8720 
 8721     return start;
 8722   }
 8723 
 8724   address generate_cont_preempt_stub() {
 8725     if (!Continuations::enabled()) return nullptr;
 8726     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
 8727     StubCodeMark mark(this, stub_id);
 8728     address start = __ pc();
 8729 
 8730     __ reset_last_Java_frame(true);
 8731 
 8732     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
 8733     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
 8734     __ mov(sp, rscratch2);
 8735 
 8736     Label preemption_cancelled;
 8737     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
 8738     __ cbnz(rscratch1, preemption_cancelled);
 8739 
 8740     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
 8741     SharedRuntime::continuation_enter_cleanup(_masm);
 8742     __ leave();
 8743     __ ret(lr);
 8744 
 8745     // We acquired the monitor after freezing the frames so call thaw to continue execution.
 8746     __ bind(preemption_cancelled);
 8747     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
 8748     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
 8749     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
 8750     __ ldr(rscratch1, Address(rscratch1));
 8751     __ br(rscratch1);
 8752 
 8753     return start;
 8754   }
 8755 
 8756   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
 8757   // are represented as long[5], with BITS_PER_LIMB = 26.
 8758   // Pack five 26-bit limbs into three 64-bit registers.
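         // In C, approximately (with a notional wide unsigned integer type):
         //
         //   wide result = (wide)limbs[0]       | (wide)limbs[1] << 26
         //               | (wide)limbs[2] << 52 | (wide)limbs[3] << 78
         //               | (wide)limbs[4] << 104;
         //   // dest0 = bits [63:0], dest1 = bits [127:64], dest2 = bits [191:128]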
 8759   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
 8760     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
 8761     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
 8762     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
 8763     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
 8764 
 8765     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
 8766     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
 8767     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
 8768     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
 8769 
 8770     if (dest2->is_valid()) {
 8771       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
 8772     } else {
 8773 #ifdef ASSERT
 8774       Label OK;
 8775       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
 8776       __ br(__ EQ, OK);
 8777       __ stop("high bits of Poly1305 integer should be zero");
 8778       __ should_not_reach_here();
 8779       __ bind(OK);
 8780 #endif
 8781     }
 8782   }
 8783 
 8784   // As above, but return only a 128-bit integer, packed into two
 8785   // 64-bit registers.
 8786   void pack_26(Register dest0, Register dest1, Register src) {
 8787     pack_26(dest0, dest1, noreg, src);
 8788   }
 8789 
 8790   // Multiply and multiply-accumulate unsigned 64-bit registers.
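         // In C, approximately:
         //   wide_mul:  (prod_hi:prod_lo)  = (unsigned __int128)n * m
         //   wide_madd: (sum_hi:sum_lo)   += (unsigned __int128)n * m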
 8791   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
 8792     __ mul(prod_lo, n, m);
 8793     __ umulh(prod_hi, n, m);
 8794   }
 8795   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
 8796     wide_mul(rscratch1, rscratch2, n, m);
 8797     __ adds(sum_lo, sum_lo, rscratch1);
 8798     __ adc(sum_hi, sum_hi, rscratch2);
 8799   }
 8800 
 8801   // Poly1305, RFC 7539
 8802 
 8803   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
 8804   // description of the tricks used to simplify and accelerate this
 8805   // computation.
 8806 
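         // In C, approximately, one 16-byte block updates the accumulator as follows
         // (illustrative only; "wide" is a notional >= 131-bit unsigned type, and the
         // loop below keeps the value only partially reduced between blocks):
         //
         //   acc += le_bytes_to_uint(block) + ((wide)1 << 128);   // the pad bit
         //   acc  = (acc * r) % (((wide)1 << 130) - 5);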
 8807   address generate_poly1305_processBlocks() {
 8808     __ align(CodeEntryAlignment);
 8809     StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
 8810     StubCodeMark mark(this, stub_id);
 8811     address start = __ pc();
 8812     Label here;
 8813     __ enter();
 8814     RegSet callee_saved = RegSet::range(r19, r28);
 8815     __ push(callee_saved, sp);
 8816 
 8817     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
 8818 
 8819     // Arguments
 8820     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
 8821 
 8822     // R_n is the 128-bit randomly-generated key, packed into two
 8823     // registers.  The caller passes this key to us as long[5], with
 8824     // BITS_PER_LIMB = 26.
 8825     const Register R_0 = *++regs, R_1 = *++regs;
 8826     pack_26(R_0, R_1, r_start);
 8827 
 8828     // RR_n is (R_n >> 2) * 5
 8829     const Register RR_0 = *++regs, RR_1 = *++regs;
 8830     __ lsr(RR_0, R_0, 2);
 8831     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
 8832     __ lsr(RR_1, R_1, 2);
 8833     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
 8834 
 8835     // U_n is the current checksum
 8836     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
 8837     pack_26(U_0, U_1, U_2, acc_start);
 8838 
 8839     static constexpr int BLOCK_LENGTH = 16;
 8840     Label DONE, LOOP;
 8841 
 8842     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
 8843     __ br(Assembler::LT, DONE); {
 8844       __ bind(LOOP);
 8845 
 8846       // S_n is to be the sum of U_n and the next block of data
 8847       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
 8848       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
 8849       __ adds(S_0, U_0, S_0);
 8850       __ adcs(S_1, U_1, S_1);
 8851       __ adc(S_2, U_2, zr);
 8852       __ add(S_2, S_2, 1);
 8853 
 8854       const Register U_0HI = *++regs, U_1HI = *++regs;
 8855 
 8856       // NB: this logic depends on some of the special properties of
 8857       // Poly1305 keys. In particular, because we know that the top
 8858       // four bits of R_0 and R_1 are zero, we can add together
 8859       // partial products without any risk of needing to propagate a
 8860       // carry out.
 8861       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
 8862       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
 8863       __ andr(U_2, R_0, 3);
 8864       __ mul(U_2, S_2, U_2);
 8865 
 8866       // Recycle registers S_0, S_1, S_2
 8867       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
 8868 
 8869       // Partial reduction mod 2**130 - 5
 8870       __ adds(U_1, U_0HI, U_1);
 8871       __ adc(U_2, U_1HI, U_2);
 8872       // Sum now in U_2:U_1:U_0.
 8873       // Dead: U_0HI, U_1HI.
 8874       regs = (regs.remaining() + U_0HI + U_1HI).begin();
 8875 
 8876       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
 8877 
 8878       // First, U_2:U_1:U_0 += (U_2 >> 2)
 8879       __ lsr(rscratch1, U_2, 2);
 8880       __ andr(U_2, U_2, (u8)3);
 8881       __ adds(U_0, U_0, rscratch1);
 8882       __ adcs(U_1, U_1, zr);
 8883       __ adc(U_2, U_2, zr);
 8884       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
 8885       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
 8886       __ adcs(U_1, U_1, zr);
 8887       __ adc(U_2, U_2, zr);
 8888 
 8889       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
 8890       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
 8891       __ br(~ Assembler::LT, LOOP);
 8892     }
 8893 
 8894     // Further reduce modulo 2^130 - 5
 8895     __ lsr(rscratch1, U_2, 2);
 8896     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
 8897     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
 8898     __ adcs(U_1, U_1, zr);
 8899     __ andr(U_2, U_2, (u1)3);
 8900     __ adc(U_2, U_2, zr);
 8901 
 8902     // Unpack the sum into five 26-bit limbs and write to memory.
 8903     __ ubfiz(rscratch1, U_0, 0, 26);
 8904     __ ubfx(rscratch2, U_0, 26, 26);
 8905     __ stp(rscratch1, rscratch2, Address(acc_start));
 8906     __ ubfx(rscratch1, U_0, 52, 12);
 8907     __ bfi(rscratch1, U_1, 12, 14);
 8908     __ ubfx(rscratch2, U_1, 14, 26);
 8909     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
 8910     __ ubfx(rscratch1, U_1, 40, 24);
 8911     __ bfi(rscratch1, U_2, 24, 3);
 8912     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
 8913 
 8914     __ bind(DONE);
 8915     __ pop(callee_saved, sp);
 8916     __ leave();
 8917     __ ret(lr);
 8918 
 8919     return start;
 8920   }
 8921 
 8922   // exception handler for upcall stubs
 8923   address generate_upcall_stub_exception_handler() {
 8924     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
 8925     StubCodeMark mark(this, stub_id);
 8926     address start = __ pc();
 8927 
 8928     // Native caller has no idea how to handle exceptions,
 8929     // so we just crash here. Up to callee to catch exceptions.
 8930     __ verify_oop(r0);
 8931     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
 8932     __ blr(rscratch1);
 8933     __ should_not_reach_here();
 8934 
 8935     return start;
 8936   }
 8937 
 8938   // load Method* target of MethodHandle
 8939   // j_rarg0 = jobject receiver
 8940   // rmethod = result
 8941   address generate_upcall_stub_load_target() {
 8942     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
 8943     StubCodeMark mark(this, stub_id);
 8944     address start = __ pc();
 8945 
 8946     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
  8947     // Load target method from receiver
 8948     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
 8949     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
 8950     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
 8951     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
 8952                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
 8953                       noreg, noreg);
 8954     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
 8955 
 8956     __ ret(lr);
 8957 
 8958     return start;
 8959   }
 8960 
 8961 #undef __
 8962 #define __ masm->
 8963 
 8964   class MontgomeryMultiplyGenerator : public MacroAssembler {
 8965 
 8966     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
 8967       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
 8968 
 8969     RegSet _toSave;
 8970     bool _squaring;
 8971 
 8972   public:
 8973     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
 8974       : MacroAssembler(as->code()), _squaring(squaring) {
 8975 
 8976       // Register allocation
 8977 
 8978       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
 8979       Pa_base = *regs;       // Argument registers
 8980       if (squaring)
 8981         Pb_base = Pa_base;
 8982       else
 8983         Pb_base = *++regs;
 8984       Pn_base = *++regs;
 8985       Rlen= *++regs;
 8986       inv = *++regs;
 8987       Pm_base = *++regs;
 8988 
 8989                           // Working registers:
 8990       Ra =  *++regs;        // The current digit of a, b, n, and m.
 8991       Rb =  *++regs;
 8992       Rm =  *++regs;
 8993       Rn =  *++regs;
 8994 
 8995       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
 8996       Pb =  *++regs;
 8997       Pm =  *++regs;
 8998       Pn =  *++regs;
 8999 
 9000       t0 =  *++regs;        // Three registers which form a
  9001       t1 =  *++regs;        // triple-precision accumulator.
 9002       t2 =  *++regs;
 9003 
 9004       Ri =  *++regs;        // Inner and outer loop indexes.
 9005       Rj =  *++regs;
 9006 
 9007       Rhi_ab = *++regs;     // Product registers: low and high parts
 9008       Rlo_ab = *++regs;     // of a*b and m*n.
 9009       Rhi_mn = *++regs;
 9010       Rlo_mn = *++regs;
 9011 
 9012       // r19 and up are callee-saved.
 9013       _toSave = RegSet::range(r19, *regs) + Pm_base;
 9014     }
 9015 
 9016   private:
 9017     void save_regs() {
 9018       push(_toSave, sp);
 9019     }
 9020 
 9021     void restore_regs() {
 9022       pop(_toSave, sp);
 9023     }
 9024 
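           // Invoke `block` exactly `count` times (count may be zero), emitting two
           // copies of the block per loop iteration; an odd count enters the loop at
           // the second copy.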
 9025     template <typename T>
 9026     void unroll_2(Register count, T block) {
 9027       Label loop, end, odd;
 9028       tbnz(count, 0, odd);
 9029       cbz(count, end);
 9030       align(16);
 9031       bind(loop);
 9032       (this->*block)();
 9033       bind(odd);
 9034       (this->*block)();
 9035       subs(count, count, 2);
 9036       br(Assembler::GT, loop);
 9037       bind(end);
 9038     }
 9039 
 9040     template <typename T>
 9041     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
 9042       Label loop, end, odd;
 9043       tbnz(count, 0, odd);
 9044       cbz(count, end);
 9045       align(16);
 9046       bind(loop);
 9047       (this->*block)(d, s, tmp);
 9048       bind(odd);
 9049       (this->*block)(d, s, tmp);
 9050       subs(count, count, 2);
 9051       br(Assembler::GT, loop);
 9052       bind(end);
 9053     }
 9054 
 9055     void pre1(RegisterOrConstant i) {
 9056       block_comment("pre1");
 9057       // Pa = Pa_base;
 9058       // Pb = Pb_base + i;
 9059       // Pm = Pm_base;
 9060       // Pn = Pn_base + i;
 9061       // Ra = *Pa;
 9062       // Rb = *Pb;
 9063       // Rm = *Pm;
 9064       // Rn = *Pn;
 9065       ldr(Ra, Address(Pa_base));
 9066       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
 9067       ldr(Rm, Address(Pm_base));
 9068       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9069       lea(Pa, Address(Pa_base));
 9070       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
 9071       lea(Pm, Address(Pm_base));
 9072       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9073 
 9074       // Zero the m*n result.
 9075       mov(Rhi_mn, zr);
 9076       mov(Rlo_mn, zr);
 9077     }
 9078 
 9079     // The core multiply-accumulate step of a Montgomery
 9080     // multiplication.  The idea is to schedule operations as a
 9081     // pipeline so that instructions with long latencies (loads and
 9082     // multiplies) have time to complete before their results are
 9083     // used.  This most benefits in-order implementations of the
 9084     // architecture but out-of-order ones also benefit.
 9085     void step() {
 9086       block_comment("step");
 9087       // MACC(Ra, Rb, t0, t1, t2);
 9088       // Ra = *++Pa;
 9089       // Rb = *--Pb;
 9090       umulh(Rhi_ab, Ra, Rb);
 9091       mul(Rlo_ab, Ra, Rb);
 9092       ldr(Ra, pre(Pa, wordSize));
 9093       ldr(Rb, pre(Pb, -wordSize));
 9094       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
 9095                                        // previous iteration.
 9096       // MACC(Rm, Rn, t0, t1, t2);
 9097       // Rm = *++Pm;
 9098       // Rn = *--Pn;
 9099       umulh(Rhi_mn, Rm, Rn);
 9100       mul(Rlo_mn, Rm, Rn);
 9101       ldr(Rm, pre(Pm, wordSize));
 9102       ldr(Rn, pre(Pn, -wordSize));
 9103       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9104     }
 9105 
 9106     void post1() {
 9107       block_comment("post1");
 9108 
 9109       // MACC(Ra, Rb, t0, t1, t2);
 9110       // Ra = *++Pa;
 9111       // Rb = *--Pb;
 9112       umulh(Rhi_ab, Ra, Rb);
 9113       mul(Rlo_ab, Ra, Rb);
 9114       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9115       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9116 
 9117       // *Pm = Rm = t0 * inv;
 9118       mul(Rm, t0, inv);
 9119       str(Rm, Address(Pm));
 9120 
 9121       // MACC(Rm, Rn, t0, t1, t2);
 9122       // t0 = t1; t1 = t2; t2 = 0;
 9123       umulh(Rhi_mn, Rm, Rn);
 9124 
 9125 #ifndef PRODUCT
 9126       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
 9127       {
 9128         mul(Rlo_mn, Rm, Rn);
 9129         add(Rlo_mn, t0, Rlo_mn);
 9130         Label ok;
 9131         cbz(Rlo_mn, ok); {
 9132           stop("broken Montgomery multiply");
 9133         } bind(ok);
 9134       }
 9135 #endif
 9136       // We have very carefully set things up so that
 9137       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
 9138       // the lower half of Rm * Rn because we know the result already:
 9139       // it must be -t0.  t0 + (-t0) must generate a carry iff
 9140       // t0 != 0.  So, rather than do a mul and an adds we just set
 9141       // the carry flag iff t0 is nonzero.
 9142       //
 9143       // mul(Rlo_mn, Rm, Rn);
 9144       // adds(zr, t0, Rlo_mn);
 9145       subs(zr, t0, 1); // Set carry iff t0 is nonzero
 9146       adcs(t0, t1, Rhi_mn);
 9147       adc(t1, t2, zr);
 9148       mov(t2, zr);
 9149     }
 9150 
 9151     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
 9152       block_comment("pre2");
 9153       // Pa = Pa_base + i-len;
 9154       // Pb = Pb_base + len;
 9155       // Pm = Pm_base + i-len;
 9156       // Pn = Pn_base + len;
 9157 
 9158       if (i.is_register()) {
 9159         sub(Rj, i.as_register(), len);
 9160       } else {
 9161         mov(Rj, i.as_constant());
 9162         sub(Rj, Rj, len);
 9163       }
 9164       // Rj == i-len
 9165 
 9166       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
 9167       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
 9168       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
 9169       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
 9170 
 9171       // Ra = *++Pa;
 9172       // Rb = *--Pb;
 9173       // Rm = *++Pm;
 9174       // Rn = *--Pn;
 9175       ldr(Ra, pre(Pa, wordSize));
 9176       ldr(Rb, pre(Pb, -wordSize));
 9177       ldr(Rm, pre(Pm, wordSize));
 9178       ldr(Rn, pre(Pn, -wordSize));
 9179 
 9180       mov(Rhi_mn, zr);
 9181       mov(Rlo_mn, zr);
 9182     }
 9183 
 9184     void post2(RegisterOrConstant i, RegisterOrConstant len) {
 9185       block_comment("post2");
 9186       if (i.is_constant()) {
 9187         mov(Rj, i.as_constant()-len.as_constant());
 9188       } else {
 9189         sub(Rj, i.as_register(), len);
 9190       }
 9191 
 9192       adds(t0, t0, Rlo_mn); // The pending m*n, low part
 9193 
 9194       // As soon as we know the least significant digit of our result,
 9195       // store it.
 9196       // Pm_base[i-len] = t0;
 9197       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
 9198 
 9199       // t0 = t1; t1 = t2; t2 = 0;
 9200       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
 9201       adc(t1, t2, zr);
 9202       mov(t2, zr);
 9203     }
 9204 
 9205     // A carry in t0 after Montgomery multiplication means that we
 9206     // should subtract multiples of n from our result in m.  We'll
 9207     // keep doing that until there is no carry.
 9208     void normalize(RegisterOrConstant len) {
 9209       block_comment("normalize");
 9210       // while (t0)
 9211       //   t0 = sub(Pm_base, Pn_base, t0, len);
 9212       Label loop, post, again;
 9213       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
 9214       cbz(t0, post); {
 9215         bind(again); {
 9216           mov(i, zr);
 9217           mov(cnt, len);
 9218           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9219           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9220           subs(zr, zr, zr); // set carry flag, i.e. no borrow
 9221           align(16);
 9222           bind(loop); {
 9223             sbcs(Rm, Rm, Rn);
 9224             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9225             add(i, i, 1);
 9226             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9227             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9228             sub(cnt, cnt, 1);
 9229           } cbnz(cnt, loop);
 9230           sbc(t0, t0, zr);
 9231         } cbnz(t0, again);
 9232       } bind(post);
 9233     }
 9234 
 9235     // Move memory at s to d, reversing words.
 9236     //    Increments d to end of copied memory
 9237     //    Destroys tmp1, tmp2
 9238     //    Preserves len
 9239     //    Leaves s pointing to the address which was in d at start
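           //    In C, approximately (the data are 2*len 32-bit words; reversing the
           //    64-bit words and rotating each by 32 reverses the 32-bit elements):
           //      uint32_t *S = (uint32_t *)s, *D = (uint32_t *)d;
           //      for (int i = 0; i < 2*len; i++) D[i] = S[2*len - 1 - i];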
 9240     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
 9241       assert(tmp1->encoding() < r19->encoding(), "register corruption");
 9242       assert(tmp2->encoding() < r19->encoding(), "register corruption");
 9243 
 9244       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
 9245       mov(tmp1, len);
 9246       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
 9247       sub(s, d, len, ext::uxtw, LogBytesPerWord);
 9248     }
 9249     // where
 9250     void reverse1(Register d, Register s, Register tmp) {
 9251       ldr(tmp, pre(s, -wordSize));
 9252       ror(tmp, tmp, 32);
 9253       str(tmp, post(d, wordSize));
 9254     }
 9255 
 9256     void step_squaring() {
 9257       // An extra ACC
 9258       step();
 9259       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9260     }
 9261 
 9262     void last_squaring(RegisterOrConstant i) {
 9263       Label dont;
 9264       // if ((i & 1) == 0) {
 9265       tbnz(i.as_register(), 0, dont); {
 9266         // MACC(Ra, Rb, t0, t1, t2);
 9267         // Ra = *++Pa;
 9268         // Rb = *--Pb;
 9269         umulh(Rhi_ab, Ra, Rb);
 9270         mul(Rlo_ab, Ra, Rb);
 9271         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9272       } bind(dont);
 9273     }
 9274 
 9275     void extra_step_squaring() {
 9276       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9277 
 9278       // MACC(Rm, Rn, t0, t1, t2);
 9279       // Rm = *++Pm;
 9280       // Rn = *--Pn;
 9281       umulh(Rhi_mn, Rm, Rn);
 9282       mul(Rlo_mn, Rm, Rn);
 9283       ldr(Rm, pre(Pm, wordSize));
 9284       ldr(Rn, pre(Pn, -wordSize));
 9285     }
 9286 
 9287     void post1_squaring() {
 9288       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9289 
 9290       // *Pm = Rm = t0 * inv;
 9291       mul(Rm, t0, inv);
 9292       str(Rm, Address(Pm));
 9293 
 9294       // MACC(Rm, Rn, t0, t1, t2);
 9295       // t0 = t1; t1 = t2; t2 = 0;
 9296       umulh(Rhi_mn, Rm, Rn);
 9297 
 9298 #ifndef PRODUCT
 9299       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
 9300       {
 9301         mul(Rlo_mn, Rm, Rn);
 9302         add(Rlo_mn, t0, Rlo_mn);
 9303         Label ok;
 9304         cbz(Rlo_mn, ok); {
 9305           stop("broken Montgomery multiply");
 9306         } bind(ok);
 9307       }
 9308 #endif
 9309       // We have very carefully set things up so that
 9310       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
 9311       // the lower half of Rm * Rn because we know the result already:
 9312       // it must be -t0.  t0 + (-t0) must generate a carry iff
 9313       // t0 != 0.  So, rather than do a mul and an adds we just set
 9314       // the carry flag iff t0 is nonzero.
 9315       //
 9316       // mul(Rlo_mn, Rm, Rn);
 9317       // adds(zr, t0, Rlo_mn);
 9318       subs(zr, t0, 1); // Set carry iff t0 is nonzero
 9319       adcs(t0, t1, Rhi_mn);
 9320       adc(t1, t2, zr);
 9321       mov(t2, zr);
 9322     }
 9323 
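           // (t2:t1:t0) += (Rhi:Rlo), i.e. add a 128-bit product into the
           // triple-precision accumulator.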
 9324     void acc(Register Rhi, Register Rlo,
 9325              Register t0, Register t1, Register t2) {
 9326       adds(t0, t0, Rlo);
 9327       adcs(t1, t1, Rhi);
 9328       adc(t2, t2, zr);
 9329     }
 9330 
 9331   public:
 9332     /**
 9333      * Fast Montgomery multiplication.  The derivation of the
 9334      * algorithm is in A Cryptographic Library for the Motorola
 9335      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
 9336      *
 9337      * Arguments:
 9338      *
 9339      * Inputs for multiplication:
 9340      *   c_rarg0   - int array elements a
 9341      *   c_rarg1   - int array elements b
 9342      *   c_rarg2   - int array elements n (the modulus)
 9343      *   c_rarg3   - int length
 9344      *   c_rarg4   - int inv
 9345      *   c_rarg5   - int array elements m (the result)
 9346      *
 9347      * Inputs for squaring:
 9348      *   c_rarg0   - int array elements a
 9349      *   c_rarg1   - int array elements n (the modulus)
 9350      *   c_rarg2   - int length
 9351      *   c_rarg3   - int inv
 9352      *   c_rarg4   - int array elements m (the result)
 9353      *
 9354      */
 9355     address generate_multiply() {
 9356       Label argh, nothing;
 9357       bind(argh);
 9358       stop("MontgomeryMultiply total_allocation must be <= 8192");
 9359 
 9360       align(CodeEntryAlignment);
 9361       address entry = pc();
 9362 
 9363       cbzw(Rlen, nothing);
 9364 
 9365       enter();
 9366 
 9367       // Make room.
 9368       cmpw(Rlen, 512);
 9369       br(Assembler::HI, argh);
 9370       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
 9371       andr(sp, Ra, -2 * wordSize);
 9372 
 9373       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
 9374 
 9375       {
 9376         // Copy input args, reversing as we go.  We use Ra as a
 9377         // temporary variable.
 9378         reverse(Ra, Pa_base, Rlen, t0, t1);
 9379         if (!_squaring)
 9380           reverse(Ra, Pb_base, Rlen, t0, t1);
 9381         reverse(Ra, Pn_base, Rlen, t0, t1);
 9382       }
 9383 
 9384       // Push all call-saved registers and also Pm_base which we'll need
 9385       // at the end.
 9386       save_regs();
 9387 
 9388 #ifndef PRODUCT
 9389       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
 9390       {
 9391         ldr(Rn, Address(Pn_base, 0));
 9392         mul(Rlo_mn, Rn, inv);
 9393         subs(zr, Rlo_mn, -1);
 9394         Label ok;
 9395         br(EQ, ok); {
 9396           stop("broken inverse in Montgomery multiply");
 9397         } bind(ok);
 9398       }
 9399 #endif
 9400 
 9401       mov(Pm_base, Ra);
 9402 
 9403       mov(t0, zr);
 9404       mov(t1, zr);
 9405       mov(t2, zr);
 9406 
 9407       block_comment("for (int i = 0; i < len; i++) {");
 9408       mov(Ri, zr); {
 9409         Label loop, end;
 9410         cmpw(Ri, Rlen);
 9411         br(Assembler::GE, end);
 9412 
 9413         bind(loop);
 9414         pre1(Ri);
 9415 
 9416         block_comment("  for (j = i; j; j--) {"); {
 9417           movw(Rj, Ri);
 9418           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
 9419         } block_comment("  } // j");
 9420 
 9421         post1();
 9422         addw(Ri, Ri, 1);
 9423         cmpw(Ri, Rlen);
 9424         br(Assembler::LT, loop);
 9425         bind(end);
 9426         block_comment("} // i");
 9427       }
 9428 
 9429       block_comment("for (int i = len; i < 2*len; i++) {");
 9430       mov(Ri, Rlen); {
 9431         Label loop, end;
 9432         cmpw(Ri, Rlen, Assembler::LSL, 1);
 9433         br(Assembler::GE, end);
 9434 
 9435         bind(loop);
 9436         pre2(Ri, Rlen);
 9437 
 9438         block_comment("  for (j = len*2-i-1; j; j--) {"); {
 9439           lslw(Rj, Rlen, 1);
 9440           subw(Rj, Rj, Ri);
 9441           subw(Rj, Rj, 1);
 9442           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
 9443         } block_comment("  } // j");
 9444 
 9445         post2(Ri, Rlen);
 9446         addw(Ri, Ri, 1);
 9447         cmpw(Ri, Rlen, Assembler::LSL, 1);
 9448         br(Assembler::LT, loop);
 9449         bind(end);
 9450       }
 9451       block_comment("} // i");
 9452 
 9453       normalize(Rlen);
 9454 
 9455       mov(Ra, Pm_base);  // Save Pm_base in Ra
 9456       restore_regs();  // Restore caller's Pm_base
 9457 
 9458       // Copy our result into caller's Pm_base
 9459       reverse(Pm_base, Ra, Rlen, t0, t1);
 9460 
 9461       leave();
 9462       bind(nothing);
 9463       ret(lr);
 9464 
 9465       return entry;
 9466     }
 9467     // In C, approximately:
 9468 
 9469     // void
 9470     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
 9471     //                     julong Pn_base[], julong Pm_base[],
 9472     //                     julong inv, int len) {
 9473     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
 9474     //   julong *Pa, *Pb, *Pn, *Pm;
 9475     //   julong Ra, Rb, Rn, Rm;
 9476 
 9477     //   int i;
 9478 
 9479     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
 9480 
 9481     //   for (i = 0; i < len; i++) {
 9482     //     int j;
 9483 
 9484     //     Pa = Pa_base;
 9485     //     Pb = Pb_base + i;
 9486     //     Pm = Pm_base;
 9487     //     Pn = Pn_base + i;
 9488 
 9489     //     Ra = *Pa;
 9490     //     Rb = *Pb;
 9491     //     Rm = *Pm;
 9492     //     Rn = *Pn;
 9493 
 9494     //     int iters = i;
 9495     //     for (j = 0; iters--; j++) {
 9496     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
 9497     //       MACC(Ra, Rb, t0, t1, t2);
 9498     //       Ra = *++Pa;
 9499     //       Rb = *--Pb;
 9500     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9501     //       MACC(Rm, Rn, t0, t1, t2);
 9502     //       Rm = *++Pm;
 9503     //       Rn = *--Pn;
 9504     //     }
 9505 
 9506     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
 9507     //     MACC(Ra, Rb, t0, t1, t2);
 9508     //     *Pm = Rm = t0 * inv;
 9509     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
 9510     //     MACC(Rm, Rn, t0, t1, t2);
 9511 
 9512     //     assert(t0 == 0, "broken Montgomery multiply");
 9513 
 9514     //     t0 = t1; t1 = t2; t2 = 0;
 9515     //   }
 9516 
 9517     //   for (i = len; i < 2*len; i++) {
 9518     //     int j;
 9519 
 9520     //     Pa = Pa_base + i-len;
 9521     //     Pb = Pb_base + len;
 9522     //     Pm = Pm_base + i-len;
 9523     //     Pn = Pn_base + len;
 9524 
 9525     //     Ra = *++Pa;
 9526     //     Rb = *--Pb;
 9527     //     Rm = *++Pm;
 9528     //     Rn = *--Pn;
 9529 
 9530     //     int iters = len*2-i-1;
 9531     //     for (j = i-len+1; iters--; j++) {
 9532     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
 9533     //       MACC(Ra, Rb, t0, t1, t2);
 9534     //       Ra = *++Pa;
 9535     //       Rb = *--Pb;
 9536     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9537     //       MACC(Rm, Rn, t0, t1, t2);
 9538     //       Rm = *++Pm;
 9539     //       Rn = *--Pn;
 9540     //     }
 9541 
 9542     //     Pm_base[i-len] = t0;
 9543     //     t0 = t1; t1 = t2; t2 = 0;
 9544     //   }
 9545 
 9546     //   while (t0)
 9547     //     t0 = sub(Pm_base, Pn_base, t0, len);
 9548     // }
 9549 
 9550     /**
 9551      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
 9552      * multiplies than Montgomery multiplication so it should be up to
 9553      * 25% faster.  However, its loop control is more complex and it
 9554      * may actually run slower on some machines.
 9555      *
 9556      * Arguments:
 9557      *
 9558      * Inputs:
 9559      *   c_rarg0   - int array elements a
 9560      *   c_rarg1   - int array elements n (the modulus)
 9561      *   c_rarg2   - int length
 9562      *   c_rarg3   - int inv
 9563      *   c_rarg4   - int array elements m (the result)
 9564      *
 9565      */
 9566     address generate_square() {
 9567       Label argh;
 9568       bind(argh);
 9569       stop("MontgomeryMultiply total_allocation must be <= 8192");
 9570 
 9571       align(CodeEntryAlignment);
 9572       address entry = pc();
 9573 
 9574       enter();
 9575 
 9576       // Make room.
 9577       cmpw(Rlen, 512);
 9578       br(Assembler::HI, argh);
 9579       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
 9580       andr(sp, Ra, -2 * wordSize);
 9581 
 9582       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
 9583 
 9584       {
 9585         // Copy input args, reversing as we go.  We use Ra as a
 9586         // temporary variable.
 9587         reverse(Ra, Pa_base, Rlen, t0, t1);
 9588         reverse(Ra, Pn_base, Rlen, t0, t1);
 9589       }
 9590 
 9591       // Push all call-saved registers and also Pm_base which we'll need
 9592       // at the end.
 9593       save_regs();
 9594 
 9595       mov(Pm_base, Ra);
 9596 
 9597       mov(t0, zr);
 9598       mov(t1, zr);
 9599       mov(t2, zr);
 9600 
 9601       block_comment("for (int i = 0; i < len; i++) {");
 9602       mov(Ri, zr); {
 9603         Label loop, end;
 9604         bind(loop);
 9605         cmp(Ri, Rlen);
 9606         br(Assembler::GE, end);
 9607 
 9608         pre1(Ri);
 9609 
 9610         block_comment("for (j = (i+1)/2; j; j--) {"); {
 9611           add(Rj, Ri, 1);
 9612           lsr(Rj, Rj, 1);
 9613           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
 9614         } block_comment("  } // j");
 9615 
 9616         last_squaring(Ri);
 9617 
 9618         block_comment("  for (j = i/2; j; j--) {"); {
 9619           lsr(Rj, Ri, 1);
 9620           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
 9621         } block_comment("  } // j");
 9622 
 9623         post1_squaring();
 9624         add(Ri, Ri, 1);
 9625         cmp(Ri, Rlen);
 9626         br(Assembler::LT, loop);
 9627 
 9628         bind(end);
 9629         block_comment("} // i");
 9630       }
 9631 
 9632       block_comment("for (int i = len; i < 2*len; i++) {");
 9633       mov(Ri, Rlen); {
 9634         Label loop, end;
 9635         bind(loop);
 9636         cmp(Ri, Rlen, Assembler::LSL, 1);
 9637         br(Assembler::GE, end);
 9638 
 9639         pre2(Ri, Rlen);
 9640 
 9641         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
 9642           lsl(Rj, Rlen, 1);
 9643           sub(Rj, Rj, Ri);
 9644           sub(Rj, Rj, 1);
 9645           lsr(Rj, Rj, 1);
 9646           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
 9647         } block_comment("  } // j");
 9648 
 9649         last_squaring(Ri);
 9650 
 9651         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
 9652           lsl(Rj, Rlen, 1);
 9653           sub(Rj, Rj, Ri);
 9654           lsr(Rj, Rj, 1);
 9655           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
 9656         } block_comment("  } // j");
 9657 
 9658         post2(Ri, Rlen);
 9659         add(Ri, Ri, 1);
 9660         cmp(Ri, Rlen, Assembler::LSL, 1);
 9661 
 9662         br(Assembler::LT, loop);
 9663         bind(end);
 9664         block_comment("} // i");
 9665       }
 9666 
 9667       normalize(Rlen);
 9668 
 9669       mov(Ra, Pm_base);  // Save Pm_base in Ra
 9670       restore_regs();  // Restore caller's Pm_base
 9671 
 9672       // Copy our result into caller's Pm_base
 9673       reverse(Pm_base, Ra, Rlen, t0, t1);
 9674 
 9675       leave();
 9676       ret(lr);
 9677 
 9678       return entry;
 9679     }
 9680     // In C, approximately:
 9681 
 9682     // void
 9683     // montgomery_square(julong Pa_base[], julong Pn_base[],
 9684     //                   julong Pm_base[], julong inv, int len) {
 9685     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
 9686     //   julong *Pa, *Pb, *Pn, *Pm;
 9687     //   julong Ra, Rb, Rn, Rm;
 9688 
 9689     //   int i;
 9690 
 9691     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
 9692 
 9693     //   for (i = 0; i < len; i++) {
 9694     //     int j;
 9695 
 9696     //     Pa = Pa_base;
 9697     //     Pb = Pa_base + i;
 9698     //     Pm = Pm_base;
 9699     //     Pn = Pn_base + i;
 9700 
 9701     //     Ra = *Pa;
 9702     //     Rb = *Pb;
 9703     //     Rm = *Pm;
 9704     //     Rn = *Pn;
 9705 
 9706     //     int iters = (i+1)/2;
 9707     //     for (j = 0; iters--; j++) {
 9708     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
 9709     //       MACC2(Ra, Rb, t0, t1, t2);
 9710     //       Ra = *++Pa;
 9711     //       Rb = *--Pb;
 9712     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9713     //       MACC(Rm, Rn, t0, t1, t2);
 9714     //       Rm = *++Pm;
 9715     //       Rn = *--Pn;
 9716     //     }
 9717     //     if ((i & 1) == 0) {
 9718     //       assert(Ra == Pa_base[j], "must be");
 9719     //       MACC(Ra, Ra, t0, t1, t2);
 9720     //     }
 9721     //     iters = i/2;
 9722     //     assert(iters == i-j, "must be");
 9723     //     for (; iters--; j++) {
 9724     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9725     //       MACC(Rm, Rn, t0, t1, t2);
 9726     //       Rm = *++Pm;
 9727     //       Rn = *--Pn;
 9728     //     }
 9729 
 9730     //     *Pm = Rm = t0 * inv;
 9731     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
 9732     //     MACC(Rm, Rn, t0, t1, t2);
 9733 
 9734     //     assert(t0 == 0, "broken Montgomery multiply");
 9735 
 9736     //     t0 = t1; t1 = t2; t2 = 0;
 9737     //   }
 9738 
 9739     //   for (i = len; i < 2*len; i++) {
 9740     //     int start = i-len+1;
 9741     //     int end = start + (len - start)/2;
 9742     //     int j;
 9743 
 9744     //     Pa = Pa_base + i-len;
 9745     //     Pb = Pa_base + len;
 9746     //     Pm = Pm_base + i-len;
 9747     //     Pn = Pn_base + len;
 9748 
 9749     //     Ra = *++Pa;
 9750     //     Rb = *--Pb;
 9751     //     Rm = *++Pm;
 9752     //     Rn = *--Pn;
 9753 
 9754     //     int iters = (2*len-i-1)/2;
 9755     //     assert(iters == end-start, "must be");
 9756     //     for (j = start; iters--; j++) {
 9757     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
 9758     //       MACC2(Ra, Rb, t0, t1, t2);
 9759     //       Ra = *++Pa;
 9760     //       Rb = *--Pb;
 9761     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9762     //       MACC(Rm, Rn, t0, t1, t2);
 9763     //       Rm = *++Pm;
 9764     //       Rn = *--Pn;
 9765     //     }
 9766     //     if ((i & 1) == 0) {
 9767     //       assert(Ra == Pa_base[j], "must be");
 9768     //       MACC(Ra, Ra, t0, t1, t2);
 9769     //     }
 9770     //     iters =  (2*len-i)/2;
 9771     //     assert(iters == len-j, "must be");
 9772     //     for (; iters--; j++) {
 9773     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9774     //       MACC(Rm, Rn, t0, t1, t2);
 9775     //       Rm = *++Pm;
 9776     //       Rn = *--Pn;
 9777     //     }
 9778     //     Pm_base[i-len] = t0;
 9779     //     t0 = t1; t1 = t2; t2 = 0;
 9780     //   }
 9781 
 9782     //   while (t0)
 9783     //     t0 = sub(Pm_base, Pn_base, t0, len);
 9784     // }
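          // A rough sketch (assumed semantics, not necessarily the exact helper
          // used elsewhere) of the sub() routine called by the reference code
          // above: subtract the modulus from the result once, in place, and
          // fold the resulting borrow into the carry word t0.

          // static julong sub(julong Pm_base[], julong Pn_base[], julong t0, int len) {
          //   julong borrow = 0;
          //   for (int i = 0; i < len; i++) {
          //     julong pm = Pm_base[i], pn = Pn_base[i];
          //     Pm_base[i] = pm - pn - borrow;
          //     // Borrow out iff pm < pn + borrow, tested without overflow.
          //     borrow = (pm < pn) || (pm == pn && borrow);
          //   }
          //   return t0 - borrow;
          // }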
 9785   };
 9786 
 9787   void generate_vector_math_stubs() {
 9788     // Get native vector math stub routine addresses
 9789     void* libsleef = nullptr;
 9790     char ebuf[1024];
 9791     char dll_name[JVM_MAXPATHLEN];
 9792     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
 9793       libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
 9794     }
 9795     if (libsleef == nullptr) {
 9796       log_info(library)("Failed to load native vector math library, %s!", ebuf);
 9797       return;
 9798     }
 9799     // Method naming convention
 9800     //   All the methods are named as <OP><T><N>_<U><suffix>
 9801     //   Where:
 9802     //     <OP>     is the operation name, e.g. sin
 9803     //     <T>      is optional and indicates float/double
 9804     //              "f/d" for vector float/double operations
 9805     //     <N>      is the number of elements in the vector
 9806     //              "2/4" for neon, and "x" for sve
 9807     //     <U>      is the precision level
 9808     //              "u10/u05" represents 1.0/0.5 ULP error bounds
 9809     //               We use "u10" for all operations by default
 9810     //               But for those functions that do not have u10 support, we use "u05" instead
 9811     //     <suffix> indicates neon/sve
 9812     //              "sve/advsimd" for sve/neon implementations
 9813     //     e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions
 9814     //          cosd2_u10advsimd is the method for computing a 2-element vector double cos using NEON instructions
 9815     //
 9816     log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
 9817 
 9818     // Math vector stubs implemented with SVE for scalable vector size.
 9819     if (UseSVE > 0) {
 9820       for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
 9821         int vop = VectorSupport::VECTOR_OP_MATH_START + op;
 9822         // Skip "tanh" because there is a performance regression
 9823         if (vop == VectorSupport::VECTOR_OP_TANH) {
 9824           continue;
 9825         }
 9826 
 9827         // The native library does not support u10 level of "hypot".
 9828         const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
 9829 
 9830         snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
 9831         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
 9832 
 9833         snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
 9834         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
 9835       }
 9836     }
 9837 
 9838     // Math vector stubs implemented with NEON for 64/128 bits vector size.
 9839     for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
 9840       int vop = VectorSupport::VECTOR_OP_MATH_START + op;
 9841       // Skip "tanh" because there is a performance regression
 9842       if (vop == VectorSupport::VECTOR_OP_TANH) {
 9843         continue;
 9844       }
 9845 
 9846       // The native library does not support u10 level of "hypot".
 9847       const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
 9848 
 9849       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
 9850       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
 9851 
 9852       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
 9853       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
 9854 
 9855       snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
 9856       StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
 9857     }
 9858   }
 9859 
 9860   // Call here from the interpreter or compiled code either to load the
 9861   // multiple returned values of the inline type instance being returned
 9862   // into registers, or to store the returned values into a newly
 9863   // allocated inline type instance.
 9864   address generate_return_value_stub(address destination, const char* name, bool has_res) {
 9865     // We need to save all registers the calling convention may use so
 9866     // the runtime calls read or update those registers. This needs to
 9867     // that the runtime calls can read or update those registers. This needs to
 9868     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
 9869     enum layout {
 9870       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
 9871       j_rarg6_off, j_rarg6_2,
 9872       j_rarg5_off, j_rarg5_2,
 9873       j_rarg4_off, j_rarg4_2,
 9874       j_rarg3_off, j_rarg3_2,
 9875       j_rarg2_off, j_rarg2_2,
 9876       j_rarg1_off, j_rarg1_2,
 9877       j_rarg0_off, j_rarg0_2,
 9878 
 9879       j_farg7_off, j_farg7_2,
 9880       j_farg6_off, j_farg6_2,
 9881       j_farg5_off, j_farg5_2,
 9882       j_farg4_off, j_farg4_2,
 9883       j_farg3_off, j_farg3_2,
 9884       j_farg2_off, j_farg2_2,
 9885       j_farg1_off, j_farg1_2,
 9886       j_farg0_off, j_farg0_2,
 9887 
 9888       rfp_off, rfp_off2,
 9889       return_off, return_off2,
 9890 
 9891       framesize // inclusive of return address
 9892     };
 9893 
 9894     CodeBuffer code(name, 512, 64);
 9895     MacroAssembler* masm = new MacroAssembler(&code);
 9896 
 9897     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
 9898     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
 9899     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 9900     int frame_size_in_words = frame_size_in_bytes / wordSize;
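          // For reference: the layout above spans 36 BytesPerInt slots
          // (16 for the 8 general-purpose Java argument registers, 16 for the
          // 8 FP argument registers, 2 for rfp and 2 for the return address),
          // so the frame is 36 * 4 == 144 bytes == 18 words and is already
          // 16-byte aligned.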
 9901 
 9902     OopMapSet* oop_maps = new OopMapSet();
 9903     OopMap* map = new OopMap(frame_size_in_slots, 0);
 9904 
 9905     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
 9906     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
 9907     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
 9908     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
 9909     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
 9910     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
 9911     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
 9912     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
 9913 
 9914     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
 9915     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
 9916     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
 9917     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
 9918     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
 9919     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
 9920     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
 9921     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
 9922 
 9923     address start = __ pc();
 9924 
 9925     __ enter(); // Save FP and LR before call
 9926 
 9927     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
 9928     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
 9929     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
 9930     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
 9931 
 9932     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
 9933     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
 9934     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
 9935     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
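          // The pushes above mirror the layout enum: j_rarg7 lands in the lowest
          // pair of slots (j_rarg7_off), j_farg0 in the highest argument slots
          // (j_farg0_off), and the frame record saved by enter() supplies
          // rfp_off/return_off, so the OopMap offsets registered above match the
          // actual stack contents.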
 9936 
 9937     int frame_complete = __ offset();
 9938 
 9939     // Set up last_Java_sp and last_Java_fp
 9940     address the_pc = __ pc();
 9941     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
 9942 
 9943     // Call runtime
 9944     __ mov(c_rarg1, r0);
 9945     __ mov(c_rarg0, rthread);
 9946 
 9947     __ mov(rscratch1, destination);
 9948     __ blr(rscratch1);
 9949 
 9950     oop_maps->add_gc_map(the_pc - start, map);
 9951 
 9952     __ reset_last_Java_frame(false);
 9953 
 9954     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
 9955     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
 9956     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
 9957     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
 9958 
 9959     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
 9960     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
 9961     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
 9962     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
 9963 
 9964     __ leave();
 9965 
 9966     // check for pending exceptions
 9967     Label pending;
 9968     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 9969     __ cbnz(rscratch1, pending);
 9970 
 9971     if (has_res) {
 9972       __ get_vm_result(r0, rthread);
 9973     }
 9974 
 9975     __ ret(lr);
 9976 
 9977     __ bind(pending);
 9978     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 9979 
 9980     // -------------
 9981     // make sure all code is generated
 9982     masm->flush();
 9983 
 9984     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
 9985     return stub->entry_point();
 9986   }
 9987 
 9988   // Initialization
 9989   void generate_initial_stubs() {
 9990     // Generates the initial stubs and initializes the entry points
 9991 
 9992     // entry points that exist on all platforms. Note: This is code
 9993     // that could be shared among different platforms - however the
 9994     // benefit seems to be smaller than the disadvantage of having a
 9995     // much more complicated generator structure. See also comment in
 9996     // stubRoutines.hpp.
 9997 
 9998     StubRoutines::_forward_exception_entry = generate_forward_exception();
 9999 
10000     StubRoutines::_call_stub_entry =
10001       generate_call_stub(StubRoutines::_call_stub_return_address);
10002 
10003     // is referenced by megamorphic call
10004     StubRoutines::_catch_exception_entry = generate_catch_exception();
10005 
10006     // Initialize table for copy memory (arraycopy) check.
10007     if (UnsafeMemoryAccess::_table == nullptr) {
10008       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
10009     }
10010 
10011     if (UseCRC32Intrinsics) {
10012       // set the table address before generating the stub that uses it
10013       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
10014       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
10015     }
10016 
10017     if (UseCRC32CIntrinsics) {
10018       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
10019     }
10020 
10021     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
10022       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
10023     }
10024 
10025     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
10026       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
10027     }
10028 
10029     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
10030         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
10031       StubRoutines::_hf2f = generate_float16ToFloat();
10032       StubRoutines::_f2hf = generate_floatToFloat16();
10033     }
10034 
10035     if (InlineTypeReturnedAsFields) {
10036       StubRoutines::_load_inline_type_fields_in_regs =
10037          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
10038       StubRoutines::_store_inline_type_fields_to_buf =
10039          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
10040     }
10041 
10042   }
10043 
10044   void generate_continuation_stubs() {
10045     // Continuation stubs:
10046     StubRoutines::_cont_thaw          = generate_cont_thaw();
10047     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
10048     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
10049     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
10050   }
10051 
10052   void generate_final_stubs() {
10053     // support for verify_oop (must happen after universe_init)
10054     if (VerifyOops) {
10055       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
10056     }
10057 
10058     // arraycopy stubs used by compilers
10059     generate_arraycopy_stubs();
10060 
10061     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
10062     if (bs_nm != nullptr) {
10063       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
10064     }
10065 
10066     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
10067 
10068     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
10069     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
10070 
10071 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10072 
10073     generate_atomic_entry_points();
10074 
10075 #endif // LINUX
10076 
10077 #ifdef COMPILER2
10078     if (UseSecondarySupersTable) {
10079       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
10080       if (! InlineSecondarySupersTest) {
10081         generate_lookup_secondary_supers_table_stub();
10082       }
10083     }
10084 #endif
10085 
10086     StubRoutines::aarch64::set_completed(); // Indicate that the arraycopy and zero_blocks stubs have been generated
10087   }
10088 
10089   void generate_compiler_stubs() {
10090 #if COMPILER2_OR_JVMCI
10091 
10092     if (UseSVE == 0) {
10093       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
10094     }
10095 
10096     // array equals stub for large arrays.
10097     if (!UseSimpleArrayEquals) {
10098       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
10099     }
10100 
10101     // arrays_hashcode stubs for large arrays.
10102     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
10103     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
10104     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
10105     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
10106     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
10107 
10108     // byte_array_inflate stub for large arrays.
10109     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
10110 
10111     // countPositives stub for large arrays.
10112     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
10113 
10114     generate_compare_long_strings();
10115 
10116     generate_string_indexof_stubs();
10117 
10118 #ifdef COMPILER2
10119     if (UseMultiplyToLenIntrinsic) {
10120       StubRoutines::_multiplyToLen = generate_multiplyToLen();
10121     }
10122 
10123     if (UseSquareToLenIntrinsic) {
10124       StubRoutines::_squareToLen = generate_squareToLen();
10125     }
10126 
10127     if (UseMulAddIntrinsic) {
10128       StubRoutines::_mulAdd = generate_mulAdd();
10129     }
10130 
10131     if (UseSIMDForBigIntegerShiftIntrinsics) {
10132       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
10133       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
10134     }
10135 
10136     if (UseMontgomeryMultiplyIntrinsic) {
10137       StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
10138       StubCodeMark mark(this, stub_id);
10139       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
10140       StubRoutines::_montgomeryMultiply = g.generate_multiply();
10141     }
10142 
10143     if (UseMontgomerySquareIntrinsic) {
10144       StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
10145       StubCodeMark mark(this, stub_id);
10146       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
10147       // We use generate_multiply() rather than generate_square()
10148       // because it's faster for the sizes of modulus we care about.
10149       StubRoutines::_montgomerySquare = g.generate_multiply();
10150     }
10151 
10152     generate_vector_math_stubs();
10153 
10154 #endif // COMPILER2
10155 
10156     if (UseChaCha20Intrinsics) {
10157       StubRoutines::_chacha20Block = generate_chacha20Block_qrpar();
10158     }
10159 
10160     if (UseDilithiumIntrinsics) {
10161       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
10162       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
10163       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
10164       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
10165       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
10166     }
10167 
10168     if (UseBASE64Intrinsics) {
10169         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
10170         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
10171     }
10172 
10173     // data cache line writeback
10174     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
10175     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
10176 
10177     if (UseAESIntrinsics) {
10178       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
10179       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
10180       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
10181       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
10182       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
10183     }
10184     if (UseGHASHIntrinsics) {
10185       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
10186       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
10187     }
10188     if (UseAESIntrinsics && UseGHASHIntrinsics) {
10189       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
10190     }
10191 
10192     if (UseMD5Intrinsics) {
10193       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
10194       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
10195     }
10196     if (UseSHA1Intrinsics) {
10197       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
10198       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
10199     }
10200     if (UseSHA256Intrinsics) {
10201       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
10202       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
10203     }
10204     if (UseSHA512Intrinsics) {
10205       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
10206       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
10207     }
10208     if (UseSHA3Intrinsics) {
10209       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
10210       StubRoutines::_double_keccak         = generate_double_keccak();
10211       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
10212     }
10213 
10214     if (UsePoly1305Intrinsics) {
10215       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
10216     }
10217 
10218     // generate Adler32 intrinsics code
10219     if (UseAdler32Intrinsics) {
10220       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
10221     }
10222 
10223 #endif // COMPILER2_OR_JVMCI
10224   }
10225 
10226  public:
10227   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
10228     switch(blob_id) {
10229     case initial_id:
10230       generate_initial_stubs();
10231       break;
10232      case continuation_id:
10233       generate_continuation_stubs();
10234       break;
10235     case compiler_id:
10236       generate_compiler_stubs();
10237       break;
10238     case final_id:
10239       generate_final_stubs();
10240       break;
10241     default:
10242       fatal("unexpected blob id: %d", blob_id);
10243       break;
10244     };
10245   }
10246 }; // end class declaration
10247 
10248 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
10249   StubGenerator g(code, blob_id);
10250 }
10251 
10252 
10253 #if defined (LINUX)
10254 
10255 // Define pointers to atomic stubs and initialize them to point to the
10256 // code in atomic_aarch64.S.
10257 
10258 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
10259   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
10260     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
10261   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
10262     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
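
      // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to:
      //
      //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
      //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
      //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
      //     = aarch64_atomic_fetch_add_4_default_impl;
      //
      // generate_atomic_entry_points() (called above under the same LINUX
      // guard) can later repoint the *_impl variables at generated stubs.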
10263 
10264 DEFAULT_ATOMIC_OP(fetch_add, 4, )
10265 DEFAULT_ATOMIC_OP(fetch_add, 8, )
10266 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
10267 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
10268 DEFAULT_ATOMIC_OP(xchg, 4, )
10269 DEFAULT_ATOMIC_OP(xchg, 8, )
10270 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
10271 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
10272 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
10273 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
10274 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
10275 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
10276 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
10277 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
10278 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
10279 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
10280 
10281 #undef DEFAULT_ATOMIC_OP
10282 
10283 #endif // LINUX