1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomic.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
   87 #ifdef PRODUCT
   88 #define inc_counter_np(counter) ((void)0)
   89 #else
   90   void inc_counter_np_(uint& counter) {
   91     __ incrementw(ExternalAddress((address)&counter));
   92   }
   93 #define inc_counter_np(counter) \
   94   BLOCK_COMMENT("inc_counter " #counter); \
   95   inc_counter_np_(counter);
   96 #endif
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
  112   //
  113   // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, installing sp (r31)
  // into fp.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  // we don't need to save r8, which C uses as the indirect result
  // location register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
  130   // we don't need to save r16-18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
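  // (AAPCS64 only requires the low 64 bits of v8-v15 to be preserved
  // across calls, so saving d8-d15 is sufficient from our C caller's
  // point of view.)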
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
  144   // -28 [ saved Floating-point Control Register ]
  145   // -26 [ saved v15            ] <--- sp_after_call
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
  173 
  174   // Call stub stack layout word offsets from fp
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
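  // Worked example of how these offsets are used below: with wordSize == 8,
  // r20_save is Address(rfp, r20_off * wordSize) == Address(rfp, -80), i.e.
  // 80 bytes below the saved fp, which is slot -10 in the diagram above.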
  199 
  200   address generate_call_stub(address& return_address) {
  201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  203            "adjust this code");
  204 
  205     StubGenStubId stub_id = StubGenStubId::call_stub_id;
  206     StubCodeMark mark(this, stub_id);
  207     address start = __ pc();
  208 
  209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  210 
  211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  213     const Address result        (rfp, result_off         * wordSize);
  214     const Address result_type   (rfp, result_type_off    * wordSize);
  215     const Address method        (rfp, method_off         * wordSize);
  216     const Address entry_point   (rfp, entry_point_off    * wordSize);
  217     const Address parameter_size(rfp, parameter_size_off * wordSize);
  218 
  219     const Address thread        (rfp, thread_off         * wordSize);
  220 
  221     const Address d15_save      (rfp, d15_off * wordSize);
  222     const Address d13_save      (rfp, d13_off * wordSize);
  223     const Address d11_save      (rfp, d11_off * wordSize);
  224     const Address d9_save       (rfp, d9_off * wordSize);
  225 
  226     const Address r28_save      (rfp, r28_off * wordSize);
  227     const Address r26_save      (rfp, r26_off * wordSize);
  228     const Address r24_save      (rfp, r24_off * wordSize);
  229     const Address r22_save      (rfp, r22_off * wordSize);
  230     const Address r20_save      (rfp, r20_off * wordSize);
  231 
  232     // stub code
  233 
  234     address aarch64_entry = __ pc();
  235 
  236     // set up frame and move sp to end of save area
  237     __ enter();
  238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  239 
  240     // save register parameters and Java scratch/global registers
  241     // n.b. we save thread even though it gets installed in
  242     // rthread because we want to sanity check rthread later
  243     __ str(c_rarg7,  thread);
  244     __ strw(c_rarg6, parameter_size);
  245     __ stp(c_rarg4, c_rarg5,  entry_point);
  246     __ stp(c_rarg2, c_rarg3,  result_type);
  247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  248 
  249     __ stp(r20, r19,   r20_save);
  250     __ stp(r22, r21,   r22_save);
  251     __ stp(r24, r23,   r24_save);
  252     __ stp(r26, r25,   r26_save);
  253     __ stp(r28, r27,   r28_save);
  254 
  255     __ stpd(v9,  v8,   d9_save);
  256     __ stpd(v11, v10,  d11_save);
  257     __ stpd(v13, v12,  d13_save);
  258     __ stpd(v15, v14,  d15_save);
  259 
  260     __ get_fpcr(rscratch1);
  261     __ str(rscratch1, fpcr_save);
  262     // Set FPCR to the state we need. We do want Round to Nearest. We
  263     // don't want non-IEEE rounding modes or floating-point traps.
  264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
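    // (FPCR bit layout: RMode is bits 23:22, FZ is bit 24, DN is bit 25; the
    //  trap enables IOE/DZE/OFE/UFE/IXE occupy bits 12:8, hence the two bfi
    //  inserts of zero above.)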
  266     __ set_fpcr(rscratch1);
  267 
    // install Java thread in global register now that we have saved
    // whatever value it held
  270     __ mov(rthread, c_rarg7);
  271     // And method
  272     __ mov(rmethod, c_rarg3);
  273 
  274     // set up the heapbase register
  275     __ reinit_heapbase();
  276 
  277 #ifdef ASSERT
  278     // make sure we have no pending exceptions
  279     {
  280       Label L;
  281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  282       __ cmp(rscratch1, (u1)NULL_WORD);
  283       __ br(Assembler::EQ, L);
  284       __ stop("StubRoutines::call_stub: entered with pending exception");
  285       __ BIND(L);
  286     }
  287 #endif
  288     // pass parameters if any
  289     __ mov(esp, sp);
  290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  291     __ andr(sp, rscratch1, -2 * wordSize);
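    // Rounding down to a multiple of 2 * wordSize (16 bytes) keeps sp at the
    // 16-byte alignment required by AAPCS64 even for an odd parameter count.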
  292 
  293     BLOCK_COMMENT("pass parameters if any");
  294     Label parameters_done;
  295     // parameter count is still in c_rarg6
  296     // and parameter pointer identifying param 1 is in c_rarg5
  297     __ cbzw(c_rarg6, parameters_done);
  298 
  299     address loop = __ pc();
  300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  301     __ subsw(c_rarg6, c_rarg6, 1);
  302     __ push(rscratch1);
  303     __ br(Assembler::GT, loop);
  304 
  305     __ BIND(parameters_done);
  306 
    // call Java entry -- passing Method* and current sp
  308     //      rmethod: Method*
  309     //      r19_sender_sp: sender sp
  310     BLOCK_COMMENT("call Java function");
  311     __ mov(r19_sender_sp, sp);
  312     __ blr(c_rarg4);
  313 
  314     // we do this here because the notify will already have been done
  315     // if we get to the next instruction via an exception
  316     //
  317     // n.b. adding this instruction here affects the calculation of
  318     // whether or not a routine returns to the call stub (used when
  319     // doing stack walks) since the normal test is to check the return
  320     // pc against the address saved below. so we may need to allow for
  321     // this extra instruction in the check.
  322 
  323     // save current address for use by exception handling code
  324 
  325     return_address = __ pc();
  326 
  327     // store result depending on type (everything that is not
  328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  329     // n.b. this assumes Java returns an integral result in r0
  330     // and a floating result in j_farg0
  331     __ ldr(j_rarg2, result);
  332     Label is_long, is_float, is_double, exit;
  333     __ ldr(j_rarg1, result_type);
  334     __ cmp(j_rarg1, (u1)T_OBJECT);
  335     __ br(Assembler::EQ, is_long);
  336     __ cmp(j_rarg1, (u1)T_LONG);
  337     __ br(Assembler::EQ, is_long);
  338     __ cmp(j_rarg1, (u1)T_FLOAT);
  339     __ br(Assembler::EQ, is_float);
  340     __ cmp(j_rarg1, (u1)T_DOUBLE);
  341     __ br(Assembler::EQ, is_double);
  342 
  343     // handle T_INT case
  344     __ strw(r0, Address(j_rarg2));
  345 
  346     __ BIND(exit);
  347 
  348     // pop parameters
  349     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  350 
  351 #ifdef ASSERT
  352     // verify that threads correspond
  353     {
  354       Label L, S;
  355       __ ldr(rscratch1, thread);
  356       __ cmp(rthread, rscratch1);
  357       __ br(Assembler::NE, S);
  358       __ get_thread(rscratch1);
  359       __ cmp(rthread, rscratch1);
  360       __ br(Assembler::EQ, L);
  361       __ BIND(S);
  362       __ stop("StubRoutines::call_stub: threads must correspond");
  363       __ BIND(L);
  364     }
  365 #endif
  366 
  367     __ pop_cont_fastpath(rthread);
  368 
  369     // restore callee-save registers
  370     __ ldpd(v15, v14,  d15_save);
  371     __ ldpd(v13, v12,  d13_save);
  372     __ ldpd(v11, v10,  d11_save);
  373     __ ldpd(v9,  v8,   d9_save);
  374 
  375     __ ldp(r28, r27,   r28_save);
  376     __ ldp(r26, r25,   r26_save);
  377     __ ldp(r24, r23,   r24_save);
  378     __ ldp(r22, r21,   r22_save);
  379     __ ldp(r20, r19,   r20_save);
  380 
  381     // restore fpcr
  382     __ ldr(rscratch1,  fpcr_save);
  383     __ set_fpcr(rscratch1);
  384 
  385     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  386     __ ldrw(c_rarg2, result_type);
  387     __ ldr(c_rarg3,  method);
  388     __ ldp(c_rarg4, c_rarg5,  entry_point);
  389     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  390 
  391     // leave frame and return to caller
  392     __ leave();
  393     __ ret(lr);
  394 
  395     // handle return types different from T_INT
  396 
  397     __ BIND(is_long);
  398     __ str(r0, Address(j_rarg2, 0));
  399     __ br(Assembler::AL, exit);
  400 
  401     __ BIND(is_float);
  402     __ strs(j_farg0, Address(j_rarg2, 0));
  403     __ br(Assembler::AL, exit);
  404 
  405     __ BIND(is_double);
  406     __ strd(j_farg0, Address(j_rarg2, 0));
  407     __ br(Assembler::AL, exit);
  408 
  409     return start;
  410   }
  411 
  412   // Return point for a Java call if there's an exception thrown in
  413   // Java code.  The exception is caught and transformed into a
  414   // pending exception stored in JavaThread that can be tested from
  415   // within the VM.
  416   //
  417   // Note: Usually the parameters are removed by the callee. In case
  418   // of an exception crossing an activation frame boundary, that is
  419   // not the case if the callee is compiled code => need to setup the
  // sp.
  421   //
  422   // r0: exception oop
  423 
  424   address generate_catch_exception() {
  425     StubGenStubId stub_id = StubGenStubId::catch_exception_id;
  426     StubCodeMark mark(this, stub_id);
  427     address start = __ pc();
  428 
  429     // same as in generate_call_stub():
  430     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  431     const Address thread        (rfp, thread_off         * wordSize);
  432 
  433 #ifdef ASSERT
  434     // verify that threads correspond
  435     {
  436       Label L, S;
  437       __ ldr(rscratch1, thread);
  438       __ cmp(rthread, rscratch1);
  439       __ br(Assembler::NE, S);
  440       __ get_thread(rscratch1);
  441       __ cmp(rthread, rscratch1);
  442       __ br(Assembler::EQ, L);
  443       __ bind(S);
  444       __ stop("StubRoutines::catch_exception: threads must correspond");
  445       __ bind(L);
  446     }
  447 #endif
  448 
  449     // set pending exception
  450     __ verify_oop(r0);
  451 
  452     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  453     __ mov(rscratch1, (address)__FILE__);
  454     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  455     __ movw(rscratch1, (int)__LINE__);
  456     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  457 
  458     // complete return to VM
  459     assert(StubRoutines::_call_stub_return_address != nullptr,
  460            "_call_stub_return_address must have been generated before");
  461     __ b(StubRoutines::_call_stub_return_address);
  462 
  463     return start;
  464   }
  465 
  466   // Continuation point for runtime calls returning with a pending
  467   // exception.  The pending exception check happened in the runtime
  468   // or native call stub.  The pending exception in Thread is
  469   // converted into a Java-level exception.
  470   //
  471   // Contract with Java-level exception handlers:
  472   // r0: exception
  473   // r3: throwing pc
  474   //
  475   // NOTE: At entry of this stub, exception-pc must be in LR !!
  476 
  477   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
  479 
  480   address generate_forward_exception() {
  481     StubGenStubId stub_id = StubGenStubId::forward_exception_id;
  482     StubCodeMark mark(this, stub_id);
  483     address start = __ pc();
  484 
  485     // Upon entry, LR points to the return address returning into
  486     // Java (interpreted or compiled) code; i.e., the return address
  487     // becomes the throwing pc.
  488     //
  489     // Arguments pushed before the runtime call are still on the stack
  490     // but the exception handler will reset the stack pointer ->
  491     // ignore them.  A potential result in registers can be ignored as
  492     // well.
  493 
  494 #ifdef ASSERT
  495     // make sure this code is only executed if there is a pending exception
  496     {
  497       Label L;
  498       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  499       __ cbnz(rscratch1, L);
  500       __ stop("StubRoutines::forward exception: no pending exception (1)");
  501       __ bind(L);
  502     }
  503 #endif
  504 
  505     // compute exception handler into r19
  506 
  507     // call the VM to find the handler address associated with the
  508     // caller address. pass thread in r0 and caller pc (ret address)
  509     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  510     // the stack.
  511     __ mov(c_rarg1, lr);
  512     // lr will be trashed by the VM call so we move it to R19
  513     // (callee-saved) because we also need to pass it to the handler
  514     // returned by this call.
  515     __ mov(r19, lr);
  516     BLOCK_COMMENT("call exception_handler_for_return_address");
  517     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  518                          SharedRuntime::exception_handler_for_return_address),
  519                     rthread, c_rarg1);
  520     // Reinitialize the ptrue predicate register, in case the external runtime
  521     // call clobbers ptrue reg, as we may return to SVE compiled code.
  522     __ reinitialize_ptrue();
  523 
  524     // we should not really care that lr is no longer the callee
  525     // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. However, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So we restore lr here to satisfy that assert.
  530     __ mov(lr, r19);
  531     // setup r0 & r3 & clear pending exception
  532     __ mov(r3, r19);
  533     __ mov(r19, r0);
  534     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  535     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  536 
  537 #ifdef ASSERT
  538     // make sure exception is set
  539     {
  540       Label L;
  541       __ cbnz(r0, L);
  542       __ stop("StubRoutines::forward exception: no pending exception (2)");
  543       __ bind(L);
  544     }
  545 #endif
  546 
  547     // continue at exception handler
  548     // r0: exception
  549     // r3: throwing pc
  550     // r19: exception handler
  551     __ verify_oop(r0);
  552     __ br(r19);
  553 
  554     return start;
  555   }
  556 
  557   // Non-destructive plausibility checks for oops
  558   //
  559   // Arguments:
  560   //    r0: oop to verify
  561   //    rscratch1: error message
  562   //
  563   // Stack after saving c_rarg3:
  564   //    [tos + 0]: saved c_rarg3
  565   //    [tos + 1]: saved c_rarg2
  566   //    [tos + 2]: saved lr
  567   //    [tos + 3]: saved rscratch2
  568   //    [tos + 4]: saved r0
  569   //    [tos + 5]: saved rscratch1
  570   address generate_verify_oop() {
  571     StubGenStubId stub_id = StubGenStubId::verify_oop_id;
  572     StubCodeMark mark(this, stub_id);
  573     address start = __ pc();
  574 
  575     Label exit, error;
  576 
  577     // save c_rarg2 and c_rarg3
  578     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  579 
  580     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  581     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  582     __ ldr(c_rarg3, Address(c_rarg2));
  583     __ add(c_rarg3, c_rarg3, 1);
  584     __ str(c_rarg3, Address(c_rarg2));
  585 
  586     // object is in r0
  587     // make sure object is 'reasonable'
  588     __ cbz(r0, exit); // if obj is null it is OK
  589 
  590     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  591     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  592 
  593     // return if everything seems ok
  594     __ bind(exit);
  595 
  596     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  597     __ ret(lr);
  598 
  599     // handle errors
  600     __ bind(error);
  601     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  602 
  603     __ push(RegSet::range(r0, r29), sp);
  604     // debug(char* msg, int64_t pc, int64_t regs[])
  605     __ mov(c_rarg0, rscratch1);      // pass address of error message
  606     __ mov(c_rarg1, lr);             // pass return address
  607     __ mov(c_rarg2, sp);             // pass address of regs on stack
  608 #ifndef PRODUCT
  609     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  610 #endif
  611     BLOCK_COMMENT("call MacroAssembler::debug");
  612     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  613     __ blr(rscratch1);
  614     __ hlt(0);
  615 
  616     return start;
  617   }
  618 
  619   // Generate indices for iota vector.
  620   address generate_iota_indices(StubGenStubId stub_id) {
  621     __ align(CodeEntryAlignment);
  622     StubCodeMark mark(this, stub_id);
  623     address start = __ pc();
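    // Each pair of emit_data64 calls below lays down one 128-bit (16-byte)
    // little-endian constant whose lanes hold 0, 1, 2, ... for the given
    // element size, e.g. the byte table reads 00 01 02 ... 0F in memory.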
  624     // B
  625     __ emit_data64(0x0706050403020100, relocInfo::none);
  626     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  627     // H
  628     __ emit_data64(0x0003000200010000, relocInfo::none);
  629     __ emit_data64(0x0007000600050004, relocInfo::none);
  630     // S
  631     __ emit_data64(0x0000000100000000, relocInfo::none);
  632     __ emit_data64(0x0000000300000002, relocInfo::none);
  633     // D
  634     __ emit_data64(0x0000000000000000, relocInfo::none);
  635     __ emit_data64(0x0000000000000001, relocInfo::none);
  636     // S - FP
  637     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  638     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  639     // D - FP
  640     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  641     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  642     return start;
  643   }
  644 
  645   // The inner part of zero_words().  This is the bulk operation,
  646   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  647   // caller is responsible for zeroing the last few words.
  648   //
  649   // Inputs:
  650   // r10: the HeapWord-aligned base address of an array to zero.
  651   // r11: the count in HeapWords, r11 > 0.
  652   //
  653   // Returns r10 and r11, adjusted for the caller to clear.
  654   // r10: the base address of the tail of words left to clear.
  655   // r11: the number of words in the tail.
  656   //      r11 < MacroAssembler::zero_words_block_size.
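  // When UseBlockZeroing is enabled, the bulk of the work is done with DC ZVA,
  // which zeroes an entire block of VM_Version::zva_length() bytes (as
  // reported by DCZID_EL0) per instruction.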
  657 
  658   address generate_zero_blocks() {
  659     Label done;
  660     Label base_aligned;
  661 
  662     Register base = r10, cnt = r11;
  663 
  664     __ align(CodeEntryAlignment);
  665     StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
  666     StubCodeMark mark(this, stub_id);
  667     address start = __ pc();
  668 
  669     if (UseBlockZeroing) {
  670       int zva_length = VM_Version::zva_length();
  671 
  672       // Ensure ZVA length can be divided by 16. This is required by
  673       // the subsequent operations.
  674       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  675 
  676       __ tbz(base, 3, base_aligned);
  677       __ str(zr, Address(__ post(base, 8)));
  678       __ sub(cnt, cnt, 1);
  679       __ bind(base_aligned);
  680 
  681       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  682       // alignment.
  683       Label small;
  684       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
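      // low_limit is in bytes while cnt is in HeapWords, so compare against
      // low_limit >> LogBytesPerWord (i.e. >> 3) below.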
  685       __ subs(rscratch1, cnt, low_limit >> 3);
  686       __ br(Assembler::LT, small);
  687       __ zero_dcache_blocks(base, cnt);
  688       __ bind(small);
  689     }
  690 
  691     {
  692       // Number of stp instructions we'll unroll
  693       const int unroll =
  694         MacroAssembler::zero_words_block_size / 2;
  695       // Clear the remaining blocks.
  696       Label loop;
  697       __ subs(cnt, cnt, unroll * 2);
  698       __ br(Assembler::LT, done);
  699       __ bind(loop);
  700       for (int i = 0; i < unroll; i++)
  701         __ stp(zr, zr, __ post(base, 16));
  702       __ subs(cnt, cnt, unroll * 2);
  703       __ br(Assembler::GE, loop);
  704       __ bind(done);
  705       __ add(cnt, cnt, unroll * 2);
  706     }
  707 
  708     __ ret(lr);
  709 
  710     return start;
  711   }
  712 
  713 
  714   typedef enum {
  715     copy_forwards = 1,
  716     copy_backwards = -1
  717   } copy_direction;
  718 
  719   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  720   // for arraycopy stubs.
  721   class ArrayCopyBarrierSetHelper : StackObj {
  722     BarrierSetAssembler* _bs_asm;
  723     MacroAssembler* _masm;
  724     DecoratorSet _decorators;
  725     BasicType _type;
  726     Register _gct1;
  727     Register _gct2;
  728     Register _gct3;
  729     FloatRegister _gcvt1;
  730     FloatRegister _gcvt2;
  731     FloatRegister _gcvt3;
  732 
  733   public:
  734     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  735                               DecoratorSet decorators,
  736                               BasicType type,
  737                               Register gct1,
  738                               Register gct2,
  739                               Register gct3,
  740                               FloatRegister gcvt1,
  741                               FloatRegister gcvt2,
  742                               FloatRegister gcvt3)
  743       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  744         _masm(masm),
  745         _decorators(decorators),
  746         _type(type),
  747         _gct1(gct1),
  748         _gct2(gct2),
  749         _gct3(gct3),
  750         _gcvt1(gcvt1),
  751         _gcvt2(gcvt2),
  752         _gcvt3(gcvt3) {
  753     }
  754 
  755     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  756       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  757                             dst1, dst2, src,
  758                             _gct1, _gct2, _gcvt1);
  759     }
  760 
  761     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  762       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  763                              dst, src1, src2,
  764                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  765     }
  766 
  767     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  768       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  769                             dst1, dst2, src,
  770                             _gct1);
  771     }
  772 
  773     void copy_store_at_16(Address dst, Register src1, Register src2) {
  774       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  775                              dst, src1, src2,
  776                              _gct1, _gct2, _gct3);
  777     }
  778 
  779     void copy_load_at_8(Register dst, Address src) {
  780       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  781                             dst, noreg, src,
  782                             _gct1);
  783     }
  784 
  785     void copy_store_at_8(Address dst, Register src) {
  786       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  787                              dst, src, noreg,
  788                              _gct1, _gct2, _gct3);
  789     }
  790   };
  791 
  792   // Bulk copy of blocks of 8 words.
  793   //
  794   // count is a count of words.
  795   //
  796   // Precondition: count >= 8
  797   //
  798   // Postconditions:
  799   //
  800   // The least significant bit of count contains the remaining count
  801   // of words to copy.  The rest of count is trash.
  802   //
  803   // s and d are adjusted to point to the remaining words to copy
  804   //
  805   void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
  806     BasicType type;
  807     copy_direction direction;
  808 
  809     switch (stub_id) {
  810     case copy_byte_f_id:
  811       direction = copy_forwards;
  812       type = T_BYTE;
  813       break;
  814     case copy_byte_b_id:
  815       direction = copy_backwards;
  816       type = T_BYTE;
  817       break;
  818     case copy_oop_f_id:
  819       direction = copy_forwards;
  820       type = T_OBJECT;
  821       break;
  822     case copy_oop_b_id:
  823       direction = copy_backwards;
  824       type = T_OBJECT;
  825       break;
  826     case copy_oop_uninit_f_id:
  827       direction = copy_forwards;
  828       type = T_OBJECT;
  829       break;
  830     case copy_oop_uninit_b_id:
  831       direction = copy_backwards;
  832       type = T_OBJECT;
  833       break;
  834     default:
  835       ShouldNotReachHere();
  836     }
  837 
  838     int unit = wordSize * direction;
  839     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
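    // For a forwards copy, s and d are pre-biased downwards by this amount so
    // that the first access of the {2,4,6,8}*unit (or {4,8}*unit for SIMD)
    // addressing pattern used below lands exactly on the original s and d.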
  840 
  841     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  842       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  843     const Register stride = r14;
  844     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  845     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  846     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  847 
  848     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  849     assert_different_registers(s, d, count, rscratch1, rscratch2);
  850 
  851     Label again, drain;
  852 
  853     __ align(CodeEntryAlignment);
  854 
  855     StubCodeMark mark(this, stub_id);
  856 
  857     __ bind(start);
  858 
  859     Label unaligned_copy_long;
  860     if (AvoidUnalignedAccesses) {
  861       __ tbnz(d, 3, unaligned_copy_long);
  862     }
  863 
  864     if (direction == copy_forwards) {
  865       __ sub(s, s, bias);
  866       __ sub(d, d, bias);
  867     }
  868 
  869 #ifdef ASSERT
  870     // Make sure we are never given < 8 words
  871     {
  872       Label L;
  873       __ cmp(count, (u1)8);
  874       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
  876       __ bind(L);
  877     }
  878 #endif
  879 
  880     // Fill 8 registers
  881     if (UseSIMDForMemoryOps) {
  882       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  883       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  884     } else {
  885       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  886       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  887       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  888       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  889     }
  890 
  891     __ subs(count, count, 16);
  892     __ br(Assembler::LO, drain);
  893 
  894     int prefetch = PrefetchCopyIntervalInBytes;
  895     bool use_stride = false;
  896     if (direction == copy_backwards) {
  897        use_stride = prefetch > 256;
  898        prefetch = -prefetch;
  899        if (use_stride) __ mov(stride, prefetch);
  900     }
  901 
  902     __ bind(again);
  903 
  904     if (PrefetchCopyIntervalInBytes > 0)
  905       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  906 
  907     if (UseSIMDForMemoryOps) {
  908       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  909       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  910       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  911       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  912     } else {
  913       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  914       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  915       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  916       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  917       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  918       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  919       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  920       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  921     }
  922 
  923     __ subs(count, count, 8);
  924     __ br(Assembler::HS, again);
  925 
  926     // Drain
  927     __ bind(drain);
  928     if (UseSIMDForMemoryOps) {
  929       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  930       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  931     } else {
  932       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  933       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  934       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936     }
  937 
  938     {
  939       Label L1, L2;
  940       __ tbz(count, exact_log2(4), L1);
  941       if (UseSIMDForMemoryOps) {
  942         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  943         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  944       } else {
  945         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  946         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  947         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  948         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  949       }
  950       __ bind(L1);
  951 
  952       if (direction == copy_forwards) {
  953         __ add(s, s, bias);
  954         __ add(d, d, bias);
  955       }
  956 
  957       __ tbz(count, 1, L2);
  958       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  959       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  960       __ bind(L2);
  961     }
  962 
  963     __ ret(lr);
  964 
  965     if (AvoidUnalignedAccesses) {
  966       Label drain, again;
  967       // Register order for storing. Order is different for backward copy.
  968 
  969       __ bind(unaligned_copy_long);
  970 
  971       // source address is even aligned, target odd aligned
  972       //
  973       // when forward copying word pairs we read long pairs at offsets
  974       // {0, 2, 4, 6} (in long words). when backwards copying we read
  975       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  976       // address by -2 in the forwards case so we can compute the
  977       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  978       // or -1.
  979       //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1, which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
  988 
  989       if (direction == copy_forwards) {
  990         __ sub(s, s, 16);
  991         __ sub(d, d, 8);
  992       }
  993 
  994       // Fill 8 registers
  995       //
  996       // for forwards copy s was offset by -16 from the original input
  997       // value of s so the register contents are at these offsets
  998       // relative to the 64 bit block addressed by that original input
  999       // and so on for each successive 64 byte block when s is updated
 1000       //
 1001       // t0 at offset 0,  t1 at offset 8
 1002       // t2 at offset 16, t3 at offset 24
 1003       // t4 at offset 32, t5 at offset 40
 1004       // t6 at offset 48, t7 at offset 56
 1005 
 1006       // for backwards copy s was not offset so the register contents
 1007       // are at these offsets into the preceding 64 byte block
 1008       // relative to that original input and so on for each successive
 1009       // preceding 64 byte block when s is updated. this explains the
 1010       // slightly counter-intuitive looking pattern of register usage
 1011       // in the stp instructions for backwards copy.
 1012       //
 1013       // t0 at offset -16, t1 at offset -8
 1014       // t2 at offset -32, t3 at offset -24
 1015       // t4 at offset -48, t5 at offset -40
 1016       // t6 at offset -64, t7 at offset -56
 1017 
 1018       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1019       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1020       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1021       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1022 
 1023       __ subs(count, count, 16);
 1024       __ br(Assembler::LO, drain);
 1025 
 1026       int prefetch = PrefetchCopyIntervalInBytes;
 1027       bool use_stride = false;
 1028       if (direction == copy_backwards) {
 1029          use_stride = prefetch > 256;
 1030          prefetch = -prefetch;
 1031          if (use_stride) __ mov(stride, prefetch);
 1032       }
 1033 
 1034       __ bind(again);
 1035 
 1036       if (PrefetchCopyIntervalInBytes > 0)
 1037         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1038 
 1039       if (direction == copy_forwards) {
 1040        // allowing for the offset of -8 the store instructions place
 1041        // registers into the target 64 bit block at the following
 1042        // offsets
 1043        //
 1044        // t0 at offset 0
 1045        // t1 at offset 8,  t2 at offset 16
 1046        // t3 at offset 24, t4 at offset 32
 1047        // t5 at offset 40, t6 at offset 48
 1048        // t7 at offset 56
 1049 
 1050         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1051         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1052         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1053         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1054         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1055         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1056         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1057         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1058         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1059       } else {
 1060        // d was not offset when we started so the registers are
 1061        // written into the 64 bit block preceding d with the following
 1062        // offsets
 1063        //
 1064        // t1 at offset -8
 1065        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 1067        // t7 at offset -56, t4 at offset -48
 1068        //                   t6 at offset -64
 1069        //
 1070        // note that this matches the offsets previously noted for the
 1071        // loads
 1072 
 1073         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1074         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1075         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1076         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1077         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1078         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1079         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1080         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1081         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1082       }
 1083 
 1084       __ subs(count, count, 8);
 1085       __ br(Assembler::HS, again);
 1086 
 1087       // Drain
 1088       //
 1089       // this uses the same pattern of offsets and register arguments
 1090       // as above
 1091       __ bind(drain);
 1092       if (direction == copy_forwards) {
 1093         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1094         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1095         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1096         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1097         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1098       } else {
 1099         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1100         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1101         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1102         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1103         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1104       }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
 1109       {
 1110         Label L1, L2;
 1111         __ tbz(count, exact_log2(4), L1);
 1112        // this is the same as above but copying only 4 longs hence
 1113        // with only one intervening stp between the str instructions
 1114        // but note that the offsets and registers still follow the
 1115        // same pattern
 1116         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1117         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1118         if (direction == copy_forwards) {
 1119           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1120           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1121           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1122         } else {
 1123           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1124           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1125           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1126         }
 1127         __ bind(L1);
 1128 
 1129         __ tbz(count, 1, L2);
 1130        // this is the same as above but copying only 2 longs hence
 1131        // there is no intervening stp between the str instructions
 1132        // but note that the offset and register patterns are still
 1133        // the same
 1134         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1135         if (direction == copy_forwards) {
 1136           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1137           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1141         }
 1142         __ bind(L2);
 1143 
 1144        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
 1146 
 1147        if (direction == copy_forwards) {
 1148          __ add(s, s, 16);
 1149          __ add(d, d, 8);
 1150        }
 1151 
 1152       }
 1153 
 1154       __ ret(lr);
 1155       }
 1156   }
 1157 
 1158   // Small copy: less than 16 bytes.
 1159   //
 1160   // NB: Ignores all of the bits of count which represent more than 15
 1161   // bytes, so a caller doesn't have to mask them.
 1162 
 1163   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1164     bool is_backwards = step < 0;
 1165     size_t granularity = uabs(step);
 1166     int direction = is_backwards ? -1 : 1;
 1167 
 1168     Label Lword, Lint, Lshort, Lbyte;
 1169 
 1170     assert(granularity
 1171            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1172 
 1173     const Register t0 = r3;
 1174     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1175     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1176 
 1177     // ??? I don't know if this bit-test-and-branch is the right thing
 1178     // to do.  It does a lot of jumping, resulting in several
 1179     // mispredicted branches.  It might make more sense to do this
 1180     // with something like Duff's device with a single computed branch.
 1181 
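    // The tbz tests below peel off the 8-, 4-, 2- and 1-byte remainders by
    // testing the corresponding bit of count, scaled by the element size;
    // e.g. for a byte copy (granularity 1), bit 3 of count says whether a
    // whole word is still left to move.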
 1182     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1183     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1184     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1185     __ bind(Lword);
 1186 
 1187     if (granularity <= sizeof (jint)) {
 1188       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1189       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1190       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1191       __ bind(Lint);
 1192     }
 1193 
 1194     if (granularity <= sizeof (jshort)) {
 1195       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1196       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1197       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1198       __ bind(Lshort);
 1199     }
 1200 
 1201     if (granularity <= sizeof (jbyte)) {
 1202       __ tbz(count, 0, Lbyte);
 1203       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1204       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1205       __ bind(Lbyte);
 1206     }
 1207   }
 1208 
 1209   Label copy_f, copy_b;
 1210   Label copy_obj_f, copy_obj_b;
 1211   Label copy_obj_uninit_f, copy_obj_uninit_b;
 1212 
 1213   // All-singing all-dancing memory copy.
 1214   //
 1215   // Copy count units of memory from s to d.  The size of a unit is
 1216   // step, which can be positive or negative depending on the direction
 1217   // of copy.  If is_aligned is false, we align the source address.
 1218   //
 1219 
 1220   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1221                    Register s, Register d, Register count, int step) {
 1222     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1223     bool is_backwards = step < 0;
 1224     unsigned int granularity = uabs(step);
 1225     const Register t0 = r3, t1 = r4;
 1226 
    // <= 80 (or 96 for SIMD) bytes are copied inline. Direction doesn't matter because we
    // always load all the data before writing anything
 1229     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1230     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1231     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1232     const Register send = r17, dend = r16;
 1233     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1234     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1235     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1236 
 1237     if (PrefetchCopyIntervalInBytes > 0)
 1238       __ prfm(Address(s, 0), PLDL1KEEP);
 1239     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1240     __ br(Assembler::HI, copy_big);
 1241 
 1242     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1243     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
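    // send and dend point one element past the last source/destination
    // element; the tail accesses below use negative offsets from them so
    // that overlapping loads/stores cover the whole range without needing
    // exact-size branches.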
 1244 
 1245     __ cmp(count, u1(16/granularity));
 1246     __ br(Assembler::LS, copy16);
 1247 
 1248     __ cmp(count, u1(64/granularity));
 1249     __ br(Assembler::HI, copy80);
 1250 
 1251     __ cmp(count, u1(32/granularity));
 1252     __ br(Assembler::LS, copy32);
 1253 
 1254     // 33..64 bytes
 1255     if (UseSIMDForMemoryOps) {
 1256       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1257       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1258       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1259       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1260     } else {
 1261       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1262       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1263       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1264       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1265 
 1266       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1267       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1268       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1269       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1270     }
 1271     __ b(finish);
 1272 
 1273     // 17..32 bytes
 1274     __ bind(copy32);
 1275     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1276     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1277 
 1278     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1279     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1280     __ b(finish);
 1281 
 1282     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
 1284     __ bind(copy80);
 1285     if (UseSIMDForMemoryOps) {
 1286       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1287       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1288       // Unaligned pointers can be an issue for copying.
 1289       // The issue has more chances to happen when granularity of data is
 1290       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
 1291       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1292       // The most performance drop has been seen for the range 65-80 bytes.
 1293       // For such cases using the pair of ldp/stp instead of the third pair of
 1294       // ldpq/stpq fixes the performance issue.
 1295       if (granularity < sizeof (jint)) {
 1296         Label copy96;
 1297         __ cmp(count, u1(80/granularity));
 1298         __ br(Assembler::HI, copy96);
 1299         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1300 
 1301         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1302         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1303 
 1304         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1305         __ b(finish);
 1306 
 1307         __ bind(copy96);
 1308       }
 1309       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1310 
 1311       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1312       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1313 
 1314       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1315     } else {
 1316       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1317       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1318       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1319       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1320       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1321 
 1322       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1323       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1324       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1325       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1326       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1327     }
 1328     __ b(finish);
 1329 
 1330     // 0..16 bytes
 1331     __ bind(copy16);
 1332     __ cmp(count, u1(8/granularity));
 1333     __ br(Assembler::LO, copy8);
 1334 
 1335     // 8..16 bytes
 1336     bs.copy_load_at_8(t0, Address(s, 0));
 1337     bs.copy_load_at_8(t1, Address(send, -8));
 1338     bs.copy_store_at_8(Address(d, 0), t0);
 1339     bs.copy_store_at_8(Address(dend, -8), t1);
 1340     __ b(finish);
 1341 
 1342     if (granularity < 8) {
 1343       // 4..7 bytes
 1344       __ bind(copy8);
 1345       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1346       __ ldrw(t0, Address(s, 0));
 1347       __ ldrw(t1, Address(send, -4));
 1348       __ strw(t0, Address(d, 0));
 1349       __ strw(t1, Address(dend, -4));
 1350       __ b(finish);
 1351       if (granularity < 4) {
 1352         // 0..3 bytes
 1353         __ bind(copy4);
 1354         __ cbz(count, finish); // get rid of 0 case
 1355         if (granularity == 2) {
 1356           __ ldrh(t0, Address(s, 0));
 1357           __ strh(t0, Address(d, 0));
 1358         } else { // granularity == 1
 1359           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1360           // the first and last byte.
 1361           // Handle the 3 byte case by loading and storing base + count/2
 1362           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the same
 1364           // byte 3 times.
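          // Illustration: count == 1 copies s[0] three times; count == 2
          // copies s[0], then s[1] twice (once via dend-1, once via count/2);
          // count == 3 copies s[0], s[2] (via dend-1) and s[1] (via count/2).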
 1365           __ lsr(count, count, 1);
 1366           __ ldrb(t0, Address(s, 0));
 1367           __ ldrb(t1, Address(send, -1));
 1368           __ ldrb(t2, Address(s, count));
 1369           __ strb(t0, Address(d, 0));
 1370           __ strb(t1, Address(dend, -1));
 1371           __ strb(t2, Address(d, count));
 1372         }
 1373         __ b(finish);
 1374       }
 1375     }
 1376 
 1377     __ bind(copy_big);
 1378     if (is_backwards) {
 1379       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1380       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1381     }
 1382 
 1383     // Now we've got the small case out of the way we can align the
 1384     // source address on a 2-word boundary.
 1385 
    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for the 2-word-aligned
    // bulk copy. Up until here we have used t9, which aliases r15, but from here
    // on that register cannot be used as a temp register, as it contains the count.
 1390 
 1391     Label aligned;
 1392 
 1393     if (is_aligned) {
 1394       // We may have to adjust by 1 word to get s 2-word-aligned.
 1395       __ tbz(s, exact_log2(wordSize), aligned);
 1396       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1397       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1398       __ sub(count, count, wordSize/granularity);
 1399     } else {
 1400       if (is_backwards) {
 1401         __ andr(r15, s, 2 * wordSize - 1);
 1402       } else {
 1403         __ neg(r15, s);
 1404         __ andr(r15, r15, 2 * wordSize - 1);
 1405       }
 1406       // r15 is the byte adjustment needed to align s.
 1407       __ cbz(r15, aligned);
 1408       int shift = exact_log2(granularity);
 1409       if (shift > 0) {
 1410         __ lsr(r15, r15, shift);
 1411       }
 1412       __ sub(count, count, r15);
 1413 
 1414 #if 0
 1415       // ?? This code is only correct for a disjoint copy.  It may or
 1416       // may not make sense to use it in that case.
 1417 
 1418       // Copy the first pair; s and d may not be aligned.
 1419       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1420       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1421 
 1422       // Align s and d, adjust count
 1423       if (is_backwards) {
 1424         __ sub(s, s, r15);
 1425         __ sub(d, d, r15);
 1426       } else {
 1427         __ add(s, s, r15);
 1428         __ add(d, d, r15);
 1429       }
 1430 #else
 1431       copy_memory_small(decorators, type, s, d, r15, step);
 1432 #endif
 1433     }
 1434 
 1435     __ bind(aligned);
 1436 
 1437     // s is now 2-word-aligned.
 1438 
 1439     // We have a count of units and some trailing bytes. Adjust the
 1440     // count and do a bulk copy of words. If the shift is zero
 1441     // perform a move instead to benefit from zero latency moves.
 1442     int shift = exact_log2(wordSize/granularity);
 1443     if (shift > 0) {
 1444       __ lsr(r15, count, shift);
 1445     } else {
 1446       __ mov(r15, count);
 1447     }
 1448     if (direction == copy_forwards) {
 1449       if (type != T_OBJECT) {
 1450         __ bl(copy_f);
 1451       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1452         __ bl(copy_obj_uninit_f);
 1453       } else {
 1454         __ bl(copy_obj_f);
 1455       }
 1456     } else {
 1457       if (type != T_OBJECT) {
 1458         __ bl(copy_b);
 1459       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1460         __ bl(copy_obj_uninit_b);
 1461       } else {
 1462         __ bl(copy_obj_b);
 1463       }
 1464     }
 1465 
 1466     // And the tail.
 1467     copy_memory_small(decorators, type, s, d, count, step);
 1468 
 1469     if (granularity >= 8) __ bind(copy8);
 1470     if (granularity >= 4) __ bind(copy4);
 1471     __ bind(finish);
 1472   }
 1473 
 1474 
 1475   void clobber_registers() {
 1476 #ifdef ASSERT
 1477     RegSet clobbered
 1478       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1479     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1480     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1481     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1482       __ mov(*it, rscratch1);
 1483     }
 1484 #endif
 1485 
 1486   }
 1487 
 1488   // Scan over array at a for count oops, verifying each one.
  // Preserves a and count; clobbers temp, rscratch1 and rscratch2.
 1490   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1491     Label loop, end;
 1492     __ mov(rscratch1, a);
 1493     __ mov(rscratch2, zr);
 1494     __ bind(loop);
 1495     __ cmp(rscratch2, count);
 1496     __ br(Assembler::HS, end);
 1497     if (size == wordSize) {
 1498       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1499       __ verify_oop(temp);
 1500     } else {
 1501       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1502       __ decode_heap_oop(temp); // calls verify_oop
 1503     }
 1504     __ add(rscratch2, rscratch2, 1);
 1505     __ b(loop);
 1506     __ bind(end);
 1507   }
 1508 
 1509   // Arguments:
 1510   //   stub_id - is used to name the stub and identify all details of
 1511   //             how to perform the copy.
 1512   //
 1513   //   entry - is assigned to the stub's post push entry point unless
 1514   //           it is null
 1515   //
 1516   // Inputs:
 1517   //   c_rarg0   - source array address
 1518   //   c_rarg1   - destination array address
 1519   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1520   //
 1521   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1522   // the hardware handle it.  The two dwords within qwords that span
 1523   // cache line boundaries will still be loaded and stored atomically.
 1524   //
 1525   // Side Effects: entry is set to the (post push) entry point so it
 1526   //               can be used by the corresponding conjoint copy
 1527   //               method
 1528   //
 1529   address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
 1530     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1531     RegSet saved_reg = RegSet::of(s, d, count);
 1532     int size;
 1533     bool aligned;
 1534     bool is_oop;
 1535     bool dest_uninitialized;
 1536     switch (stub_id) {
 1537     case jbyte_disjoint_arraycopy_id:
 1538       size = sizeof(jbyte);
 1539       aligned = false;
 1540       is_oop = false;
 1541       dest_uninitialized = false;
 1542       break;
 1543     case arrayof_jbyte_disjoint_arraycopy_id:
 1544       size = sizeof(jbyte);
 1545       aligned = true;
 1546       is_oop = false;
 1547       dest_uninitialized = false;
 1548       break;
 1549     case jshort_disjoint_arraycopy_id:
 1550       size = sizeof(jshort);
 1551       aligned = false;
 1552       is_oop = false;
 1553       dest_uninitialized = false;
 1554       break;
 1555     case arrayof_jshort_disjoint_arraycopy_id:
 1556       size = sizeof(jshort);
 1557       aligned = true;
 1558       is_oop = false;
 1559       dest_uninitialized = false;
 1560       break;
 1561     case jint_disjoint_arraycopy_id:
 1562       size = sizeof(jint);
 1563       aligned = false;
 1564       is_oop = false;
 1565       dest_uninitialized = false;
 1566       break;
 1567     case arrayof_jint_disjoint_arraycopy_id:
 1568       size = sizeof(jint);
 1569       aligned = true;
 1570       is_oop = false;
 1571       dest_uninitialized = false;
 1572       break;
 1573     case jlong_disjoint_arraycopy_id:
 1574       // since this is always aligned we can (should!) use the same
 1575       // stub as for case arrayof_jlong_disjoint_arraycopy
 1576       ShouldNotReachHere();
 1577       break;
 1578     case arrayof_jlong_disjoint_arraycopy_id:
 1579       size = sizeof(jlong);
 1580       aligned = true;
 1581       is_oop = false;
 1582       dest_uninitialized = false;
 1583       break;
 1584     case oop_disjoint_arraycopy_id:
 1585       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1586       aligned = !UseCompressedOops;
 1587       is_oop = true;
 1588       dest_uninitialized = false;
 1589       break;
 1590     case arrayof_oop_disjoint_arraycopy_id:
 1591       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1592       aligned = !UseCompressedOops;
 1593       is_oop = true;
 1594       dest_uninitialized = false;
 1595       break;
 1596     case oop_disjoint_arraycopy_uninit_id:
 1597       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1598       aligned = !UseCompressedOops;
 1599       is_oop = true;
 1600       dest_uninitialized = true;
 1601       break;
 1602     case arrayof_oop_disjoint_arraycopy_uninit_id:
 1603       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1604       aligned = !UseCompressedOops;
 1605       is_oop = true;
 1606       dest_uninitialized = true;
 1607       break;
 1608     default:
 1609       ShouldNotReachHere();
 1610       break;
 1611     }
 1612 
 1613     __ align(CodeEntryAlignment);
 1614     StubCodeMark mark(this, stub_id);
 1615     address start = __ pc();
 1616     __ enter();
 1617 
 1618     if (entry != nullptr) {
 1619       *entry = __ pc();
 1620       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1621       BLOCK_COMMENT("Entry:");
 1622     }
 1623 
 1624     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1625     if (dest_uninitialized) {
 1626       decorators |= IS_DEST_UNINITIALIZED;
 1627     }
 1628     if (aligned) {
 1629       decorators |= ARRAYCOPY_ALIGNED;
 1630     }
 1631 
 1632     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1633     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1634 
 1635     if (is_oop) {
 1636       // save regs before copy_memory
 1637       __ push(RegSet::of(d, count), sp);
 1638     }
 1639     {
 1640       // UnsafeMemoryAccess page error: continue after unsafe access
 1641       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1642       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1643       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1644     }
 1645 
 1646     if (is_oop) {
 1647       __ pop(RegSet::of(d, count), sp);
 1648       if (VerifyOops)
 1649         verify_oop_array(size, d, count, r16);
 1650     }
 1651 
 1652     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1653 
 1654     __ leave();
 1655     __ mov(r0, zr); // return 0
 1656     __ ret(lr);
 1657     return start;
 1658   }
 1659 
 1660   // Arguments:
 1661   //   stub_id - is used to name the stub and identify all details of
 1662   //             how to perform the copy.
 1663   //
  //   nooverlap_target - identifies the (post push) entry for the
 1665   //             corresponding disjoint copy routine which can be
 1666   //             jumped to if the ranges do not actually overlap
 1667   //
 1668   //   entry - is assigned to the stub's post push entry point unless
 1669   //           it is null
 1670   //
 1671   //
 1672   // Inputs:
 1673   //   c_rarg0   - source array address
 1674   //   c_rarg1   - destination array address
 1675   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1676   //
 1677   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1678   // the hardware handle it.  The two dwords within qwords that span
 1679   // cache line boundaries will still be loaded and stored atomically.
 1680   //
 1681   // Side Effects:
 1682   //   entry is set to the no-overlap entry point so it can be used by
 1683   //   some other conjoint copy method
 1684   //
 1685   address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
 1686     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1687     RegSet saved_regs = RegSet::of(s, d, count);
 1688     int size;
 1689     bool aligned;
 1690     bool is_oop;
 1691     bool dest_uninitialized;
 1692     switch (stub_id) {
 1693     case jbyte_arraycopy_id:
 1694       size = sizeof(jbyte);
 1695       aligned = false;
 1696       is_oop = false;
 1697       dest_uninitialized = false;
 1698       break;
 1699     case arrayof_jbyte_arraycopy_id:
 1700       size = sizeof(jbyte);
 1701       aligned = true;
 1702       is_oop = false;
 1703       dest_uninitialized = false;
 1704       break;
 1705     case jshort_arraycopy_id:
 1706       size = sizeof(jshort);
 1707       aligned = false;
 1708       is_oop = false;
 1709       dest_uninitialized = false;
 1710       break;
 1711     case arrayof_jshort_arraycopy_id:
 1712       size = sizeof(jshort);
 1713       aligned = true;
 1714       is_oop = false;
 1715       dest_uninitialized = false;
 1716       break;
 1717     case jint_arraycopy_id:
 1718       size = sizeof(jint);
 1719       aligned = false;
 1720       is_oop = false;
 1721       dest_uninitialized = false;
 1722       break;
 1723     case arrayof_jint_arraycopy_id:
 1724       size = sizeof(jint);
 1725       aligned = true;
 1726       is_oop = false;
 1727       dest_uninitialized = false;
 1728       break;
 1729     case jlong_arraycopy_id:
 1730       // since this is always aligned we can (should!) use the same
      // stub as for case arrayof_jlong_arraycopy
 1732       ShouldNotReachHere();
 1733       break;
 1734     case arrayof_jlong_arraycopy_id:
 1735       size = sizeof(jlong);
 1736       aligned = true;
 1737       is_oop = false;
 1738       dest_uninitialized = false;
 1739       break;
 1740     case oop_arraycopy_id:
 1741       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1742       aligned = !UseCompressedOops;
 1743       is_oop = true;
 1744       dest_uninitialized = false;
 1745       break;
 1746     case arrayof_oop_arraycopy_id:
 1747       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1748       aligned = !UseCompressedOops;
 1749       is_oop = true;
 1750       dest_uninitialized = false;
 1751       break;
 1752     case oop_arraycopy_uninit_id:
 1753       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1754       aligned = !UseCompressedOops;
 1755       is_oop = true;
 1756       dest_uninitialized = true;
 1757       break;
 1758     case arrayof_oop_arraycopy_uninit_id:
 1759       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1760       aligned = !UseCompressedOops;
 1761       is_oop = true;
 1762       dest_uninitialized = true;
 1763       break;
 1764     default:
 1765       ShouldNotReachHere();
 1766     }
 1767 
 1768     StubCodeMark mark(this, stub_id);
 1769     address start = __ pc();
 1770     __ enter();
 1771 
 1772     if (entry != nullptr) {
 1773       *entry = __ pc();
 1774       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1775       BLOCK_COMMENT("Entry:");
 1776     }
 1777 
 1778     // use fwd copy when (d-s) above_equal (count*size)
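    // (the comparison is unsigned, so when d < s the difference wraps above
    // count*size and the forward path is also taken, which is safe for that
    // direction of overlap)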
 1779     __ sub(rscratch1, d, s);
 1780     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1781     __ br(Assembler::HS, nooverlap_target);
 1782 
 1783     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1784     if (dest_uninitialized) {
 1785       decorators |= IS_DEST_UNINITIALIZED;
 1786     }
 1787     if (aligned) {
 1788       decorators |= ARRAYCOPY_ALIGNED;
 1789     }
 1790 
 1791     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1792     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1793 
 1794     if (is_oop) {
 1795       // save regs before copy_memory
 1796       __ push(RegSet::of(d, count), sp);
 1797     }
 1798     {
 1799       // UnsafeMemoryAccess page error: continue after unsafe access
 1800       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1801       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1802       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1803     }
 1804     if (is_oop) {
 1805       __ pop(RegSet::of(d, count), sp);
 1806       if (VerifyOops)
 1807         verify_oop_array(size, d, count, r16);
 1808     }
 1809     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1810     __ leave();
 1811     __ mov(r0, zr); // return 0
 1812     __ ret(lr);
 1813     return start;
 1814   }
 1815 
 1816   // Helper for generating a dynamic type check.
 1817   // Smashes rscratch1, rscratch2.
 1818   void generate_type_check(Register sub_klass,
 1819                            Register super_check_offset,
 1820                            Register super_klass,
 1821                            Register temp1,
 1822                            Register temp2,
 1823                            Register result,
 1824                            Label& L_success) {
 1825     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1826 
 1827     BLOCK_COMMENT("type_check:");
 1828 
 1829     Label L_miss;
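    // The fast path tests the super type cached at super_check_offset; the
    // slow path searches the secondary supers of sub_klass.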
 1830 
 1831     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1832                                      super_check_offset);
 1833     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1834 
 1835     // Fall through on failure!
 1836     __ BIND(L_miss);
 1837   }
 1838 
 1839   //
 1840   //  Generate checkcasting array copy stub
 1841   //
 1842   //  Input:
 1843   //    c_rarg0   - source array address
 1844   //    c_rarg1   - destination array address
 1845   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1846   //    c_rarg3   - size_t ckoff (super_check_offset)
 1847   //    c_rarg4   - oop ckval (super_klass)
 1848   //
 1849   //  Output:
 1850   //    r0 ==  0  -  success
 1851   //    r0 == -1^K - failure, where K is partial transfer count
 1852   //
 1853   address generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
 1854     bool dest_uninitialized;
 1855     switch (stub_id) {
 1856     case checkcast_arraycopy_id:
 1857       dest_uninitialized = false;
 1858       break;
 1859     case checkcast_arraycopy_uninit_id:
 1860       dest_uninitialized = true;
 1861       break;
 1862     default:
 1863       ShouldNotReachHere();
 1864     }
 1865 
 1866     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1867 
 1868     // Input registers (after setup_arg_regs)
 1869     const Register from        = c_rarg0;   // source array address
 1870     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
 1872     const Register ckoff       = c_rarg3;   // super_check_offset
 1873     const Register ckval       = c_rarg4;   // super_klass
 1874 
 1875     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1876     RegSet wb_post_saved_regs = RegSet::of(count);
 1877 
 1878     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1879     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
 1881     const Register start_to    = r20;       // destination array start address
 1882     const Register r19_klass   = r19;       // oop._klass
 1883 
 1884     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1885     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1886 
 1887     //---------------------------------------------------------------
 1888     // Assembler stub will be used for this call to arraycopy
 1889     // if the two arrays are subtypes of Object[] but the
 1890     // destination array type is not equal to or a supertype
 1891     // of the source type.  Each element must be separately
 1892     // checked.
 1893 
 1894     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1895                                copied_oop, r19_klass, count_save);
 1896 
 1897     __ align(CodeEntryAlignment);
 1898     StubCodeMark mark(this, stub_id);
 1899     address start = __ pc();
 1900 
 1901     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1902 
 1903 #ifdef ASSERT
 1904     // caller guarantees that the arrays really are different
 1905     // otherwise, we would have to make conjoint checks
 1906     { Label L;
 1907       __ b(L);                  // conjoint check not yet implemented
 1908       __ stop("checkcast_copy within a single array");
 1909       __ bind(L);
 1910     }
 1911 #endif //ASSERT
 1912 
 1913     // Caller of this entry point must set up the argument registers.
 1914     if (entry != nullptr) {
 1915       *entry = __ pc();
 1916       BLOCK_COMMENT("Entry:");
 1917     }
 1918 
    // Empty array:  Nothing to do.
 1920     __ cbz(count, L_done);
 1921     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1922 
 1923 #ifdef ASSERT
 1924     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1925     // The ckoff and ckval must be mutually consistent,
 1926     // even though caller generates both.
 1927     { Label L;
 1928       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1929       __ ldrw(start_to, Address(ckval, sco_offset));
 1930       __ cmpw(ckoff, start_to);
 1931       __ br(Assembler::EQ, L);
 1932       __ stop("super_check_offset inconsistent");
 1933       __ bind(L);
 1934     }
 1935 #endif //ASSERT
 1936 
 1937     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1938     bool is_oop = true;
 1939     int element_size = UseCompressedOops ? 4 : 8;
 1940     if (dest_uninitialized) {
 1941       decorators |= IS_DEST_UNINITIALIZED;
 1942     }
 1943 
 1944     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1945     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1946 
 1947     // save the original count
 1948     __ mov(count_save, count);
 1949 
 1950     // Copy from low to high addresses
 1951     __ mov(start_to, to);              // Save destination array start address
 1952     __ b(L_load_element);
 1953 
 1954     // ======== begin loop ========
 1955     // (Loop is rotated; its entry is L_load_element.)
 1956     // Loop control:
 1957     //   for (; count != 0; count--) {
 1958     //     copied_oop = load_heap_oop(from++);
 1959     //     ... generate_type_check ...;
 1960     //     store_heap_oop(to++, copied_oop);
 1961     //   }
 1962     __ align(OptoLoopAlignment);
 1963 
 1964     __ BIND(L_store_element);
 1965     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1966                       __ post(to, element_size), copied_oop, noreg,
 1967                       gct1, gct2, gct3);
 1968     __ sub(count, count, 1);
 1969     __ cbz(count, L_do_card_marks);
 1970 
 1971     // ======== loop entry is here ========
 1972     __ BIND(L_load_element);
 1973     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1974                      copied_oop, noreg, __ post(from, element_size),
 1975                      gct1);
 1976     __ cbz(copied_oop, L_store_element);
 1977 
 1978     __ load_klass(r19_klass, copied_oop);// query the object klass
 1979 
 1980     BLOCK_COMMENT("type_check:");
 1981     generate_type_check(/*sub_klass*/r19_klass,
 1982                         /*super_check_offset*/ckoff,
 1983                         /*super_klass*/ckval,
 1984                         /*r_array_base*/gct1,
 1985                         /*temp2*/gct2,
 1986                         /*result*/r10, L_store_element);
 1987 
 1988     // Fall through on failure!
 1989 
 1990     // ======== end loop ========
 1991 
 1992     // It was a real error; we must depend on the caller to finish the job.
 1993     // Register count = remaining oops, count_orig = total oops.
 1994     // Emit GC store barriers for the oops we have copied and report
 1995     // their number to the caller.
 1996 
 1997     __ subs(count, count_save, count);     // K = partially copied oop count
 1998     __ eon(count, count, zr);              // report (-1^K) to caller
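    // eon does not set flags, so EQ below still reflects the subs above:
    // K == 0 means no oops were copied and card marking can be skipped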
 1999     __ br(Assembler::EQ, L_done_pop);
 2000 
 2001     __ BIND(L_do_card_marks);
 2002     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2003 
 2004     __ bind(L_done_pop);
 2005     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2006     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2007 
 2008     __ bind(L_done);
 2009     __ mov(r0, count);
 2010     __ leave();
 2011     __ ret(lr);
 2012 
 2013     return start;
 2014   }
 2015 
 2016   // Perform range checks on the proposed arraycopy.
  // Kills temp and rscratch1, but nothing else.
  // Also, clears the high 32 bits of src_pos and dst_pos.
 2019   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2020                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
 2022                               Register dst_pos, // destination position (c_rarg3)
 2023                               Register length,
 2024                               Register temp,
 2025                               Label& L_failed) {
 2026     BLOCK_COMMENT("arraycopy_range_checks:");
 2027 
 2028     assert_different_registers(rscratch1, temp);
 2029 
 2030     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2031     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2032     __ addw(temp, length, src_pos);
 2033     __ cmpw(temp, rscratch1);
 2034     __ br(Assembler::HI, L_failed);
 2035 
 2036     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2037     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2038     __ addw(temp, length, dst_pos);
 2039     __ cmpw(temp, rscratch1);
 2040     __ br(Assembler::HI, L_failed);
 2041 
 2042     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2043     __ movw(src_pos, src_pos);
 2044     __ movw(dst_pos, dst_pos);
 2045 
 2046     BLOCK_COMMENT("arraycopy_range_checks done");
 2047   }
 2048 
 2049   // These stubs get called from some dumb test routine.
 2050   // I'll write them properly when they're called from
 2051   // something that's actually doing something.
 2052   static void fake_arraycopy_stub(address src, address dst, int count) {
 2053     assert(count == 0, "huh?");
 2054   }
 2055 
 2056 
 2057   //
 2058   //  Generate 'unsafe' array copy stub
 2059   //  Though just as safe as the other stubs, it takes an unscaled
 2060   //  size_t argument instead of an element count.
 2061   //
 2062   //  Input:
 2063   //    c_rarg0   - source array address
 2064   //    c_rarg1   - destination array address
 2065   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2066   //
 2067   // Examines the alignment of the operands and dispatches
 2068   // to a long, int, short, or byte copy loop.
 2069   //
 2070   address generate_unsafe_copy(address byte_copy_entry,
 2071                                address short_copy_entry,
 2072                                address int_copy_entry,
 2073                                address long_copy_entry) {
 2074     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
 2075 
 2076     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2077     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2078 
 2079     __ align(CodeEntryAlignment);
 2080     StubCodeMark mark(this, stub_id);
 2081     address start = __ pc();
 2082     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2083 
 2084     // bump this on entry, not on exit:
 2085     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2086 
 2087     __ orr(rscratch1, s, d);
 2088     __ orr(rscratch1, rscratch1, count);
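    // rscratch1 now has a bit set wherever s, d or the byte count is
    // misaligned; test it from the coarsest to the finest alignment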
 2089 
 2090     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2091     __ cbz(rscratch1, L_long_aligned);
 2092     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2093     __ cbz(rscratch1, L_int_aligned);
 2094     __ tbz(rscratch1, 0, L_short_aligned);
 2095     __ b(RuntimeAddress(byte_copy_entry));
 2096 
 2097     __ BIND(L_short_aligned);
 2098     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2099     __ b(RuntimeAddress(short_copy_entry));
 2100     __ BIND(L_int_aligned);
 2101     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2102     __ b(RuntimeAddress(int_copy_entry));
 2103     __ BIND(L_long_aligned);
 2104     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2105     __ b(RuntimeAddress(long_copy_entry));
 2106 
 2107     return start;
 2108   }
 2109 
 2110   //
 2111   //  Generate generic array copy stubs
 2112   //
 2113   //  Input:
 2114   //    c_rarg0    -  src oop
 2115   //    c_rarg1    -  src_pos (32-bits)
 2116   //    c_rarg2    -  dst oop
 2117   //    c_rarg3    -  dst_pos (32-bits)
 2118   //    c_rarg4    -  element count (32-bits)
 2119   //
 2120   //  Output:
 2121   //    r0 ==  0  -  success
 2122   //    r0 == -1^K - failure, where K is partial transfer count
 2123   //
 2124   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2125                                 address int_copy_entry, address oop_copy_entry,
 2126                                 address long_copy_entry, address checkcast_copy_entry) {
 2127     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
 2128 
 2129     Label L_failed, L_objArray;
 2130     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2131 
 2132     // Input registers
 2133     const Register src        = c_rarg0;  // source array oop
 2134     const Register src_pos    = c_rarg1;  // source position
 2135     const Register dst        = c_rarg2;  // destination array oop
 2136     const Register dst_pos    = c_rarg3;  // destination position
 2137     const Register length     = c_rarg4;
 2138 
 2139 
 2140     // Registers used as temps
 2141     const Register dst_klass  = c_rarg5;
 2142 
 2143     __ align(CodeEntryAlignment);
 2144 
 2145     StubCodeMark mark(this, stub_id);
 2146 
 2147     address start = __ pc();
 2148 
 2149     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2150 
 2151     // bump this on entry, not on exit:
 2152     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2153 
 2154     //-----------------------------------------------------------------------
 2155     // Assembler stub will be used for this call to arraycopy
 2156     // if the following conditions are met:
 2157     //
 2158     // (1) src and dst must not be null.
 2159     // (2) src_pos must not be negative.
 2160     // (3) dst_pos must not be negative.
 2161     // (4) length  must not be negative.
 2162     // (5) src klass and dst klass should be the same and not null.
 2163     // (6) src and dst should be arrays.
 2164     // (7) src_pos + length must not exceed length of src.
 2165     // (8) dst_pos + length must not exceed length of dst.
 2166     //
 2167 
 2168     //  if (src == nullptr) return -1;
 2169     __ cbz(src, L_failed);
 2170 
 2171     //  if (src_pos < 0) return -1;
 2172     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2173 
 2174     //  if (dst == nullptr) return -1;
 2175     __ cbz(dst, L_failed);
 2176 
 2177     //  if (dst_pos < 0) return -1;
 2178     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2179 
 2180     // registers used as temp
 2181     const Register scratch_length    = r16; // elements count to copy
 2182     const Register scratch_src_klass = r17; // array klass
 2183     const Register lh                = r15; // layout helper
 2184 
 2185     //  if (length < 0) return -1;
 2186     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2187     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2188 
 2189     __ load_klass(scratch_src_klass, src);
 2190 #ifdef ASSERT
 2191     //  assert(src->klass() != nullptr);
 2192     {
 2193       BLOCK_COMMENT("assert klasses not null {");
 2194       Label L1, L2;
 2195       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2196       __ bind(L1);
 2197       __ stop("broken null klass");
 2198       __ bind(L2);
 2199       __ load_klass(rscratch1, dst);
 2200       __ cbz(rscratch1, L1);     // this would be broken also
 2201       BLOCK_COMMENT("} assert klasses not null done");
 2202     }
 2203 #endif
 2204 
 2205     // Load layout helper (32-bits)
 2206     //
 2207     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2208     // 32        30    24            16              8     2                 0
 2209     //
 2210     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2211     //
 2212 
 2213     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2214 
 2215     // Handle objArrays completely differently...
 2216     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2217     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2218     __ movw(rscratch1, objArray_lh);
 2219     __ eorw(rscratch2, lh, rscratch1);
 2220     __ cbzw(rscratch2, L_objArray);
 2221 
 2222     //  if (src->klass() != dst->klass()) return -1;
 2223     __ load_klass(rscratch2, dst);
 2224     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2225     __ cbnz(rscratch2, L_failed);
 2226 
 2227     //  if (!src->is_Array()) return -1;
 2228     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2229 
 2230     // At this point, it is known to be a typeArray (array_tag 0x3).
 2231 #ifdef ASSERT
 2232     {
 2233       BLOCK_COMMENT("assert primitive array {");
 2234       Label L;
 2235       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2236       __ cmpw(lh, rscratch2);
 2237       __ br(Assembler::GE, L);
 2238       __ stop("must be a primitive array");
 2239       __ bind(L);
 2240       BLOCK_COMMENT("} assert primitive array done");
 2241     }
 2242 #endif
 2243 
 2244     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2245                            rscratch2, L_failed);
 2246 
 2247     // TypeArrayKlass
 2248     //
 2249     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2250     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2251     //
 2252 
 2253     const Register rscratch1_offset = rscratch1;    // array offset
 2254     const Register r15_elsize = lh; // element size
 2255 
 2256     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2257            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2258     __ add(src, src, rscratch1_offset);           // src array offset
 2259     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2260     BLOCK_COMMENT("choose copy loop based on element size");
 2261 
 2262     // next registers should be set before the jump to corresponding stub
 2263     const Register from     = c_rarg0;  // source array address
 2264     const Register to       = c_rarg1;  // destination array address
 2265     const Register count    = c_rarg2;  // elements count
 2266 
 2267     // 'from', 'to', 'count' registers should be set in such order
 2268     // since they are the same as 'src', 'src_pos', 'dst'.
 2269 
 2270     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2271 
 2272     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2273     // size in bytes).  We do a simple bitwise binary search.
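    // (elsize 0 = byte, 1 = short, 2 = int, 3 = long: bit 1 selects the
    // int/long half, bit 0 then picks within each pair)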
 2274   __ BIND(L_copy_bytes);
 2275     __ tbnz(r15_elsize, 1, L_copy_ints);
 2276     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2277     __ lea(from, Address(src, src_pos));// src_addr
 2278     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2279     __ movw(count, scratch_length); // length
 2280     __ b(RuntimeAddress(byte_copy_entry));
 2281 
 2282   __ BIND(L_copy_shorts);
 2283     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2284     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2285     __ movw(count, scratch_length); // length
 2286     __ b(RuntimeAddress(short_copy_entry));
 2287 
 2288   __ BIND(L_copy_ints);
 2289     __ tbnz(r15_elsize, 0, L_copy_longs);
 2290     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2291     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2292     __ movw(count, scratch_length); // length
 2293     __ b(RuntimeAddress(int_copy_entry));
 2294 
 2295   __ BIND(L_copy_longs);
 2296 #ifdef ASSERT
 2297     {
 2298       BLOCK_COMMENT("assert long copy {");
 2299       Label L;
 2300       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2301       __ cmpw(r15_elsize, LogBytesPerLong);
 2302       __ br(Assembler::EQ, L);
 2303       __ stop("must be long copy, but elsize is wrong");
 2304       __ bind(L);
 2305       BLOCK_COMMENT("} assert long copy done");
 2306     }
 2307 #endif
 2308     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2309     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2310     __ movw(count, scratch_length); // length
 2311     __ b(RuntimeAddress(long_copy_entry));
 2312 
 2313     // ObjArrayKlass
 2314   __ BIND(L_objArray);
 2315     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2316 
 2317     Label L_plain_copy, L_checkcast_copy;
 2318     //  test array classes for subtyping
 2319     __ load_klass(r15, dst);
 2320     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2321     __ br(Assembler::NE, L_checkcast_copy);
 2322 
 2323     // Identically typed arrays can be copied without element-wise checks.
 2324     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2325                            rscratch2, L_failed);
 2326 
 2327     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2328     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2329     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2330     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2331     __ movw(count, scratch_length); // length
 2332   __ BIND(L_plain_copy);
 2333     __ b(RuntimeAddress(oop_copy_entry));
 2334 
 2335   __ BIND(L_checkcast_copy);
 2336     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2337     {
 2338       // Before looking at dst.length, make sure dst is also an objArray.
 2339       __ ldrw(rscratch1, Address(r15, lh_offset));
 2340       __ movw(rscratch2, objArray_lh);
 2341       __ eorw(rscratch1, rscratch1, rscratch2);
 2342       __ cbnzw(rscratch1, L_failed);
 2343 
 2344       // It is safe to examine both src.length and dst.length.
 2345       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2346                              r15, L_failed);
 2347 
 2348       __ load_klass(dst_klass, dst); // reload
 2349 
 2350       // Marshal the base address arguments now, freeing registers.
 2351       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2352       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2353       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2354       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2355       __ movw(count, length);           // length (reloaded)
 2356       Register sco_temp = c_rarg3;      // this register is free now
 2357       assert_different_registers(from, to, count, sco_temp,
 2358                                  dst_klass, scratch_src_klass);
 2359       // assert_clean_int(count, sco_temp);
 2360 
 2361       // Generate the type check.
 2362       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2363       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2364 
 2365       // Smashes rscratch1, rscratch2
 2366       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2367                           L_plain_copy);
 2368 
 2369       // Fetch destination element klass from the ObjArrayKlass header.
 2370       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2371       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2372       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2373 
 2374       // the checkcast_copy loop needs two extra arguments:
 2375       assert(c_rarg3 == sco_temp, "#3 already in place");
 2376       // Set up arguments for checkcast_copy_entry.
 2377       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2378       __ b(RuntimeAddress(checkcast_copy_entry));
 2379     }
 2380 
 2381   __ BIND(L_failed);
 2382     __ mov(r0, -1);
 2383     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2384     __ ret(lr);
 2385 
 2386     return start;
 2387   }
 2388 
 2389   //
 2390   // Generate stub for array fill. If "aligned" is true, the
 2391   // "to" address is assumed to be heapword aligned.
 2392   //
 2393   // Arguments for generated stub:
 2394   //   to:    c_rarg0
 2395   //   value: c_rarg1
 2396   //   count: c_rarg2 treated as signed
 2397   //
 2398   address generate_fill(StubGenStubId stub_id) {
 2399     BasicType t;
 2400     bool aligned;
 2401 
 2402     switch (stub_id) {
 2403     case jbyte_fill_id:
 2404       t = T_BYTE;
 2405       aligned = false;
 2406       break;
 2407     case jshort_fill_id:
 2408       t = T_SHORT;
 2409       aligned = false;
 2410       break;
 2411     case jint_fill_id:
 2412       t = T_INT;
 2413       aligned = false;
 2414       break;
 2415     case arrayof_jbyte_fill_id:
 2416       t = T_BYTE;
 2417       aligned = true;
 2418       break;
 2419     case arrayof_jshort_fill_id:
 2420       t = T_SHORT;
 2421       aligned = true;
 2422       break;
 2423     case arrayof_jint_fill_id:
 2424       t = T_INT;
 2425       aligned = true;
 2426       break;
 2427     default:
 2428       ShouldNotReachHere();
 2429     };
 2430 
 2431     __ align(CodeEntryAlignment);
 2432     StubCodeMark mark(this, stub_id);
 2433     address start = __ pc();
 2434 
 2435     BLOCK_COMMENT("Entry:");
 2436 
 2437     const Register to        = c_rarg0;  // source array address
 2438     const Register value     = c_rarg1;  // value
 2439     const Register count     = c_rarg2;  // elements count
 2440 
 2441     const Register bz_base = r10;        // base for block_zero routine
 2442     const Register cnt_words = r11;      // temp register
 2443 
 2444     __ enter();
 2445 
 2446     Label L_fill_elements, L_exit1;
 2447 
 2448     int shift = -1;
 2449     switch (t) {
 2450       case T_BYTE:
 2451         shift = 0;
 2452         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2453         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2454         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2455         __ br(Assembler::LO, L_fill_elements);
 2456         break;
 2457       case T_SHORT:
 2458         shift = 1;
 2459         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2460         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2461         __ br(Assembler::LO, L_fill_elements);
 2462         break;
 2463       case T_INT:
 2464         shift = 2;
 2465         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2466         __ br(Assembler::LO, L_fill_elements);
 2467         break;
 2468       default: ShouldNotReachHere();
 2469     }
 2470 
 2471     // Align source address at 8 bytes address boundary.
 2472     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2473     if (!aligned) {
 2474       switch (t) {
 2475         case T_BYTE:
 2476           // One byte misalignment happens only for byte arrays.
 2477           __ tbz(to, 0, L_skip_align1);
 2478           __ strb(value, Address(__ post(to, 1)));
 2479           __ subw(count, count, 1);
 2480           __ bind(L_skip_align1);
 2481           // Fallthrough
 2482         case T_SHORT:
 2483           // Two bytes misalignment happens only for byte and short (char) arrays.
 2484           __ tbz(to, 1, L_skip_align2);
 2485           __ strh(value, Address(__ post(to, 2)));
 2486           __ subw(count, count, 2 >> shift);
 2487           __ bind(L_skip_align2);
 2488           // Fallthrough
 2489         case T_INT:
 2490           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2491           __ tbz(to, 2, L_skip_align4);
 2492           __ strw(value, Address(__ post(to, 4)));
 2493           __ subw(count, count, 4 >> shift);
 2494           __ bind(L_skip_align4);
 2495           break;
 2496         default: ShouldNotReachHere();
 2497       }
 2498     }
 2499 
 2500     //
 2501     //  Fill large chunks
 2502     //
 2503     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2504     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2505     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
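    // count now holds only the elements left over after the word fill below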
 2506     if (UseBlockZeroing) {
 2507       Label non_block_zeroing, rest;
 2508       // If the fill value is zero we can use the fast zero_words().
 2509       __ cbnz(value, non_block_zeroing);
 2510       __ mov(bz_base, to);
 2511       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2512       address tpc = __ zero_words(bz_base, cnt_words);
 2513       if (tpc == nullptr) {
 2514         fatal("CodeCache is full at generate_fill");
 2515       }
 2516       __ b(rest);
 2517       __ bind(non_block_zeroing);
 2518       __ fill_words(to, cnt_words, value);
 2519       __ bind(rest);
 2520     } else {
 2521       __ fill_words(to, cnt_words, value);
 2522     }
 2523 
 2524     // Remaining count is less than 8 bytes. Fill it by a single store.
 2525     // Note that the total length is no less than 8 bytes.
 2526     if (t == T_BYTE || t == T_SHORT) {
 2527       Label L_exit1;
 2528       __ cbzw(count, L_exit1);
 2529       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2530       __ str(value, Address(to, -8));    // overwrite some elements
 2531       __ bind(L_exit1);
 2532       __ leave();
 2533       __ ret(lr);
 2534     }
 2535 
    // Handle fills of less than 8 bytes.
 2537     Label L_fill_2, L_fill_4, L_exit2;
 2538     __ bind(L_fill_elements);
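    // count is small here and value has already been replicated across the
    // register, so the low bits of count select exactly the tail stores needed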
 2539     switch (t) {
 2540       case T_BYTE:
 2541         __ tbz(count, 0, L_fill_2);
 2542         __ strb(value, Address(__ post(to, 1)));
 2543         __ bind(L_fill_2);
 2544         __ tbz(count, 1, L_fill_4);
 2545         __ strh(value, Address(__ post(to, 2)));
 2546         __ bind(L_fill_4);
 2547         __ tbz(count, 2, L_exit2);
 2548         __ strw(value, Address(to));
 2549         break;
 2550       case T_SHORT:
 2551         __ tbz(count, 0, L_fill_4);
 2552         __ strh(value, Address(__ post(to, 2)));
 2553         __ bind(L_fill_4);
 2554         __ tbz(count, 1, L_exit2);
 2555         __ strw(value, Address(to));
 2556         break;
 2557       case T_INT:
 2558         __ cbzw(count, L_exit2);
 2559         __ strw(value, Address(to));
 2560         break;
 2561       default: ShouldNotReachHere();
 2562     }
 2563     __ bind(L_exit2);
 2564     __ leave();
 2565     __ ret(lr);
 2566     return start;
 2567   }
 2568 
 2569   address generate_data_cache_writeback() {
 2570     const Register line        = c_rarg0;  // address of line to write back
 2571 
 2572     __ align(CodeEntryAlignment);
 2573 
 2574     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id;
 2575     StubCodeMark mark(this, stub_id);
 2576 
 2577     address start = __ pc();
 2578     __ enter();
 2579     __ cache_wb(Address(line, 0));
 2580     __ leave();
 2581     __ ret(lr);
 2582 
 2583     return start;
 2584   }
 2585 
 2586   address generate_data_cache_writeback_sync() {
 2587     const Register is_pre     = c_rarg0;  // pre or post sync
 2588 
 2589     __ align(CodeEntryAlignment);
 2590 
 2591     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id;
 2592     StubCodeMark mark(this, stub_id);
 2593 
 2594     // pre wbsync is a no-op
    // post wbsync emits a store fence (the equivalent of an x86 sfence)
 2596 
 2597     Label skip;
 2598     address start = __ pc();
 2599     __ enter();
 2600     __ cbnz(is_pre, skip);
 2601     __ cache_wbsync(false);
 2602     __ bind(skip);
 2603     __ leave();
 2604     __ ret(lr);
 2605 
 2606     return start;
 2607   }
 2608 
 2609   void generate_arraycopy_stubs() {
 2610     address entry;
 2611     address entry_jbyte_arraycopy;
 2612     address entry_jshort_arraycopy;
 2613     address entry_jint_arraycopy;
 2614     address entry_oop_arraycopy;
 2615     address entry_jlong_arraycopy;
 2616     address entry_checkcast_arraycopy;
 2617 
 2618     generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
 2619     generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
 2620 
 2621     generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
 2622     generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
 2623 
 2624     generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
 2625     generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
 2626 
 2627     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2628 
 2629     //*** jbyte
 2630     // Always need aligned and unaligned versions
 2631     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
 2632     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
 2633     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
 2634     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
 2635 
 2636     //*** jshort
 2637     // Always need aligned and unaligned versions
 2638     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
 2639     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
 2640     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
 2641     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
 2642 
 2643     //*** jint
 2644     // Aligned versions
 2645     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
 2646     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2647     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2648     // entry_jint_arraycopy always points to the unaligned version
 2649     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
 2650     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2651 
 2652     //*** jlong
 2653     // It is always aligned
 2654     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
 2655     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
 2656     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2657     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2658 
 2659     //*** oops
 2660     {
 2661       // With compressed oops we need unaligned versions; notice that
 2662       // we overwrite entry_oop_arraycopy.
 2663       bool aligned = !UseCompressedOops;
 2664 
 2665       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2666         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
 2667       StubRoutines::_arrayof_oop_arraycopy
 2668         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
 2669       // Aligned versions without pre-barriers
 2670       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2671         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
 2672       StubRoutines::_arrayof_oop_arraycopy_uninit
 2673         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
 2674     }
 2675 
 2676     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2677     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2678     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2679     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2680 
 2681     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 2682     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
 2683 
 2684     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 2685                                                               entry_jshort_arraycopy,
 2686                                                               entry_jint_arraycopy,
 2687                                                               entry_jlong_arraycopy);
 2688 
 2689     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 2690                                                                entry_jshort_arraycopy,
 2691                                                                entry_jint_arraycopy,
 2692                                                                entry_oop_arraycopy,
 2693                                                                entry_jlong_arraycopy,
 2694                                                                entry_checkcast_arraycopy);
 2695 
 2696     StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
 2697     StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
 2698     StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
 2699     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
 2700     StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
 2701     StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
 2702   }
 2703 
 2704   void generate_math_stubs() { Unimplemented(); }
 2705 
 2706   // Arguments:
 2707   //
 2708   // Inputs:
 2709   //   c_rarg0   - source byte array address
 2710   //   c_rarg1   - destination byte array address
 2711   //   c_rarg2   - K (key) in little endian int array
 2712   //
 2713   address generate_aescrypt_encryptBlock() {
 2714     __ align(CodeEntryAlignment);
 2715     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
 2716     StubCodeMark mark(this, stub_id);
 2717 
 2718     const Register from        = c_rarg0;  // source array address
 2719     const Register to          = c_rarg1;  // destination array address
 2720     const Register key         = c_rarg2;  // key array address
 2721     const Register keylen      = rscratch1;
 2722 
 2723     address start = __ pc();
 2724     __ enter();
 2725 
 2726     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
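    // keylen is the expanded key length in ints: 44, 52 or 60 for
    // AES-128, AES-192 and AES-256 respectively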
 2727 
 2728     __ aesenc_loadkeys(key, keylen);
 2729     __ aesecb_encrypt(from, to, keylen);
 2730 
 2731     __ mov(r0, 0);
 2732 
 2733     __ leave();
 2734     __ ret(lr);
 2735 
 2736     return start;
 2737   }
 2738 
 2739   // Arguments:
 2740   //
 2741   // Inputs:
 2742   //   c_rarg0   - source byte array address
 2743   //   c_rarg1   - destination byte array address
 2744   //   c_rarg2   - K (key) in little endian int array
 2745   //
 2746   address generate_aescrypt_decryptBlock() {
 2747     assert(UseAES, "need AES cryptographic extension support");
 2748     __ align(CodeEntryAlignment);
 2749     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
 2750     StubCodeMark mark(this, stub_id);
 2751     Label L_doLast;
 2752 
 2753     const Register from        = c_rarg0;  // source array address
 2754     const Register to          = c_rarg1;  // destination array address
 2755     const Register key         = c_rarg2;  // key array address
 2756     const Register keylen      = rscratch1;
 2757 
 2758     address start = __ pc();
 2759     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2760 
 2761     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2762 
 2763     __ aesecb_decrypt(from, to, key, keylen);
 2764 
 2765     __ mov(r0, 0);
 2766 
 2767     __ leave();
 2768     __ ret(lr);
 2769 
 2770     return start;
 2771   }
 2772 
 2773   // Arguments:
 2774   //
 2775   // Inputs:
 2776   //   c_rarg0   - source byte array address
 2777   //   c_rarg1   - destination byte array address
 2778   //   c_rarg2   - K (key) in little endian int array
 2779   //   c_rarg3   - r vector byte array address
 2780   //   c_rarg4   - input length
 2781   //
 2782   // Output:
 2783   //   r0        - input length
 2784   //
 2785   address generate_cipherBlockChaining_encryptAESCrypt() {
 2786     assert(UseAES, "need AES cryptographic extension support");
 2787     __ align(CodeEntryAlignment);
 2788     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
 2789     StubCodeMark mark(this, stub_id);
 2790 
 2791     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2792 
 2793     const Register from        = c_rarg0;  // source array address
 2794     const Register to          = c_rarg1;  // destination array address
 2795     const Register key         = c_rarg2;  // key array address
 2796     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address;
 2797                                            // on exit it holds the last ciphertext block written
 2798     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2799     const Register keylen      = rscratch1;
 2800 
 2801     address start = __ pc();
 2802 
 2803       __ enter();
 2804 
 2805       __ movw(rscratch2, len_reg);
 2806 
 2807       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2808 
 2809       __ ld1(v0, __ T16B, rvec);
 2810 
 2811       __ cmpw(keylen, 52);
 2812       __ br(Assembler::CC, L_loadkeys_44);
 2813       __ br(Assembler::EQ, L_loadkeys_52);
 2814 
 2815       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2816       __ rev32(v17, __ T16B, v17);
 2817       __ rev32(v18, __ T16B, v18);
 2818     __ BIND(L_loadkeys_52);
 2819       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2820       __ rev32(v19, __ T16B, v19);
 2821       __ rev32(v20, __ T16B, v20);
 2822     __ BIND(L_loadkeys_44);
 2823       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2824       __ rev32(v21, __ T16B, v21);
 2825       __ rev32(v22, __ T16B, v22);
 2826       __ rev32(v23, __ T16B, v23);
 2827       __ rev32(v24, __ T16B, v24);
 2828       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2829       __ rev32(v25, __ T16B, v25);
 2830       __ rev32(v26, __ T16B, v26);
 2831       __ rev32(v27, __ T16B, v27);
 2832       __ rev32(v28, __ T16B, v28);
 2833       __ ld1(v29, v30, v31, __ T16B, key);
 2834       __ rev32(v29, __ T16B, v29);
 2835       __ rev32(v30, __ T16B, v30);
 2836       __ rev32(v31, __ T16B, v31);
 2837 
 2838     __ BIND(L_aes_loop);
 2839       __ ld1(v1, __ T16B, __ post(from, 16));
 2840       __ eor(v0, __ T16B, v0, v1);
 2841 
            // The condition flags still hold the keylen comparison done before the loop
 2842       __ br(Assembler::CC, L_rounds_44);
 2843       __ br(Assembler::EQ, L_rounds_52);
 2844 
 2845       __ aese(v0, v17); __ aesmc(v0, v0);
 2846       __ aese(v0, v18); __ aesmc(v0, v0);
 2847     __ BIND(L_rounds_52);
 2848       __ aese(v0, v19); __ aesmc(v0, v0);
 2849       __ aese(v0, v20); __ aesmc(v0, v0);
 2850     __ BIND(L_rounds_44);
 2851       __ aese(v0, v21); __ aesmc(v0, v0);
 2852       __ aese(v0, v22); __ aesmc(v0, v0);
 2853       __ aese(v0, v23); __ aesmc(v0, v0);
 2854       __ aese(v0, v24); __ aesmc(v0, v0);
 2855       __ aese(v0, v25); __ aesmc(v0, v0);
 2856       __ aese(v0, v26); __ aesmc(v0, v0);
 2857       __ aese(v0, v27); __ aesmc(v0, v0);
 2858       __ aese(v0, v28); __ aesmc(v0, v0);
 2859       __ aese(v0, v29); __ aesmc(v0, v0);
 2860       __ aese(v0, v30);
 2861       __ eor(v0, __ T16B, v0, v31);
 2862 
 2863       __ st1(v0, __ T16B, __ post(to, 16));
 2864 
 2865       __ subw(len_reg, len_reg, 16);
 2866       __ cbnzw(len_reg, L_aes_loop);
 2867 
 2868       __ st1(v0, __ T16B, rvec);
 2869 
 2870       __ mov(r0, rscratch2);
 2871 
 2872       __ leave();
 2873       __ ret(lr);
 2874 
 2875       return start;
 2876   }
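
        // A scalar sketch of the CBC chaining implemented by the loop above
        // (illustration only, not used by any stub). encrypt_block stands in for
        // the AES rounds held in v17..v31 and is an assumed callback, not a
        // HotSpot API; rvec carries C[i-1] between calls, just as the stub leaves
        // the last ciphertext block in the r vector.
        static void cbc_encrypt_ref(const unsigned char* in, unsigned char* out,
                                    unsigned char rvec[16], int len,
                                    void (*encrypt_block)(unsigned char block[16])) {
          for (int off = 0; off < len; off += 16) {
            for (int i = 0; i < 16; i++) {
              rvec[i] ^= in[off + i];            // P[i] ^ C[i-1]
            }
            encrypt_block(rvec);                 // C[i] = E_K(P[i] ^ C[i-1])
            for (int i = 0; i < 16; i++) {
              out[off + i] = rvec[i];            // emit C[i]; rvec keeps it for the next block
            }
          }
        }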
 2877 
 2878   // Arguments:
 2879   //
 2880   // Inputs:
 2881   //   c_rarg0   - source byte array address
 2882   //   c_rarg1   - destination byte array address
 2883   //   c_rarg2   - K (key) in little endian int array
 2884   //   c_rarg3   - r vector byte array address
 2885   //   c_rarg4   - input length
 2886   //
 2887   // Output:
 2888   //   r0        - input length
 2889   //
 2890   address generate_cipherBlockChaining_decryptAESCrypt() {
 2891     assert(UseAES, "need AES cryptographic extension support");
 2892     __ align(CodeEntryAlignment);
 2893     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
 2894     StubCodeMark mark(this, stub_id);
 2895 
 2896     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2897 
 2898     const Register from        = c_rarg0;  // source array address
 2899     const Register to          = c_rarg1;  // destination array address
 2900     const Register key         = c_rarg2;  // key array address
 2901     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address;
 2902                                            // on exit it holds the last ciphertext block read
 2903     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2904     const Register keylen      = rscratch1;
 2905 
 2906     address start = __ pc();
 2907 
 2908       __ enter();
 2909 
 2910       __ movw(rscratch2, len_reg);
 2911 
 2912       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2913 
 2914       __ ld1(v2, __ T16B, rvec);
 2915 
 2916       __ ld1(v31, __ T16B, __ post(key, 16));
 2917       __ rev32(v31, __ T16B, v31);
 2918 
 2919       __ cmpw(keylen, 52);
 2920       __ br(Assembler::CC, L_loadkeys_44);
 2921       __ br(Assembler::EQ, L_loadkeys_52);
 2922 
 2923       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2924       __ rev32(v17, __ T16B, v17);
 2925       __ rev32(v18, __ T16B, v18);
 2926     __ BIND(L_loadkeys_52);
 2927       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2928       __ rev32(v19, __ T16B, v19);
 2929       __ rev32(v20, __ T16B, v20);
 2930     __ BIND(L_loadkeys_44);
 2931       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2932       __ rev32(v21, __ T16B, v21);
 2933       __ rev32(v22, __ T16B, v22);
 2934       __ rev32(v23, __ T16B, v23);
 2935       __ rev32(v24, __ T16B, v24);
 2936       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2937       __ rev32(v25, __ T16B, v25);
 2938       __ rev32(v26, __ T16B, v26);
 2939       __ rev32(v27, __ T16B, v27);
 2940       __ rev32(v28, __ T16B, v28);
 2941       __ ld1(v29, v30, __ T16B, key);
 2942       __ rev32(v29, __ T16B, v29);
 2943       __ rev32(v30, __ T16B, v30);
 2944 
 2945     __ BIND(L_aes_loop);
 2946       __ ld1(v0, __ T16B, __ post(from, 16));
 2947       __ orr(v1, __ T16B, v0, v0);
 2948 
            // The condition flags still hold the keylen comparison done before the loop
 2949       __ br(Assembler::CC, L_rounds_44);
 2950       __ br(Assembler::EQ, L_rounds_52);
 2951 
 2952       __ aesd(v0, v17); __ aesimc(v0, v0);
 2953       __ aesd(v0, v18); __ aesimc(v0, v0);
 2954     __ BIND(L_rounds_52);
 2955       __ aesd(v0, v19); __ aesimc(v0, v0);
 2956       __ aesd(v0, v20); __ aesimc(v0, v0);
 2957     __ BIND(L_rounds_44);
 2958       __ aesd(v0, v21); __ aesimc(v0, v0);
 2959       __ aesd(v0, v22); __ aesimc(v0, v0);
 2960       __ aesd(v0, v23); __ aesimc(v0, v0);
 2961       __ aesd(v0, v24); __ aesimc(v0, v0);
 2962       __ aesd(v0, v25); __ aesimc(v0, v0);
 2963       __ aesd(v0, v26); __ aesimc(v0, v0);
 2964       __ aesd(v0, v27); __ aesimc(v0, v0);
 2965       __ aesd(v0, v28); __ aesimc(v0, v0);
 2966       __ aesd(v0, v29); __ aesimc(v0, v0);
 2967       __ aesd(v0, v30);
 2968       __ eor(v0, __ T16B, v0, v31);
 2969       __ eor(v0, __ T16B, v0, v2);
 2970 
 2971       __ st1(v0, __ T16B, __ post(to, 16));
 2972       __ orr(v2, __ T16B, v1, v1);
 2973 
 2974       __ subw(len_reg, len_reg, 16);
 2975       __ cbnzw(len_reg, L_aes_loop);
 2976 
 2977       __ st1(v2, __ T16B, rvec);
 2978 
 2979       __ mov(r0, rscratch2);
 2980 
 2981       __ leave();
 2982       __ ret(lr);
 2983 
 2984     return start;
 2985   }
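
        // For decryption the chaining runs the other way: P[i] = D_K(C[i]) ^ C[i-1].
        // That is why the loop above copies the incoming ciphertext block aside
        // (the orr of v0 into v1) before the inverse rounds and only moves it into
        // v2, the saved C[i-1], once the block has been decrypted and XORed. A
        // scalar sketch, with decrypt_block as an assumed callback:
        //
        //   for (int off = 0; off < len; off += 16) {
        //     unsigned char c[16], p[16];
        //     for (int i = 0; i < 16; i++) c[i] = in[off + i];             // keep C[i]
        //     decrypt_block(c, p);                                         // p = D_K(C[i])
        //     for (int i = 0; i < 16; i++) out[off + i] = p[i] ^ rvec[i];  // P[i]
        //     for (int i = 0; i < 16; i++) rvec[i] = c[i];                 // C[i-1] <- C[i]
        //   }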
 2986 
 2987   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 2988   // Inputs: a 128-bit value in `in` and a 64-bit increment in `inc`; both are preserved.
 2989   // The least-significant 64-bit word of the value is in the upper dword of each vector.
 2990   // The lower dword of `inc` must be zero.
 2991   // Output: result
 2992   void be_add_128_64(FloatRegister result, FloatRegister in,
 2993                      FloatRegister inc, FloatRegister tmp) {
 2994     assert_different_registers(result, tmp, inc);
 2995 
 2996     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 2997                                            // the input
 2998     __ cm(__ HI, tmp, __ T2D, inc, result); // Check whether the addition overflowed
 2999     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap the LSD of the comparison result to the MSD
 3000                                            // and the MSD (which must be zero) to the LSD
 3001     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from the MSD if there was an overflow
 3002   }
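
        // Scalar equivalent of be_add_128_64 (illustration only, not used by any
        // stub): after the rev64 byte reversal the 128-bit counter occupies two
        // 64-bit lanes, and the increment is added to the least-significant lane
        // with an explicit carry into the most-significant one.
        static void be_add_128_64_ref(uint64_t& msd, uint64_t& lsd, uint64_t inc) {
          uint64_t sum = lsd + inc;
          if (sum < lsd) {   // unsigned wrap-around == carry out of the low 64 bits
            msd += 1;
          }
          lsd = sum;
        }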
 3003 
 3004   // CTR AES crypt.
 3005   // Arguments:
 3006   //
 3007   // Inputs:
 3008   //   c_rarg0   - source byte array address
 3009   //   c_rarg1   - destination byte array address
 3010   //   c_rarg2   - K (key) in little endian int array
 3011   //   c_rarg3   - counter vector byte array address
 3012   //   c_rarg4   - input length
 3013   //   c_rarg5   - saved encryptedCounter start
 3014   //   c_rarg6   - saved used length
 3015   //
 3016   // Output:
 3017   //   r0       - input length
 3018   //
 3019   address generate_counterMode_AESCrypt() {
 3020     const Register in = c_rarg0;
 3021     const Register out = c_rarg1;
 3022     const Register key = c_rarg2;
 3023     const Register counter = c_rarg3;
 3024     const Register saved_len = c_rarg4, len = r10;
 3025     const Register saved_encrypted_ctr = c_rarg5;
 3026     const Register used_ptr = c_rarg6, used = r12;
 3027 
 3028     const Register offset = r7;
 3029     const Register keylen = r11;
 3030 
 3031     const unsigned char block_size = 16;
 3032     const int bulk_width = 4;
 3033     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3034     // performance with larger data sizes, but it also means that the
 3035     // fast path isn't used until you have at least 8 blocks, and up
 3036     // to 127 bytes of data will be processed on the slow path. For
 3037     // that reason, and also so as not to blow away too much icache, 4
 3038     // blocks seems like a sensible compromise.
 3039 
 3040     // Algorithm:
 3041     //
 3042     //    if (len == 0) {
 3043     //        goto DONE;
 3044     //    }
 3045     //    int result = len;
 3046     //    do {
 3047     //        if (used >= blockSize) {
 3048     //            if (len >= bulk_width * blockSize) {
 3049     //                CTR_large_block();
 3050     //                if (len == 0)
 3051     //                    goto DONE;
 3052     //            }
 3053     //            for (;;) {
 3054     //                16ByteVector v0 = counter;
 3055     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3056     //                used = 0;
 3057     //                if (len < blockSize)
 3058     //                    break;    /* goto NEXT */
 3059     //                16ByteVector v1 = load16Bytes(in, offset);
 3060     //                v1 = v1 ^ encryptedCounter;
 3061     //                store16Bytes(v1, out, offset);
 3062     //                used = blockSize;
 3063     //                offset += blockSize;
 3064     //                len -= blockSize;
 3065     //                if (len == 0)
 3066     //                    goto DONE;
 3067     //            }
 3068     //        }
 3069     //      NEXT:
 3070     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3071     //        len--;
 3072     //    } while (len != 0);
 3073     //  DONE:
 3074     //    return result;
 3075     //
 3076     // CTR_large_block()
 3077     //    Wide bulk encryption of whole blocks.
 3078 
 3079     __ align(CodeEntryAlignment);
 3080     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
 3081     StubCodeMark mark(this, stub_id);
 3082     const address start = __ pc();
 3083     __ enter();
 3084 
 3085     Label DONE, CTR_large_block, large_block_return;
 3086     __ ldrw(used, Address(used_ptr));
 3087     __ cbzw(saved_len, DONE);
 3088 
 3089     __ mov(len, saved_len);
 3090     __ mov(offset, 0);
 3091 
 3092     // Compute #rounds for AES based on the length of the key array
 3093     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3094 
 3095     __ aesenc_loadkeys(key, keylen);
 3096 
 3097     {
 3098       Label L_CTR_loop, NEXT;
 3099 
 3100       __ bind(L_CTR_loop);
 3101 
 3102       __ cmp(used, block_size);
 3103       __ br(__ LO, NEXT);
 3104 
 3105       // Maybe we have a lot of data
 3106       __ subsw(rscratch1, len, bulk_width * block_size);
 3107       __ br(__ HS, CTR_large_block);
 3108       __ BIND(large_block_return);
 3109       __ cbzw(len, DONE);
 3110 
 3111       // Setup the counter
 3112       __ movi(v4, __ T4S, 0);
 3113       __ movi(v5, __ T4S, 1);
 3114       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3115 
 3116       // 128-bit big-endian increment
 3117       __ ld1(v0, __ T16B, counter);
 3118       __ rev64(v16, __ T16B, v0);
 3119       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3120       __ rev64(v16, __ T16B, v16);
 3121       __ st1(v16, __ T16B, counter);
 3122       // Previous counter value is in v0
 3123       // v4 contains { 0, 1 }
 3124 
 3125       {
 3126         // We have fewer than bulk_width blocks of data left. Encrypt
 3127         // them one by one until there is less than a full block
 3128         // remaining, being careful to save both the encrypted counter
 3129         // and the counter.
 3130 
 3131         Label inner_loop;
 3132         __ bind(inner_loop);
 3133         // Counter to encrypt is in v0
 3134         __ aesecb_encrypt(noreg, noreg, keylen);
 3135         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3136 
 3137         // Do we have a remaining full block?
 3138 
 3139         __ mov(used, 0);
 3140         __ cmp(len, block_size);
 3141         __ br(__ LO, NEXT);
 3142 
 3143         // Yes, we have a full block
 3144         __ ldrq(v1, Address(in, offset));
 3145         __ eor(v1, __ T16B, v1, v0);
 3146         __ strq(v1, Address(out, offset));
 3147         __ mov(used, block_size);
 3148         __ add(offset, offset, block_size);
 3149 
 3150         __ subw(len, len, block_size);
 3151         __ cbzw(len, DONE);
 3152 
 3153         // Increment the counter, store it back
 3154         __ orr(v0, __ T16B, v16, v16);
 3155         __ rev64(v16, __ T16B, v16);
 3156         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3157         __ rev64(v16, __ T16B, v16);
 3158         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3159 
 3160         __ b(inner_loop);
 3161       }
 3162 
 3163       __ BIND(NEXT);
 3164 
 3165       // Encrypt a single byte, and loop.
 3166       // We expect this to be a rare event.
 3167       __ ldrb(rscratch1, Address(in, offset));
 3168       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3169       __ eor(rscratch1, rscratch1, rscratch2);
 3170       __ strb(rscratch1, Address(out, offset));
 3171       __ add(offset, offset, 1);
 3172       __ add(used, used, 1);
 3173       __ subw(len, len, 1);
 3174       __ cbnzw(len, L_CTR_loop);
 3175     }
 3176 
 3177     __ bind(DONE);
 3178     __ strw(used, Address(used_ptr));
 3179     __ mov(r0, saved_len);
 3180 
 3181     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3182     __ ret(lr);
 3183 
 3184     // Bulk encryption
 3185 
 3186     __ BIND(CTR_large_block);
 3187     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3188 
 3189     if (bulk_width == 8) {
 3190       __ sub(sp, sp, 4 * 16);
 3191       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3192     }
 3193     __ sub(sp, sp, 4 * 16);
 3194     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3195     RegSet saved_regs = (RegSet::of(in, out, offset)
 3196                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3197     __ push(saved_regs, sp);
 3198     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3199     __ add(in, in, offset);
 3200     __ add(out, out, offset);
 3201 
 3202     // Keys should already be loaded into the correct registers
 3203 
 3204     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3205     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3206 
 3207     // AES/CTR loop
 3208     {
 3209       Label L_CTR_loop;
 3210       __ BIND(L_CTR_loop);
 3211 
 3212       // Setup the counters
 3213       __ movi(v8, __ T4S, 0);
 3214       __ movi(v9, __ T4S, 1);
 3215       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3216 
 3217       for (int i = 0; i < bulk_width; i++) {
 3218         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3219         __ rev64(v0_ofs, __ T16B, v16);
 3220         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3221       }
 3222 
 3223       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3224 
 3225       // Encrypt the counters
 3226       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3227 
 3228       if (bulk_width == 8) {
 3229         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3230       }
 3231 
 3232       // XOR the encrypted counters with the inputs
 3233       for (int i = 0; i < bulk_width; i++) {
 3234         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3235         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3236         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3237       }
 3238 
 3239       // Write the encrypted data
 3240       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3241       if (bulk_width == 8) {
 3242         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3243       }
 3244 
 3245       __ subw(len, len, 16 * bulk_width);
 3246       __ cbnzw(len, L_CTR_loop);
 3247     }
 3248 
 3249     // Save the counter back where it goes
 3250     __ rev64(v16, __ T16B, v16);
 3251     __ st1(v16, __ T16B, counter);
 3252 
 3253     __ pop(saved_regs, sp);
 3254 
 3255     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3256     if (bulk_width == 8) {
 3257       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3258     }
 3259 
 3260     __ andr(rscratch1, len, -16 * bulk_width);
 3261     __ sub(len, len, rscratch1);
 3262     __ add(offset, offset, rscratch1);
 3263     __ mov(used, 16);
 3264     __ strw(used, Address(used_ptr));
 3265     __ b(large_block_return);
 3266 
 3267     return start;
 3268   }
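
        // Byte-granular reference for the stub above (illustration only, not used
        // by any stub; encrypt_block is an assumed callback). It is a rough mirror
        // of the intrinsic's contract: `used` counts how much of the current
        // encrypted counter has been consumed, the 16-byte counter is incremented
        // big-endian (as be_add_128_64 does on the vector path), and the return
        // value is the input length.
        static int ctr_crypt_ref(const unsigned char* in, unsigned char* out, int len,
                                 unsigned char counter[16], unsigned char encrypted_counter[16],
                                 int& used,
                                 void (*encrypt_block)(const unsigned char in[16], unsigned char out[16])) {
          for (int i = 0; i < len; i++) {
            if (used >= 16) {
              encrypt_block(counter, encrypted_counter);
              for (int j = 15; j >= 0 && ++counter[j] == 0; j--) {
                // big-endian 128-bit increment: the carry ripples towards byte 0
              }
              used = 0;
            }
            out[i] = in[i] ^ encrypted_counter[used++];
          }
          return len;
        }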
 3269 
 3270   // Vector AES Galois Counter Mode implementation. Parameters:
 3271   //
 3272   // in = c_rarg0
 3273   // len = c_rarg1
 3274   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3275   // out = c_rarg3
 3276   // key = c_rarg4
 3277   // state = c_rarg5 - GHASH.state
 3278   // subkeyHtbl = c_rarg6 - powers of H
 3279   // counter = c_rarg7 - 16 bytes of CTR
 3280   // return - number of processed bytes
 3281   address generate_galoisCounterMode_AESCrypt() {
 3282     address ghash_polynomial = __ pc();
 3283     __ emit_int64(0x87);  // The low-order 64 bits of the GHASH field
 3284                           // polynomial x^128 + x^7 + x^2 + x + 1,
 3285                           // repeated in the low and high parts of a
 3286                           // 128-bit vector
 3287     __ emit_int64(0x87);
 3288 
 3289     __ align(CodeEntryAlignment);
 3290     StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id;
 3291     StubCodeMark mark(this, stub_id);
 3292     address start = __ pc();
 3293     __ enter();
 3294 
 3295     const Register in = c_rarg0;
 3296     const Register len = c_rarg1;
 3297     const Register ct = c_rarg2;
 3298     const Register out = c_rarg3;
 3300 
 3301     const Register key = c_rarg4;
 3302     const Register state = c_rarg5;
 3303 
 3304     const Register subkeyHtbl = c_rarg6;
 3305 
 3306     const Register counter = c_rarg7;  // 16 bytes of CTR; updated with the incremented counter at the end
 3307 
 3308     const Register keylen = r10;
 3309     // Save state before entering routine
 3310     __ sub(sp, sp, 4 * 16);
 3311     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3312     __ sub(sp, sp, 4 * 16);
 3313     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3314 
 3316     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3317     __ str(len, __ pre(sp, -2 * wordSize));
 3318 
 3319     Label DONE;
 3320     __ cbz(len, DONE);
 3321 
 3322     // Compute #rounds for AES based on the length of the key array
 3323     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3324 
 3325     __ aesenc_loadkeys(key, keylen);
 3326     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3327     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3328 
 3329     // AES/CTR loop
 3330     {
 3331       Label L_CTR_loop;
 3332       __ BIND(L_CTR_loop);
 3333 
 3334       // Setup the counters
 3335       __ movi(v8, __ T4S, 0);
 3336       __ movi(v9, __ T4S, 1);
 3337       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3338 
 3339       assert(v0->encoding() < v8->encoding(), "");
 3340       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3341         FloatRegister f = as_FloatRegister(i);
 3342         __ rev32(f, __ T16B, v16);
 3343         __ addv(v16, __ T4S, v16, v8);
 3344       }
 3345 
 3346       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3347 
 3348       // Encrypt the counters
 3349       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3350 
 3351       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3352 
 3353       // XOR the encrypted counters with the inputs
 3354       for (int i = 0; i < 8; i++) {
 3355         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3356         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3357         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3358       }
 3359       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3360       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3361 
 3362       __ subw(len, len, 16 * 8);
 3363       __ cbnzw(len, L_CTR_loop);
 3364     }
 3365 
 3366     __ rev32(v16, __ T16B, v16);
 3367     __ st1(v16, __ T16B, counter);
 3368 
 3369     __ ldr(len, Address(sp));
 3370     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3371 
 3372     // GHASH/CTR loop
 3373     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3374                                 len, /*unrolls*/4);
 3375 
 3376 #ifdef ASSERT
 3377     { Label L;
 3378       __ cmp(len, (unsigned char)0);
 3379       __ br(Assembler::EQ, L);
 3380       __ stop("stubGenerator: abort");
 3381       __ bind(L);
 3382     }
 3383 #endif
 3384 
 3385     __ bind(DONE);
 3386     // Return the number of bytes processed
 3387     __ ldr(r0, __ post(sp, 2 * wordSize));
 3388 
 3389     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3390     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3391 
 3392     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3393     __ ret(lr);
 3394     return start;
 3395   }
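
        // The GHASH pass above folds each 16-byte ciphertext block into the hash
        // state over GF(2^128); the 0x87 constant emitted at ghash_polynomial is
        // the low half of the reduction polynomial x^128 + x^7 + x^2 + x + 1.
        // A sketch of the per-block update (gf128_mul is an assumed helper; the
        // stub instead calls ghash_processBlocks_wide with precomputed powers of
        // H from subkeyHtbl so it can fold several blocks per iteration):
        //
        //   for each 16-byte ciphertext block c:
        //     state ^= c;                    // XOR the block into the running state
        //     state  = gf128_mul(state, H);  // carry-less multiply, reduced mod the polynomial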
 3396 
 3397   class Cached64Bytes {
 3398   private:
 3399     MacroAssembler *_masm;
 3400     Register _regs[8];
 3401 
 3402   public:
 3403     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 3404       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3405       auto it = rs.begin();
 3406       for (auto &r: _regs) {
 3407         r = *it;
 3408         ++it;
 3409       }
 3410     }
 3411 
 3412     void gen_loads(Register base) {
 3413       for (int i = 0; i < 8; i += 2) {
 3414         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3415       }
 3416     }
 3417 
 3418     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3419     void extract_u32(Register dest, int i) {
 3420       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3421     }
 3422   };
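
        // Usage sketch (this mirrors the MD5 compressor below): eight 64-bit
        // registers cache one 64-byte block, so 32-bit word k of the block lives
        // in _regs[k / 2] at bit offset 32 * (k % 2).
        //
        //   Cached64Bytes cache(_masm, RegSet::of(r14, r15) + RegSet::range(r16, r22) - r18_tls);
        //   cache.gen_loads(buf);             // eight ldp's pull in the whole block
        //   cache.extract_u32(rscratch1, 5);  // word 5 == bits [32, 64) of _regs[2]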
 3423 
 3424   // Utility routines for md5.
 3425   // Clobbers r10 and r11.
 3426   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3427               int k, int s, int t) {
 3428     Register rscratch3 = r10;
 3429     Register rscratch4 = r11;
 3430 
 3431     __ eorw(rscratch3, r3, r4);
 3432     __ movw(rscratch2, t);
 3433     __ andw(rscratch3, rscratch3, r2);
 3434     __ addw(rscratch4, r1, rscratch2);
 3435     reg_cache.extract_u32(rscratch1, k);
 3436     __ eorw(rscratch3, rscratch3, r4);
 3437     __ addw(rscratch4, rscratch4, rscratch1);
 3438     __ addw(rscratch3, rscratch3, rscratch4);
 3439     __ rorw(rscratch2, rscratch3, 32 - s);
 3440     __ addw(r1, rscratch2, r2);
 3441   }
 3442 
 3443   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3444               int k, int s, int t) {
 3445     Register rscratch3 = r10;
 3446     Register rscratch4 = r11;
 3447 
 3448     reg_cache.extract_u32(rscratch1, k);
 3449     __ movw(rscratch2, t);
 3450     __ addw(rscratch4, r1, rscratch2);
 3451     __ addw(rscratch4, rscratch4, rscratch1);
 3452     __ bicw(rscratch2, r3, r4);
 3453     __ andw(rscratch3, r2, r4);
 3454     __ addw(rscratch2, rscratch2, rscratch4);
 3455     __ addw(rscratch2, rscratch2, rscratch3);
 3456     __ rorw(rscratch2, rscratch2, 32 - s);
 3457     __ addw(r1, rscratch2, r2);
 3458   }
 3459 
 3460   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3461               int k, int s, int t) {
 3462     Register rscratch3 = r10;
 3463     Register rscratch4 = r11;
 3464 
 3465     __ eorw(rscratch3, r3, r4);
 3466     __ movw(rscratch2, t);
 3467     __ addw(rscratch4, r1, rscratch2);
 3468     reg_cache.extract_u32(rscratch1, k);
 3469     __ eorw(rscratch3, rscratch3, r2);
 3470     __ addw(rscratch4, rscratch4, rscratch1);
 3471     __ addw(rscratch3, rscratch3, rscratch4);
 3472     __ rorw(rscratch2, rscratch3, 32 - s);
 3473     __ addw(r1, rscratch2, r2);
 3474   }
 3475 
 3476   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3477               int k, int s, int t) {
 3478     Register rscratch3 = r10;
 3479     Register rscratch4 = r11;
 3480 
 3481     __ movw(rscratch3, t);
 3482     __ ornw(rscratch2, r2, r4);
 3483     __ addw(rscratch4, r1, rscratch3);
 3484     reg_cache.extract_u32(rscratch1, k);
 3485     __ eorw(rscratch3, rscratch2, r3);
 3486     __ addw(rscratch4, rscratch4, rscratch1);
 3487     __ addw(rscratch3, rscratch3, rscratch4);
 3488     __ rorw(rscratch2, rscratch3, 32 - s);
 3489     __ addw(r1, rscratch2, r2);
 3490   }
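
        // Each helper above implements one MD5 step,
        //   a = b + rol(a + f(b, c, d) + X[k] + t, s)
        // using branch-free forms of the four round functions that are
        // bit-identical to the textbook definitions:
        //   FF: f = ((c ^ d) & b) ^ d    ==  (b & c) | (~b & d)
        //   GG: f = (b & d) + (c & ~d)   ==  (b & d) | (c & ~d)   (the terms are disjoint)
        //   HH: f = b ^ c ^ d
        //   II: f = c ^ (b | ~d)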
 3491 
 3492   // Arguments:
 3493   //
 3494   // Inputs:
 3495   //   c_rarg0   - byte[]  source+offset
 3496   //   c_rarg1   - int[]   MD5.state
 3497   //   c_rarg2   - int     offset
 3498   //   c_rarg3   - int     limit
 3499   //
 3500   address generate_md5_implCompress(StubGenStubId stub_id) {
 3501     bool multi_block;
 3502     switch (stub_id) {
 3503     case md5_implCompress_id:
 3504       multi_block = false;
 3505       break;
 3506     case md5_implCompressMB_id:
 3507       multi_block = true;
 3508       break;
 3509     default:
 3510       ShouldNotReachHere();
 3511     }
 3512     __ align(CodeEntryAlignment);
 3513 
 3514     StubCodeMark mark(this, stub_id);
 3515     address start = __ pc();
 3516 
 3517     Register buf       = c_rarg0;
 3518     Register state     = c_rarg1;
 3519     Register ofs       = c_rarg2;
 3520     Register limit     = c_rarg3;
 3521     Register a         = r4;
 3522     Register b         = r5;
 3523     Register c         = r6;
 3524     Register d         = r7;
 3525     Register rscratch3 = r10;
 3526     Register rscratch4 = r11;
 3527 
 3528     Register state_regs[2] = { r12, r13 };
 3529     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3530     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3531 
 3532     __ push(saved_regs, sp);
 3533 
 3534     __ ldp(state_regs[0], state_regs[1], Address(state));
 3535     __ ubfx(a, state_regs[0],  0, 32);
 3536     __ ubfx(b, state_regs[0], 32, 32);
 3537     __ ubfx(c, state_regs[1],  0, 32);
 3538     __ ubfx(d, state_regs[1], 32, 32);
 3539 
 3540     Label md5_loop;
 3541     __ BIND(md5_loop);
 3542 
 3543     reg_cache.gen_loads(buf);
 3544 
 3545     // Round 1
 3546     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3547     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3548     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3549     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3550     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3551     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3552     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3553     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3554     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3555     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3556     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3557     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3558     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3559     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3560     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3561     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3562 
 3563     // Round 2
 3564     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3565     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3566     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3567     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3568     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3569     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3570     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3571     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3572     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3573     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3574     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3575     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3576     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3577     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3578     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3579     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3580 
 3581     // Round 3
 3582     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3583     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3584     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3585     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3586     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3587     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3588     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3589     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3590     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3591     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3592     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3593     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3594     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3595     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3596     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3597     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3598 
 3599     // Round 4
 3600     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3601     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3602     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3603     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3604     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3605     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3606     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3607     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3608     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3609     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3610     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3611     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3612     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3613     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3614     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3615     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3616 
 3617     __ addw(a, state_regs[0], a);
 3618     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3619     __ addw(b, rscratch2, b);
 3620     __ addw(c, state_regs[1], c);
 3621     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3622     __ addw(d, rscratch4, d);
 3623 
 3624     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3625     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3626 
 3627     if (multi_block) {
 3628       __ add(buf, buf, 64);
 3629       __ add(ofs, ofs, 64);
 3630       __ cmp(ofs, limit);
 3631       __ br(Assembler::LE, md5_loop);
 3632       __ mov(c_rarg0, ofs); // return ofs
 3633     }
 3634 
 3635     // write hash values back in the correct order
 3636     __ stp(state_regs[0], state_regs[1], Address(state));
 3637 
 3638     __ pop(saved_regs, sp);
 3639 
 3640     __ ret(lr);
 3641 
 3642     return start;
 3643   }
 3644 
 3645   // Arguments:
 3646   //
 3647   // Inputs:
 3648   //   c_rarg0   - byte[]  source+offset
 3649   //   c_rarg1   - int[]   SHA.state
 3650   //   c_rarg2   - int     offset
 3651   //   c_rarg3   - int     limit
 3652   //
 3653   address generate_sha1_implCompress(StubGenStubId stub_id) {
 3654     bool multi_block;
 3655     switch (stub_id) {
 3656     case sha1_implCompress_id:
 3657       multi_block = false;
 3658       break;
 3659     case sha1_implCompressMB_id:
 3660       multi_block = true;
 3661       break;
 3662     default:
 3663       ShouldNotReachHere();
 3664     }
 3665 
 3666     __ align(CodeEntryAlignment);
 3667 
 3668     StubCodeMark mark(this, stub_id);
 3669     address start = __ pc();
 3670 
 3671     Register buf   = c_rarg0;
 3672     Register state = c_rarg1;
 3673     Register ofs   = c_rarg2;
 3674     Register limit = c_rarg3;
 3675 
 3676     Label keys;
 3677     Label sha1_loop;
 3678 
 3679     // load the keys into v0..v3
 3680     __ adr(rscratch1, keys);
 3681     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3682     // load 5 words state into v6, v7
 3683     __ ldrq(v6, Address(state, 0));
 3684     __ ldrs(v7, Address(state, 16));
 3685 
 3686 
 3687     __ BIND(sha1_loop);
 3688     // load 64 bytes of data into v16..v19
 3689     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3690     __ rev32(v16, __ T16B, v16);
 3691     __ rev32(v17, __ T16B, v17);
 3692     __ rev32(v18, __ T16B, v18);
 3693     __ rev32(v19, __ T16B, v19);
 3694 
 3695     // do the sha1
 3696     __ addv(v4, __ T4S, v16, v0);
 3697     __ orr(v20, __ T16B, v6, v6);
 3698 
 3699     FloatRegister d0 = v16;
 3700     FloatRegister d1 = v17;
 3701     FloatRegister d2 = v18;
 3702     FloatRegister d3 = v19;
 3703 
 3704     for (int round = 0; round < 20; round++) {
 3705       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3706       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3707       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3708       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3709       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3710 
 3711       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3712       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3713       __ sha1h(tmp2, __ T4S, v20);
 3714       if (round < 5)
 3715         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3716       else if (round < 10 || round >= 15)
 3717         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3718       else
 3719         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3720       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3721 
 3722       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3723     }
 3724 
 3725     __ addv(v7, __ T2S, v7, v21);
 3726     __ addv(v6, __ T4S, v6, v20);
 3727 
 3728     if (multi_block) {
 3729       __ add(ofs, ofs, 64);
 3730       __ cmp(ofs, limit);
 3731       __ br(Assembler::LE, sha1_loop);
 3732       __ mov(c_rarg0, ofs); // return ofs
 3733     }
 3734 
 3735     __ strq(v6, Address(state, 0));
 3736     __ strs(v7, Address(state, 16));
 3737 
 3738     __ ret(lr);
 3739 
 3740     __ bind(keys);
 3741     __ emit_int32(0x5a827999);
 3742     __ emit_int32(0x6ed9eba1);
 3743     __ emit_int32(0x8f1bbcdc);
 3744     __ emit_int32(0xca62c1d6);
 3745 
 3746     return start;
 3747   }
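
        // Each iteration of the round loop above retires four SHA-1 rounds, so
        // the 20 iterations cover the usual 80-round schedule:
        //   iterations  0..4    rounds  0..19   Ch     (sha1c, constant 0x5a827999)
        //   iterations  5..9    rounds 20..39   Parity (sha1p, constant 0x6ed9eba1)
        //   iterations 10..14   rounds 40..59   Maj    (sha1m, constant 0x8f1bbcdc)
        //   iterations 15..19   rounds 60..79   Parity (sha1p, constant 0xca62c1d6)
        // The key thresholds (round < 4 / 9 / 14) are off by one because each
        // iteration prepares the schedule-plus-constant vector for the next one.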
 3748 
 3749 
 3750   // Arguments:
 3751   //
 3752   // Inputs:
 3753   //   c_rarg0   - byte[]  source+offset
 3754   //   c_rarg1   - int[]   SHA.state
 3755   //   c_rarg2   - int     offset
 3756   //   c_rarg3   - int     limit
 3757   //
 3758   address generate_sha256_implCompress(StubGenStubId stub_id) {
 3759     bool multi_block;
 3760     switch (stub_id) {
 3761     case sha256_implCompress_id:
 3762       multi_block = false;
 3763       break;
 3764     case sha256_implCompressMB_id:
 3765       multi_block = true;
 3766       break;
 3767     default:
 3768       ShouldNotReachHere();
 3769     }
 3770 
 3771     static const uint32_t round_consts[64] = {
 3772       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3773       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3774       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3775       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3776       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3777       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3778       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3779       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3780       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3781       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3782       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3783       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3784       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3785       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3786       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3787       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3788     };
 3789 
 3790     __ align(CodeEntryAlignment);
 3791 
 3792     StubCodeMark mark(this, stub_id);
 3793     address start = __ pc();
 3794 
 3795     Register buf   = c_rarg0;
 3796     Register state = c_rarg1;
 3797     Register ofs   = c_rarg2;
 3798     Register limit = c_rarg3;
 3799 
 3800     Label sha1_loop;
 3801 
 3802     __ stpd(v8, v9, __ pre(sp, -32));
 3803     __ stpd(v10, v11, Address(sp, 16));
 3804 
 3805 // dga == v0
 3806 // dgb == v1
 3807 // dg0 == v2
 3808 // dg1 == v3
 3809 // dg2 == v4
 3810 // t0 == v6
 3811 // t1 == v7
 3812 
 3813     // load 16 keys to v16..v31
 3814     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3815     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3816     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3817     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3818     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3819 
 3820     // load 8 words (256 bits) state
 3821     __ ldpq(v0, v1, state);
 3822 
 3823     __ BIND(sha1_loop);
 3824     // load 64 bytes of data into v8..v11
 3825     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3826     __ rev32(v8, __ T16B, v8);
 3827     __ rev32(v9, __ T16B, v9);
 3828     __ rev32(v10, __ T16B, v10);
 3829     __ rev32(v11, __ T16B, v11);
 3830 
 3831     __ addv(v6, __ T4S, v8, v16);
 3832     __ orr(v2, __ T16B, v0, v0);
 3833     __ orr(v3, __ T16B, v1, v1);
 3834 
 3835     FloatRegister d0 = v8;
 3836     FloatRegister d1 = v9;
 3837     FloatRegister d2 = v10;
 3838     FloatRegister d3 = v11;
 3839 
 3840 
 3841     for (int round = 0; round < 16; round++) {
 3842       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 3843       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 3844       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 3845       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 3846 
 3847       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 3848        __ orr(v4, __ T16B, v2, v2);
 3849       if (round < 15)
 3850         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 3851       __ sha256h(v2, __ T4S, v3, tmp2);
 3852       __ sha256h2(v3, __ T4S, v4, tmp2);
 3853       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 3854 
 3855       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3856     }
 3857 
 3858     __ addv(v0, __ T4S, v0, v2);
 3859     __ addv(v1, __ T4S, v1, v3);
 3860 
 3861     if (multi_block) {
 3862       __ add(ofs, ofs, 64);
 3863       __ cmp(ofs, limit);
 3864       __ br(Assembler::LE, sha1_loop);
 3865       __ mov(c_rarg0, ofs); // return ofs
 3866     }
 3867 
 3868     __ ldpd(v10, v11, Address(sp, 16));
 3869     __ ldpd(v8, v9, __ post(sp, 32));
 3870 
 3871     __ stpq(v0, v1, state);
 3872 
 3873     __ ret(lr);
 3874 
 3875     return start;
 3876   }
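
        // Each iteration of the loop above retires four of SHA-256's 64 rounds.
        // The constants K[0..63] sit four-at-a-time in v16..v31; the addv before
        // the loop consumes the first group and as_FloatRegister(round + 17)
        // (guarded by round < 15) prepares the group for the next iteration while
        // the sha256h/sha256h2 pair applies the current one to the state. The
        // sha256su0/sha256su1 pair extends the message schedule in place,
        //   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
        // four words at a time, which is why it stops after round 11 (rounds
        // 48..63 need no further schedule words).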
 3877 
 3878   // Double rounds for sha512.
 3879   void sha512_dround(int dr,
 3880                      FloatRegister vi0, FloatRegister vi1,
 3881                      FloatRegister vi2, FloatRegister vi3,
 3882                      FloatRegister vi4, FloatRegister vrc0,
 3883                      FloatRegister vrc1, FloatRegister vin0,
 3884                      FloatRegister vin1, FloatRegister vin2,
 3885                      FloatRegister vin3, FloatRegister vin4) {
 3886       if (dr < 36) {
 3887         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 3888       }
 3889       __ addv(v5, __ T2D, vrc0, vin0);
 3890       __ ext(v6, __ T16B, vi2, vi3, 8);
 3891       __ ext(v5, __ T16B, v5, v5, 8);
 3892       __ ext(v7, __ T16B, vi1, vi2, 8);
 3893       __ addv(vi3, __ T2D, vi3, v5);
 3894       if (dr < 32) {
 3895         __ ext(v5, __ T16B, vin3, vin4, 8);
 3896         __ sha512su0(vin0, __ T2D, vin1);
 3897       }
 3898       __ sha512h(vi3, __ T2D, v6, v7);
 3899       if (dr < 32) {
 3900         __ sha512su1(vin0, __ T2D, vin2, v5);
 3901       }
 3902       __ addv(vi4, __ T2D, vi1, vi3);
 3903       __ sha512h2(vi3, __ T2D, vi1, vi0);
 3904   }
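
        // Each call above covers two of SHA-512's 80 rounds, so the 40 calls in
        // the compression below handle a full block. For dr < 32 the message
        // schedule is still being extended in place,
        //   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
        // (the sha512su0/sha512su1 pair, two words at a time), and for dr < 36
        // the next pair of round constants is prefetched into vrc1 from rscratch2.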
 3905 
 3906   // Arguments:
 3907   //
 3908   // Inputs:
 3909   //   c_rarg0   - byte[]  source+offset
 3910   //   c_rarg1   - int[]   SHA.state
 3911   //   c_rarg2   - int     offset
 3912   //   c_rarg3   - int     limit
 3913   //
 3914   address generate_sha512_implCompress(StubGenStubId stub_id) {
 3915     bool multi_block;
 3916     switch (stub_id) {
 3917     case sha512_implCompress_id:
 3918       multi_block = false;
 3919       break;
 3920     case sha512_implCompressMB_id:
 3921       multi_block = true;
 3922       break;
 3923     default:
 3924       ShouldNotReachHere();
 3925     }
 3926 
 3927     static const uint64_t round_consts[80] = {
 3928       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 3929       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 3930       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 3931       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 3932       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 3933       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 3934       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 3935       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 3936       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 3937       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 3938       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 3939       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 3940       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 3941       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 3942       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 3943       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 3944       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 3945       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 3946       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 3947       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 3948       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 3949       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 3950       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 3951       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 3952       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 3953       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 3954       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 3955     };
 3956 
 3957     __ align(CodeEntryAlignment);
 3958 
 3959     StubCodeMark mark(this, stub_id);
 3960     address start = __ pc();
 3961 
 3962     Register buf   = c_rarg0;
 3963     Register state = c_rarg1;
 3964     Register ofs   = c_rarg2;
 3965     Register limit = c_rarg3;
 3966 
 3967     __ stpd(v8, v9, __ pre(sp, -64));
 3968     __ stpd(v10, v11, Address(sp, 16));
 3969     __ stpd(v12, v13, Address(sp, 32));
 3970     __ stpd(v14, v15, Address(sp, 48));
 3971 
 3972     Label sha512_loop;
 3973 
 3974     // load state
 3975     __ ld1(v8, v9, v10, v11, __ T2D, state);
 3976 
 3977     // load first 4 round constants
 3978     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3979     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 3980 
 3981     __ BIND(sha512_loop);
 3982     // load 128B of data into v12..v19
 3983     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 3984     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 3985     __ rev64(v12, __ T16B, v12);
 3986     __ rev64(v13, __ T16B, v13);
 3987     __ rev64(v14, __ T16B, v14);
 3988     __ rev64(v15, __ T16B, v15);
 3989     __ rev64(v16, __ T16B, v16);
 3990     __ rev64(v17, __ T16B, v17);
 3991     __ rev64(v18, __ T16B, v18);
 3992     __ rev64(v19, __ T16B, v19);
 3993 
 3994     __ mov(rscratch2, rscratch1);
 3995 
 3996     __ mov(v0, __ T16B, v8);
 3997     __ mov(v1, __ T16B, v9);
 3998     __ mov(v2, __ T16B, v10);
 3999     __ mov(v3, __ T16B, v11);
 4000 
 4001     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4002     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4003     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4004     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4005     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4006     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4007     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4008     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4009     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4010     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4011     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4012     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4013     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4014     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4015     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4016     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4017     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4018     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4019     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4020     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4021     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4022     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4023     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4024     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4025     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4026     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4027     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4028     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4029     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4030     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4031     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4032     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4033     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4034     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4035     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4036     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4037     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4038     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4039     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4040     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4041 
 4042     __ addv(v8, __ T2D, v8, v0);
 4043     __ addv(v9, __ T2D, v9, v1);
 4044     __ addv(v10, __ T2D, v10, v2);
 4045     __ addv(v11, __ T2D, v11, v3);
 4046 
 4047     if (multi_block) {
 4048       __ add(ofs, ofs, 128);
 4049       __ cmp(ofs, limit);
 4050       __ br(Assembler::LE, sha512_loop);
 4051       __ mov(c_rarg0, ofs); // return ofs
 4052     }
 4053 
 4054     __ st1(v8, v9, v10, v11, __ T2D, state);
 4055 
 4056     __ ldpd(v14, v15, Address(sp, 48));
 4057     __ ldpd(v12, v13, Address(sp, 32));
 4058     __ ldpd(v10, v11, Address(sp, 16));
 4059     __ ldpd(v8, v9, __ post(sp, 64));
 4060 
 4061     __ ret(lr);
 4062 
 4063     return start;
 4064   }
 4065 
 4066   // Execute one round of keccak of two computations in parallel.
 4067   // One of the states should be loaded into the lower halves of
 4068   // the vector registers v0-v24, the other should be loaded into
 4069   // the upper halves of those registers. The ld1r instruction loads
 4070   // the round constant into both halves of register v31.
 4071   // Intermediate results c0...c5 and d0...d5 are computed
 4072   // in registers v25...v30.
 4073   // All vector instructions that are used operate on both register
 4074   // halves in parallel.
 4075   // If only a single computation is needed, it suffices to load just the lower halves.
 4076   void keccak_round(Register rscratch1) {
 4077   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
 4078   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
 4079   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4080   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4081   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4082   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4083   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4084   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4085   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4086   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4087 
 4088   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4089   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4090   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4091   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4092   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4093 
 4094   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4095   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
 4096   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4097   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4098   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4099   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4100   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4101   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4102   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4103   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4104   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4105   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4106   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4107   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4108   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4109   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4110   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4111   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4112   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4113   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4114   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4115   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4116   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4117   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4118   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4119 
 4120   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4121   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4122   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4123   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4124   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4125 
 4126   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4127 
 4128   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4129   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4130   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4131   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4132   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4133 
 4134   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4135   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4136   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4137   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4138   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4139 
 4140   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4141   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4142   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4143   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4144   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4145 
 4146   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4147   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4148   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4149   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4150   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4151 
 4152   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4153   }
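
      // For reference, one round of Keccak-f[1600] over a plain uint64_t[25]
      // state (lane index = x + 5*y) can be written as the C sketch below
      // (illustrative only, not compiled here). The vector code above fuses
      // theta/rho into the eor3/rax1/xar sequence, handles pi by the choice of
      // destination registers, and implements chi/iota with bcax/eor, keeping
      // the permuted lanes in registers instead of a scratch array.
      //
      //   static const int rho[5][5] = {            // rotation offsets rho[x][y]
      //     { 0, 36,  3, 41, 18}, { 1, 44, 10, 45,  2}, {62,  6, 43, 15, 61},
      //     {28, 55, 25, 21, 56}, {27, 20, 39,  8, 14}
      //   };
      //   static uint64_t rotl64(uint64_t v, int s) {
      //     return s == 0 ? v : (v << s) | (v >> (64 - s));
      //   }
      //   static void keccak_round_ref(uint64_t a[25], uint64_t rc) {
      //     uint64_t c[5], d[5], b[25];
      //     for (int x = 0; x < 5; x++) {           // theta
      //       c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
      //     }
      //     for (int x = 0; x < 5; x++) {
      //       d[x] = c[(x + 4) % 5] ^ rotl64(c[(x + 1) % 5], 1);
      //     }
      //     for (int x = 0; x < 5; x++) {           // rho + pi
      //       for (int y = 0; y < 5; y++) {
      //         b[y + 5 * ((2 * x + 3 * y) % 5)] = rotl64(a[x + 5 * y] ^ d[x], rho[x][y]);
      //       }
      //     }
      //     for (int x = 0; x < 5; x++) {           // chi
      //       for (int y = 0; y < 5; y++) {
      //         a[x + 5 * y] = b[x + 5 * y] ^ (~b[(x + 1) % 5 + 5 * y] & b[(x + 2) % 5 + 5 * y]);
      //       }
      //     }
      //     a[0] ^= rc;                             // iota
      //   }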
 4154 
 4155   // Arguments:
 4156   //
 4157   // Inputs:
 4158   //   c_rarg0   - byte[]  source+offset
 4159   //   c_rarg1   - byte[]  SHA.state
 4160   //   c_rarg2   - int     block_size
 4161   //   c_rarg3   - int     offset
 4162   //   c_rarg4   - int     limit
 4163   //
 4164   address generate_sha3_implCompress(StubGenStubId stub_id) {
 4165     bool multi_block;
 4166     switch (stub_id) {
 4167     case sha3_implCompress_id:
 4168       multi_block = false;
 4169       break;
 4170     case sha3_implCompressMB_id:
 4171       multi_block = true;
 4172       break;
 4173     default:
 4174       ShouldNotReachHere();
 4175     }
 4176 
 4177     static const uint64_t round_consts[24] = {
 4178       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4179       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4180       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4181       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4182       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4183       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4184       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4185       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4186     };
 4187 
 4188     __ align(CodeEntryAlignment);
 4189 
 4190     StubCodeMark mark(this, stub_id);
 4191     address start = __ pc();
 4192 
 4193     Register buf           = c_rarg0;
 4194     Register state         = c_rarg1;
 4195     Register block_size    = c_rarg2;
 4196     Register ofs           = c_rarg3;
 4197     Register limit         = c_rarg4;
 4198 
 4199     Label sha3_loop, rounds24_loop;
 4200     Label sha3_512_or_sha3_384, shake128;
 4201 
 4202     __ stpd(v8, v9, __ pre(sp, -64));
 4203     __ stpd(v10, v11, Address(sp, 16));
 4204     __ stpd(v12, v13, Address(sp, 32));
 4205     __ stpd(v14, v15, Address(sp, 48));
 4206 
 4207     // load state
 4208     __ add(rscratch1, state, 32);
 4209     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4210     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4211     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4212     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4213     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4214     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4215     __ ld1(v24, __ T1D, rscratch1);
 4216 
 4217     __ BIND(sha3_loop);
 4218 
 4219     // 24 keccak rounds
 4220     __ movw(rscratch2, 24);
 4221 
 4222     // load round_constants base
 4223     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4224 
 4225     // load input
 4226     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4227     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4228     __ eor(v0, __ T8B, v0, v25);
 4229     __ eor(v1, __ T8B, v1, v26);
 4230     __ eor(v2, __ T8B, v2, v27);
 4231     __ eor(v3, __ T8B, v3, v28);
 4232     __ eor(v4, __ T8B, v4, v29);
 4233     __ eor(v5, __ T8B, v5, v30);
 4234     __ eor(v6, __ T8B, v6, v31);
 4235 
 4236     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4237     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4238 
 4239     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4240     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4241     __ eor(v7, __ T8B, v7, v25);
 4242     __ eor(v8, __ T8B, v8, v26);
 4243     __ eor(v9, __ T8B, v9, v27);
 4244     __ eor(v10, __ T8B, v10, v28);
 4245     __ eor(v11, __ T8B, v11, v29);
 4246     __ eor(v12, __ T8B, v12, v30);
 4247     __ eor(v13, __ T8B, v13, v31);
 4248 
 4249     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4250     __ eor(v14, __ T8B, v14, v25);
 4251     __ eor(v15, __ T8B, v15, v26);
 4252     __ eor(v16, __ T8B, v16, v27);
 4253 
 4254     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4255     __ andw(c_rarg5, block_size, 48);
 4256     __ cbzw(c_rarg5, rounds24_loop);
 4257 
 4258     __ tbnz(block_size, 5, shake128);
 4259     // block_size == 144, bit5 == 0, SHA3-224
 4260     __ ldrd(v28, __ post(buf, 8));
 4261     __ eor(v17, __ T8B, v17, v28);
 4262     __ b(rounds24_loop);
 4263 
 4264     __ BIND(shake128);
 4265     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4266     __ eor(v17, __ T8B, v17, v28);
 4267     __ eor(v18, __ T8B, v18, v29);
 4268     __ eor(v19, __ T8B, v19, v30);
 4269     __ eor(v20, __ T8B, v20, v31);
 4270     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4271 
 4272     __ BIND(sha3_512_or_sha3_384);
 4273     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4274     __ eor(v7, __ T8B, v7, v25);
 4275     __ eor(v8, __ T8B, v8, v26);
 4276     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4277 
 4278     // SHA3-384
 4279     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4280     __ eor(v9,  __ T8B, v9,  v27);
 4281     __ eor(v10, __ T8B, v10, v28);
 4282     __ eor(v11, __ T8B, v11, v29);
 4283     __ eor(v12, __ T8B, v12, v30);
 4284 
 4285     __ BIND(rounds24_loop);
 4286     __ subw(rscratch2, rscratch2, 1);
 4287 
 4288     keccak_round(rscratch1);
 4289 
 4290     __ cbnzw(rscratch2, rounds24_loop);
 4291 
 4292     if (multi_block) {
 4293       __ add(ofs, ofs, block_size);
 4294       __ cmp(ofs, limit);
 4295       __ br(Assembler::LE, sha3_loop);
 4296       __ mov(c_rarg0, ofs); // return ofs
 4297     }
 4298 
 4299     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4300     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4301     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4302     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4303     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4304     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4305     __ st1(v24, __ T1D, state);
 4306 
 4307     // restore callee-saved registers
 4308     __ ldpd(v14, v15, Address(sp, 48));
 4309     __ ldpd(v12, v13, Address(sp, 32));
 4310     __ ldpd(v10, v11, Address(sp, 16));
 4311     __ ldpd(v8, v9, __ post(sp, 64));
 4312 
 4313     __ ret(lr);
 4314 
 4315     return start;
 4316   }
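
      // For reference, the block_size tests above dispatch on the SHA3/SHAKE
      // rate in bytes: 72 (SHA3-512), 104 (SHA3-384), 136 (SHA3-256/SHAKE256),
      // 144 (SHA3-224) and 168 (SHAKE128). A rough scalar sketch of the same
      // dispatch, with the hypothetical xor_in(n) standing for "xor the next n
      // input bytes into the state lanes":
      //
      //   xor_in(56);                                // lanes a0..a6, common to all rates
      //   if ((block_size & 0x80) == 0) {            // tbz bit 7: rate 72 or 104
      //     xor_in(16);                              //   -> 72  (SHA3-512)
      //     if (block_size & 0x20) xor_in(32);       //   -> 104 (SHA3-384)
      //   } else {
      //     xor_in(80);                              //   -> 136 (SHA3-256/SHAKE256)
      //     if (block_size & 0x30) {                 //   not 136
      //       if (block_size & 0x20) xor_in(32);     //   -> 168 (SHAKE128)
      //       else                   xor_in(8);      //   -> 144 (SHA3-224)
      //     }
      //   }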
 4317 
 4318   // Inputs:
 4319   //   c_rarg0   - long[]  state0
 4320   //   c_rarg1   - long[]  state1
 4321   address generate_double_keccak() {
 4322     static const uint64_t round_consts[24] = {
 4323       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4324       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4325       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4326       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4327       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4328       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4329       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4330       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4331     };
 4332 
 4333     // Implements the double_keccak() method of the
 4334     // sun.security.provider.SHA3Parallel class
 4335     __ align(CodeEntryAlignment);
 4336     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4337     address start = __ pc();
 4338     __ enter();
 4339 
 4340     Register state0        = c_rarg0;
 4341     Register state1        = c_rarg1;
 4342 
 4343     Label rounds24_loop;
 4344 
 4345     // save callee-saved registers
 4346     __ stpd(v8, v9, __ pre(sp, -64));
 4347     __ stpd(v10, v11, Address(sp, 16));
 4348     __ stpd(v12, v13, Address(sp, 32));
 4349     __ stpd(v14, v15, Address(sp, 48));
 4350 
 4351     // load states
 4352     __ add(rscratch1, state0, 32);
 4353     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4354     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4355     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4356     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4357     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4358     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4359     __ ld1(v24, __ D, 0, rscratch1);
 4360     __ add(rscratch1, state1, 32);
 4361     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4362     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4363     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4364     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4365     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4366     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4367     __ ld1(v24, __ D, 1, rscratch1);
 4368 
 4369     // 24 keccak rounds
 4370     __ movw(rscratch2, 24);
 4371 
 4372     // load round_constants base
 4373     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4374 
 4375     __ BIND(rounds24_loop);
 4376     __ subw(rscratch2, rscratch2, 1);
 4377     keccak_round(rscratch1);
 4378     __ cbnzw(rscratch2, rounds24_loop);
 4379 
 4380     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4381     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4382     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4383     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4384     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4385     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4386     __ st1(v24, __ D, 0, state0);
 4387     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4388     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4389     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4390     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4391     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4392     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4393     __ st1(v24, __ D, 1, state1);
 4394 
 4395     // restore callee-saved vector registers
 4396     __ ldpd(v14, v15, Address(sp, 48));
 4397     __ ldpd(v12, v13, Address(sp, 32));
 4398     __ ldpd(v10, v11, Address(sp, 16));
 4399     __ ldpd(v8, v9, __ post(sp, 64));
 4400 
 4401     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4402     __ mov(r0, zr); // return 0
 4403     __ ret(lr);
 4404 
 4405     return start;
 4406   }
 4407 
 4408   /**
 4409    *  Arguments:
 4410    *
 4411    * Inputs:
 4412    *   c_rarg0   - int crc
 4413    *   c_rarg1   - byte* buf
 4414    *   c_rarg2   - int length
 4415    *
 4416    * Output:
 4417    *       r0    - int crc result
 4418    */
 4419   address generate_updateBytesCRC32() {
 4420     assert(UseCRC32Intrinsics, "what are we doing here?");
 4421 
 4422     __ align(CodeEntryAlignment);
 4423     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
 4424     StubCodeMark mark(this, stub_id);
 4425 
 4426     address start = __ pc();
 4427 
 4428     const Register crc   = c_rarg0;  // crc
 4429     const Register buf   = c_rarg1;  // source java byte array address
 4430     const Register len   = c_rarg2;  // length
 4431     const Register table0 = c_rarg3; // crc_table address
 4432     const Register table1 = c_rarg4;
 4433     const Register table2 = c_rarg5;
 4434     const Register table3 = c_rarg6;
 4435     const Register tmp3 = c_rarg7;
 4436 
 4437     BLOCK_COMMENT("Entry:");
 4438     __ enter(); // required for proper stackwalking of RuntimeStub frame
 4439 
 4440     __ kernel_crc32(crc, buf, len,
 4441               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 4442 
 4443     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4444     __ ret(lr);
 4445 
 4446     return start;
 4447   }
 4448 
 4449   // ChaCha20 block function.  This version parallelizes 4 quarter
 4450   // round operations at a time.  It uses 16 SIMD registers to
 4451   // produce 4 blocks of key stream.
 4452   //
 4453   // state (int[16]) = c_rarg0
 4454   // keystream (byte[256]) = c_rarg1
 4455   // return - number of bytes of keystream (always 256)
 4456   //
 4457   // In this approach, we load the 512-bit start state sequentially into
 4458   // 4 128-bit vectors.  We then make 4 4-vector copies of that starting
 4459   // state, with each successive set of 4 vectors having a +1 added into
 4460   // the first 32-bit lane of the 4th vector in that group (the counter).
 4461   // By doing this, we can perform the block function on 4 512-bit blocks
 4462   // within one run of this intrinsic.
 4463   // The alignment of the data across the 4-vector group is such that at
 4464   // the start it is already aligned for the first round of each two-round
 4465   // loop iteration.  In other words, the corresponding lanes of each vector
 4466   // will contain the values needed for that quarter round operation (e.g.
 4467   // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.).
 4468   // In between each full round, a lane shift must occur.  Within a loop
 4469   // iteration, between the first and second rounds, the 2nd, 3rd, and 4th
 4470   // vectors are rotated left 32, 64 and 96 bits, respectively.  The result
 4471   // is effectively a diagonal orientation in columnar form.  After the
 4472   // second full round, those registers are left-rotated again, this time
 4473   // 96, 64, and 32 bits - returning the vectors to their columnar organization.
 4474   // After all 10 iterations, the original state is added to each 4-vector
 4475   // working state along with the add mask, and the 4 vector groups are
 4476   // sequentially written to the memory dedicated for the output key stream.
 4477   //
 4478   // For a more detailed explanation, see Goll and Gueron, "Vectorization of
 4479   // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology:
 4480   // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33
 4481   address generate_chacha20Block_qrpar() {
 4482     Label L_Q_twoRounds, L_Q_cc20_const;
 4483     // The constant data is broken into two 128-bit segments to be loaded
 4484     // onto SIMD registers.  The first 128 bits are a counter add overlay
 4485     // that adds +1/+0/+0/+0 to the vectors holding replicated state[12].
 4486     // The second 128 bits are a table constant used for 8-bit left rotations
 4487     // on 32-bit lanes within a SIMD register.
 4488     __ BIND(L_Q_cc20_const);
 4489     __ emit_int64(0x0000000000000001UL);
 4490     __ emit_int64(0x0000000000000000UL);
 4491     __ emit_int64(0x0605040702010003UL);
 4492     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4493 
 4494     __ align(CodeEntryAlignment);
 4495     StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
 4496     StubCodeMark mark(this, stub_id);
 4497     address start = __ pc();
 4498     __ enter();
 4499 
 4500     const Register state = c_rarg0;
 4501     const Register keystream = c_rarg1;
 4502     const Register loopCtr = r10;
 4503     const Register tmpAddr = r11;
 4504 
 4505     const FloatRegister aState = v0;
 4506     const FloatRegister bState = v1;
 4507     const FloatRegister cState = v2;
 4508     const FloatRegister dState = v3;
 4509     const FloatRegister a1Vec = v4;
 4510     const FloatRegister b1Vec = v5;
 4511     const FloatRegister c1Vec = v6;
 4512     const FloatRegister d1Vec = v7;
 4513     // Skip the callee-saved registers v8 - v15
 4514     const FloatRegister a2Vec = v16;
 4515     const FloatRegister b2Vec = v17;
 4516     const FloatRegister c2Vec = v18;
 4517     const FloatRegister d2Vec = v19;
 4518     const FloatRegister a3Vec = v20;
 4519     const FloatRegister b3Vec = v21;
 4520     const FloatRegister c3Vec = v22;
 4521     const FloatRegister d3Vec = v23;
 4522     const FloatRegister a4Vec = v24;
 4523     const FloatRegister b4Vec = v25;
 4524     const FloatRegister c4Vec = v26;
 4525     const FloatRegister d4Vec = v27;
 4526     const FloatRegister scratch = v28;
 4527     const FloatRegister addMask = v29;
 4528     const FloatRegister lrot8Tbl = v30;
 4529 
 4530     // Load the initial state in the first 4 quadword registers,
 4531     // then copy the initial state into the next 4 quadword registers
 4532     // that will be used for the working state.
 4533     __ ld1(aState, bState, cState, dState, __ T16B, Address(state));
 4534 
 4535     // Load the index register for 2 constant 128-bit data fields.
 4536     // The first represents the +1/+0/+0/+0 add mask.  The second is
 4537     // the 8-bit left rotation.
 4538     __ adr(tmpAddr, L_Q_cc20_const);
 4539     __ ldpq(addMask, lrot8Tbl, Address(tmpAddr));
 4540 
 4541     __ mov(a1Vec, __ T16B, aState);
 4542     __ mov(b1Vec, __ T16B, bState);
 4543     __ mov(c1Vec, __ T16B, cState);
 4544     __ mov(d1Vec, __ T16B, dState);
 4545 
 4546     __ mov(a2Vec, __ T16B, aState);
 4547     __ mov(b2Vec, __ T16B, bState);
 4548     __ mov(c2Vec, __ T16B, cState);
 4549     __ addv(d2Vec, __ T4S, d1Vec, addMask);
 4550 
 4551     __ mov(a3Vec, __ T16B, aState);
 4552     __ mov(b3Vec, __ T16B, bState);
 4553     __ mov(c3Vec, __ T16B, cState);
 4554     __ addv(d3Vec, __ T4S, d2Vec, addMask);
 4555 
 4556     __ mov(a4Vec, __ T16B, aState);
 4557     __ mov(b4Vec, __ T16B, bState);
 4558     __ mov(c4Vec, __ T16B, cState);
 4559     __ addv(d4Vec, __ T4S, d3Vec, addMask);
 4560 
 4561     // Set up the 10 iteration loop
 4562     __ mov(loopCtr, 10);
 4563     __ BIND(L_Q_twoRounds);
 4564 
 4565     // The first set of operations on the vectors covers the first 4 quarter
 4566     // round operations:
 4567     //  Qround(state, 0, 4, 8,12)
 4568     //  Qround(state, 1, 5, 9,13)
 4569     //  Qround(state, 2, 6,10,14)
 4570     //  Qround(state, 3, 7,11,15)
 4571     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
 4572     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
 4573     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
 4574     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
 4575 
 4576     // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to
 4577     // diagonals. The a1Vec does not need to change orientation.
 4578     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true);
 4579     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true);
 4580     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true);
 4581     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true);
 4582 
 4583     // The second set of operations on the vectors covers the second 4 quarter
 4584     // round operations, now acting on the diagonals:
 4585     //  Qround(state, 0, 5,10,15)
 4586     //  Qround(state, 1, 6,11,12)
 4587     //  Qround(state, 2, 7, 8,13)
 4588     //  Qround(state, 3, 4, 9,14)
 4589     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
 4590     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
 4591     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
 4592     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
 4593 
 4594     // Before we start the next iteration, we need to perform shuffles
 4595     // on the b/c/d vectors to move them back to columnar organizations
 4596     // from their current diagonal orientation.
 4597     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false);
 4598     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false);
 4599     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false);
 4600     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false);
 4601 
 4602     // Decrement and iterate
 4603     __ sub(loopCtr, loopCtr, 1);
 4604     __ cbnz(loopCtr, L_Q_twoRounds);
 4605 
 4606     // Once the counter reaches zero, we fall out of the loop
 4607     // and need to add the initial state back into the working state
 4608     // represented by the a/b/c/d1Vec registers.  This is destructive
 4609     // on the dState register but we no longer will need it.
 4610     __ addv(a1Vec, __ T4S, a1Vec, aState);
 4611     __ addv(b1Vec, __ T4S, b1Vec, bState);
 4612     __ addv(c1Vec, __ T4S, c1Vec, cState);
 4613     __ addv(d1Vec, __ T4S, d1Vec, dState);
 4614 
 4615     __ addv(a2Vec, __ T4S, a2Vec, aState);
 4616     __ addv(b2Vec, __ T4S, b2Vec, bState);
 4617     __ addv(c2Vec, __ T4S, c2Vec, cState);
 4618     __ addv(dState, __ T4S, dState, addMask);
 4619     __ addv(d2Vec, __ T4S, d2Vec, dState);
 4620 
 4621     __ addv(a3Vec, __ T4S, a3Vec, aState);
 4622     __ addv(b3Vec, __ T4S, b3Vec, bState);
 4623     __ addv(c3Vec, __ T4S, c3Vec, cState);
 4624     __ addv(dState, __ T4S, dState, addMask);
 4625     __ addv(d3Vec, __ T4S, d3Vec, dState);
 4626 
 4627     __ addv(a4Vec, __ T4S, a4Vec, aState);
 4628     __ addv(b4Vec, __ T4S, b4Vec, bState);
 4629     __ addv(c4Vec, __ T4S, c4Vec, cState);
 4630     __ addv(dState, __ T4S, dState, addMask);
 4631     __ addv(d4Vec, __ T4S, d4Vec, dState);
 4632 
 4633     // Write the final state back to the result buffer
 4634     __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64));
 4635     __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64));
 4636     __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64));
 4637     __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64));
 4638 
 4639     __ mov(r0, 256);             // Return length of output keystream
 4640     __ leave();
 4641     __ ret(lr);
 4642 
 4643     return start;
 4644   }
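
      // For reference, each cc20_quarter_round call above applies the standard
      // ChaCha20 quarter round lane-wise across a 4S vector group. A scalar
      // sketch (illustrative only; rotl32 is a plain 32-bit left rotation, the
      // 8-bit case being what the lrot8Tbl constant implements via tbl):
      //
      //   static uint32_t rotl32(uint32_t v, int s) {
      //     return (v << s) | (v >> (32 - s));
      //   }
      //   static void quarter_round(uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d) {
      //     a += b; d ^= a; d = rotl32(d, 16);
      //     c += d; b ^= c; b = rotl32(b, 12);
      //     a += b; d ^= a; d = rotl32(d, 8);
      //     c += d; b ^= c; b = rotl32(b, 7);
      //   }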
 4645 
 4646   // Helpers to schedule parallel operation bundles across vector
 4647   // register sequences of size 2, 4 or 8.
 4648 
 4649   // Implement various primitive computations across vector sequences
 4650 
 4651   template<int N>
 4652   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4653                const VSeq<N>& v1, const VSeq<N>& v2) {
 4654     for (int i = 0; i < N; i++) {
 4655       __ addv(v[i], T, v1[i], v2[i]);
 4656     }
 4657   }
 4658 
 4659   template<int N>
 4660   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4661                const VSeq<N>& v1, const VSeq<N>& v2) {
 4662     for (int i = 0; i < N; i++) {
 4663       __ subv(v[i], T, v1[i], v2[i]);
 4664     }
 4665   }
 4666 
 4667   template<int N>
 4668   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4669                const VSeq<N>& v1, const VSeq<N>& v2) {
 4670     for (int i = 0; i < N; i++) {
 4671       __ mulv(v[i], T, v1[i], v2[i]);
 4672     }
 4673   }
 4674 
 4675   template<int N>
 4676   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4677     for (int i = 0; i < N; i++) {
 4678       __ negr(v[i], T, v1[i]);
 4679     }
 4680   }
 4681 
 4682   template<int N>
 4683   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4684                const VSeq<N>& v1, int shift) {
 4685     for (int i = 0; i < N; i++) {
 4686       __ sshr(v[i], T, v1[i], shift);
 4687     }
 4688   }
 4689 
 4690   template<int N>
 4691   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4692     for (int i = 0; i < N; i++) {
 4693       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4694     }
 4695   }
 4696 
 4697   template<int N>
 4698   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4699     for (int i = 0; i < N; i++) {
 4700       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4701     }
 4702   }
 4703 
 4704   template<int N>
 4705   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4706     for (int i = 0; i < N; i++) {
 4707       __ notr(v[i], __ T16B, v1[i]);
 4708     }
 4709   }
 4710 
 4711   // load N/2 successive pairs of quadword values from memory in order
 4712   // into N successive vector registers of the sequence via the
 4713   // address supplied in base.
 4714   template<int N>
 4715   void vs_ldpq(const VSeq<N>& v, Register base) {
 4716     for (int i = 0; i < N; i += 2) {
 4717       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4718     }
 4719   }
 4720 
 4721   // load N/2 successive pairs of quadword values from memory in order
 4722   // into N vector registers of the sequence via the address supplied
 4723   // in base using post-increment addressing
 4724   template<int N>
 4725   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4726     for (int i = 0; i < N; i += 2) {
 4727       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4728     }
 4729   }
 4730 
 4731   // store N successive vector registers of the sequence into N/2
 4732   // successive pairs of quadword memory locations via the address
 4733   // supplied in base using post-increment addressing
 4734   template<int N>
 4735   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4736     for (int i = 0; i < N; i += 2) {
 4737       __ stpq(v[i], v[i+1], __ post(base, 32));
 4738     }
 4739   }
 4740 
 4741   // load N/2 pairs of quadword values from memory into N vector
 4742   // registers via the address supplied in base with each pair indexed
 4743   // using the start offset plus the corresponding entry in the
 4744   // offsets array
 4745   template<int N>
 4746   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4747     for (int i = 0; i < N/2; i++) {
 4748       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4749     }
 4750   }
 4751 
 4752   // store N vector registers into N/2 pairs of quadword memory
 4753   // locations via the address supplied in base with each pair indexed
 4754   // using the start offset plus the corresponding entry in the
 4755   // offsets array
 4756   template<int N>
 4757   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 4758     for (int i = 0; i < N/2; i++) {
 4759       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4760     }
 4761   }
 4762 
 4763   // load N single quadword values from memory into N vector registers
 4764   // via the address supplied in base with each value indexed using
 4765   // the start offset plus the corresponding entry in the offsets
 4766   // array
 4767   template<int N>
 4768   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4769                       int start, int (&offsets)[N]) {
 4770     for (int i = 0; i < N; i++) {
 4771       __ ldr(v[i], T, Address(base, start + offsets[i]));
 4772     }
 4773   }
 4774 
 4775   // store N vector registers into N single quadword memory locations
 4776   // via the address supplied in base with each value indexed using
 4777   // the start offset plus the corresponding entry in the offsets
 4778   // array
 4779   template<int N>
 4780   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4781                       int start, int (&offsets)[N]) {
 4782     for (int i = 0; i < N; i++) {
 4783       __ str(v[i], T, Address(base, start + offsets[i]));
 4784     }
 4785   }
 4786 
 4787   // load N/2 pairs of quadword values from memory de-interleaved into
 4788   // N vector registers 2 at a time via the address supplied in base
 4789   // with each pair indexed using the start offset plus the
 4790   // corresponding entry in the offsets array
 4791   template<int N>
 4792   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4793                       Register tmp, int start, int (&offsets)[N/2]) {
 4794     for (int i = 0; i < N/2; i++) {
 4795       __ add(tmp, base, start + offsets[i]);
 4796       __ ld2(v[2*i], v[2*i+1], T, tmp);
 4797     }
 4798   }
 4799 
 4800   // store N vector registers 2 at a time interleaved into N/2 pairs
 4801   // of quadword memory locations via the address supplied in base
 4802   // with each pair indexed using the start offset plus the
 4803   // corresponding entry in the offsets array
 4804   template<int N>
 4805   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4806                       Register tmp, int start, int (&offsets)[N/2]) {
 4807     for (int i = 0; i < N/2; i++) {
 4808       __ add(tmp, base, start + offsets[i]);
 4809       __ st2(v[2*i], v[2*i+1], T, tmp);
 4810     }
 4811   }
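
      // Usage note (illustrative): a VSeq<N> names a sequence of N vector
      // registers starting at a given base, e.g. VSeq<8> vs(16) covers
      // v16..v23; constructing one with a zero step, as in VSeq<8> vconst(29, 0)
      // below, appears to name the same register N times, which is how a single
      // broadcast constant is fed to the 8-wide helpers. Each vs_* helper above
      // simply emits one instruction per sequence element, so for example
      // vs_addv(va, __ T4S, vb, vc) with N == 8 expands to eight addv
      // instructions.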
 4812 
 4813   // Helper routines for various flavours of dilithium montgomery
 4814   // multiply
 4815 
 4816   // Perform 16 32-bit Montgomery multiplications in parallel
 4817   // See the montMul() method of the sun.security.provider.ML_DSA class.
 4818   //
 4819   // Computes 4x4S results
 4820   //    a = b * c * 2^-32 mod MONT_Q
 4821   // Inputs:  vb, vc - 4x4S vector register sequences
 4822   //          vq - 2x4S constants <MONT_Q, MONT_Q_INV_MOD_R>
 4823   // Temps:   vtmp - 4x4S vector sequence trashed after call
 4824   // Outputs: va - 4x4S vector register sequences
 4825   // vb, vc, vtmp and vq must all be disjoint
 4826   // va must be disjoint from all other inputs/temps or must equal vc
 4827   // n.b. MONT_R_BITS is 32, so the right shift by it is implicit.
 4828   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 4829                     const VSeq<4>& vtmp, const VSeq<2>& vq) {
 4830     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 4831     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 4832     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 4833 
 4834     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 4835     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 4836 
 4837     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 4838 
 4839     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 4840     assert(vs_disjoint(va, vb), "va and vb overlap");
 4841     assert(vs_disjoint(va, vq), "va and vq overlap");
 4842     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 4843 
 4844     // schedule 4 streams of instructions across the vector sequences
 4845     for (int i = 0; i < 4; i++) {
 4846       __ sqdmulh(vtmp[i], __ T4S, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 4847       __ mulv(va[i], __ T4S, vb[i], vc[i]);    // aLow = lo32(b * c)
 4848     }
 4849 
 4850     for (int i = 0; i < 4; i++) {
 4851       __ mulv(va[i], __ T4S, va[i], vq[0]);     // m = aLow * qinv
 4852     }
 4853 
 4854     for (int i = 0; i < 4; i++) {
 4855       __ sqdmulh(va[i], __ T4S, va[i], vq[1]);  // n = hi32(2 * m * q)
 4856     }
 4857 
 4858     for (int i = 0; i < 4; i++) {
 4859       __ shsubv(va[i], __ T4S, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 4860     }
 4861   }
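
      // For reference, the scalar operation performed per lane is a signed
      // Montgomery multiplication. A sketch, assuming qinv (MONT_Q_INV_MOD_R)
      // is q^-1 mod 2^32; the vector code above computes the same value via
      // sqdmulh/shsubv, which work in terms of doubled high halves:
      //
      //   static int32_t mont_mul(int32_t b, int32_t c, int32_t q, int32_t qinv) {
      //     int64_t prod = (int64_t)b * c;
      //     int32_t m = (int32_t)((uint32_t)prod * (uint32_t)qinv); // lo32(b*c) * qinv mod 2^32
      //     return (int32_t)((prod - (int64_t)m * q) >> 32);        // == b*c*2^-32 (mod q)
      //   }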
 4862 
 4863   // Perform 2x16 32-bit Montgomery multiplications in parallel
 4864   // See the montMul() method of the sun.security.provider.ML_DSA class.
 4865   //
 4866   // Computes 8x4S results
 4867   //    a = b * c * 2^-32 mod MONT_Q
 4868   // Inputs:  vb, vc - 8x4S vector register sequences
 4869   //          vq - 2x4S constants <MONT_Q, MONT_Q_INV_MOD_R>
 4870   // Temps:   vtmp - 4x4S vector sequence trashed after call
 4871   // Outputs: va - 8x4S vector register sequences
 4872   // vb, vc, vtmp and vq must all be disjoint
 4873   // va must be disjoint from all other inputs/temps or must equal vc
 4874   // n.b. MONT_R_BITS is 32, so the right shift by it is implicit.
 4875   void vs_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 4876                     const VSeq<4>& vtmp, const VSeq<2>& vq) {
 4877     // vb, vc, vtmp and vq must be disjoint. va must either be
 4878     // disjoint from all other registers or equal vc
 4879 
 4880     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 4881     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 4882     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 4883 
 4884     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 4885     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 4886 
 4887     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 4888 
 4889     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 4890     assert(vs_disjoint(va, vb), "va and vb overlap");
 4891     assert(vs_disjoint(va, vq), "va and vq overlap");
 4892     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 4893 
 4894     // we need to multiply the front and back halves of each sequence
 4895     // 4x4S at a time because
 4896     //
 4897     // 1) we are currently only able to get 4-way instruction
 4898     // parallelism at best
 4899     //
 4900     // 2) we need registers for the constants in vq and temporary
 4901     // scratch registers to hold intermediate results so vtmp can only
 4902     // be a VSeq<4> which means we only have 4 scratch slots
 4903 
 4904     dilithium_montmul16(vs_front(va), vs_front(vb), vs_front(vc), vtmp, vq);
 4905     dilithium_montmul16(vs_back(va), vs_back(vb), vs_back(vc), vtmp, vq);
 4906   }
 4907 
 4908   // perform combined montmul then add/sub on 4x4S vectors
 4909 
 4910   void dilithium_montmul16_sub_add(const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 4911                                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 4912     // compute a = montmul(a1, c)
 4913     dilithium_montmul16(vc, va1, vc, vtmp, vq);
 4914     // output a1 = a0 - a
 4915     vs_subv(va1, __ T4S, va0, vc);
 4916     //    and a0 = a0 + a
 4917     vs_addv(va0, __ T4S, va0, vc);
 4918   }
 4919 
 4920   // perform combined add/sub then montmul on 4x4S vectors
 4921 
 4922   void dilithium_sub_add_montmul16(const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 4923                                    const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 4924     // compute c = a0 - a1
 4925     vs_subv(vtmp1, __ T4S, va0, va1);
 4926     // output a0 = a0 + a1
 4927     vs_addv(va0, __ T4S, va0, va1);
 4928     // output a1 = b montmul c
 4929     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 4930   }
 4931 
 4932   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 4933   // in the Java implementation come in sequences of at least 8, so we
 4934   // can use ldpq to collect the corresponding data into pairs of vector
 4935   // registers.
 4936   // We collect the coefficients corresponding to the 'j+l' indexes into
 4937   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 4938   // then we do the (Montgomery) multiplications by the zetas in parallel
 4939   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 4940   // v0-v7, then do the additions into v24-v31 and the subtractions into
 4941   // v0-v7 and finally save the results back to the coeffs array.
 4942   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 4943     const Register coeffs, const Register zetas) {
 4944     int c1 = 0;
 4945     int c2 = 512;
 4946     int startIncr;
 4947     // don't use callee save registers v8 - v15
 4948     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 4949     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 4950     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 4951     int offsets[4] = { 0, 32, 64, 96 };
 4952 
 4953     for (int level = 0; level < 5; level++) {
 4954       int c1Start = c1;
 4955       int c2Start = c2;
 4956       if (level == 3) {
 4957         offsets[1] = 32;
 4958         offsets[2] = 128;
 4959         offsets[3] = 160;
 4960       } else if (level == 4) {
 4961         offsets[1] = 64;
 4962         offsets[2] = 128;
 4963         offsets[3] = 192;
 4964       }
 4965 
 4966       // for levels 1 - 4 we simply load 2 x 4 adjacent values at a
 4967       // time at 4 different offsets and multiply them in order by the
 4968       // next set of input values. So we employ indexed load and store
 4969       // pair instructions with arrangement 4S
 4970       for (int i = 0; i < 4; i++) {
 4971         // reload q and qinv
 4972         vs_ldpq(vq, dilithiumConsts); // qInv, q
 4973         // load 8x4S coefficients via second start pos == c2
 4974         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 4975         // load next 8x4S inputs == b
 4976         vs_ldpq_post(vs2, zetas);
 4977         // compute a == c2 * b mod MONT_Q
 4978         vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 4979         // load 8x4s coefficients via first start pos == c1
 4980         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 4981         // compute a1 =  c1 + a
 4982         vs_addv(vs3, __ T4S, vs1, vs2);
 4983         // compute a2 =  c1 - a
 4984         vs_subv(vs1, __ T4S, vs1, vs2);
 4985         // output a1 and a2
 4986         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 4987         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 4988 
 4989         int k = 4 * level + i;
 4990 
 4991         if (k > 7) {
 4992           startIncr = 256;
 4993         } else if (k == 5) {
 4994           startIncr = 384;
 4995         } else {
 4996           startIncr = 128;
 4997         }
 4998 
 4999         c1Start += startIncr;
 5000         c2Start += startIncr;
 5001       }
 5002 
 5003       c2 /= 2;
 5004     }
 5005   }
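
      // For reference, the Java loop being vectorized above performs standard
      // Cooley-Tukey butterflies, roughly as in the sketch below (montMul is
      // the Java Montgomery multiply, cf. the scalar sketch above; l halves
      // from level to level):
      //
      //   for (int j = start; j < start + l; j++) {
      //     int a = montMul(zeta, coeffs[j + l]);
      //     coeffs[j + l] = coeffs[j] - a;
      //     coeffs[j]     = coeffs[j] + a;
      //   }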
 5006 
 5007   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 5008   // Implements the method
 5009   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
 5010   // of the Java class sun.security.provider.ML_DSA
 5011   //
 5012   // coeffs (int[256]) = c_rarg0
 5013   // zetas (int[256]) = c_rarg1
 5014   address generate_dilithiumAlmostNtt() {
 5015 
 5016     __ align(CodeEntryAlignment);
 5017     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
 5018     StubCodeMark mark(this, stub_id);
 5019     address start = __ pc();
 5020     __ enter();
 5021 
 5022     const Register coeffs = c_rarg0;
 5023     const Register zetas = c_rarg1;
 5024 
 5025     const Register tmpAddr = r9;
 5026     const Register dilithiumConsts = r10;
 5027     const Register result = r11;
 5028     // don't use callee save registers v8 - v15
 5029     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5030     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 5031     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5032     int offsets[4] = {0, 32, 64, 96};
 5033     int offsets1[8] = {16, 48, 80, 112, 144, 176, 208, 240 };
 5034     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5035     __ add(result, coeffs, 0);
 5036     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5037 
 5038     // Each level represents one iteration of the outer for loop of the Java version
 5039 
 5040     // level 0-4
 5041     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 5042 
 5043     // level 5
 5044 
 5045     // at level 5 the coefficients we need to combine with the zetas
 5046     // are grouped in memory in blocks of size 4. So, for both sets of
 5047     // coefficients we load 4 adjacent values at 8 different offsets
 5048     // using an indexed ldr with register variant Q and multiply them
 5049     // in sequence order by the next set of inputs. Likewise we store
 5050     // the results using an indexed str with register variant Q.
 5051     for (int i = 0; i < 1024; i += 256) {
 5052       // reload constants q, qinv each iteration as they get clobbered later
 5053       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5054       // load 32 (8x4S) coefficients via first offsets = c1
 5055       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 5056       // load next 32 (8x4S) inputs = b
 5057       vs_ldpq_post(vs2, zetas);
 5058       // a = b montmul c1
 5059       vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 5060       // load 32 (8x4S) coefficients via second offsets = c2
 5061       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 5062       // add/sub with result of multiply
 5063       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
 5064       vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 5065       // write back new coefficients using same offsets
 5066       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 5067       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 5068     }
 5069 
 5070     // level 6
 5071     // at level 6 the coefficients we need to combine with the zetas
 5072     // are grouped in memory in pairs, the first two being montmul
 5073     // inputs and the second add/sub inputs. We can still implement
 5074     // the montmul+sub+add using 4-way parallelism but only if we
 5075     // combine the coefficients with the zetas 16 at a time. We load 8
 5076     // adjacent values at 4 different offsets using an ld2 load with
 5077     // arrangement 2D. That interleaves the lower and upper halves of
 5078     // each pair of quadwords into successive vector registers. We
 5079     // then need to montmul the 4 even elements of the coefficients
 5080     // register sequence by the zetas in order and then add/sub the 4
 5081     // odd elements of the coefficients register sequence. We use an
 5082     // equivalent st2 operation to store the results back into memory
 5083     // de-interleaved.
 5084     for (int i = 0; i < 1024; i += 128) {
 5085       // reload constants q, qinv each iteration as they get clobbered later
 5086       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5087       // load interleaved 16 (4x2D) coefficients via offsets
 5088       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 5089       // load next 16 (4x4S) inputs
 5090       vs_ldpq_post(vs_front(vs2), zetas);
 5091       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 5092       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 5093                                   vs_front(vs2), vtmp, vq);
 5094       // store interleaved 16 (4x2D) coefficients via offsets
 5095       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 5096     }
 5097 
 5098     // level 7
 5099     // at level 7 the coefficients we need to combine with the zetas
 5100     // occur singly with montmul inputs alternating with add/sub
 5101     // inputs. Once again we can use 4-way parallelism to combine 16
 5102     // zetas at a time. However, we have to load 8 adjacent values at
 5103     // 4 different offsets using an ld2 load with arrangement 4S. That
 5104     // interleaves the odd words of each pair into one
 5105     // coefficients vector register and the even words of the pair
 5106     // into the next register. We then need to montmul the 4 even
 5107     // elements of the coefficients register sequence by the zetas in
 5108     // order and then add/sub the 4 odd elements of the coefficients
 5109     // register sequence. We use an equivalent st2 operation to store
 5110     // the results back into memory de-interleaved.
 5111 
 5112     for (int i = 0; i < 1024; i += 128) {
 5113       // reload constants q, qinv each iteration as they get clobbered later
 5114       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5115       // load interleaved 16 (4x4S) coefficients via offsets
 5116       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 5117       // load next 16 (4x4S) inputs
 5118       vs_ldpq_post(vs_front(vs2), zetas);
 5119       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 5120       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 5121                                   vs_front(vs2), vtmp, vq);
 5122       // store interleaved 16 (4x4S) coefficients via offsets
 5123       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 5124     }
 5125     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5126     __ mov(r0, zr); // return 0
 5127     __ ret(lr);
 5128 
 5129     return start;
 5130   }
 5131 
 5132   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 5133   // in the Java implementation come in sequences of at least 8, so we
 5134   // can use ldpq to collect the corresponding data into pairs of vector
 5135   // registers
 5136   // We collect the coefficients that correspond to the 'j's into vs1 and
 5137   // the coefficients that correspond to the 'j+l's into vs2, then
 5138   // do the additions into vs3 and the subtractions into vs1, then
 5139   // save the result of the additions, load the zetas into vs2,
 5140   // do the (Montgomery) multiplications by zeta in parallel into vs2,
 5141   // and finally save the results back to the coeffs array.
 5142   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 5143     const Register coeffs, const Register zetas) {
 5144     int c1 = 0;
 5145     int c2 = 32;
 5146     int startIncr;
 5147     int offsets[4];
 5148     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5149     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5150     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5151 
 5152     offsets[0] = 0;
 5153 
 5154     for (int level = 3; level < 8; level++) {
 5155       int c1Start = c1;
 5156       int c2Start = c2;
 5157       if (level == 3) {
 5158         offsets[1] = 64;
 5159         offsets[2] = 128;
 5160         offsets[3] = 192;
 5161       } else if (level == 4) {
 5162         offsets[1] = 32;
 5163         offsets[2] = 128;
 5164         offsets[3] = 160;
 5165       } else {
 5166         offsets[1] = 32;
 5167         offsets[2] = 64;
 5168         offsets[3] = 96;
 5169       }
 5170 
 5171       // for levels 3 - 7 we simply load 2 x 4 adjacent values at a
 5172       // time at 4 different offsets and multiply them in order by the
 5173       // next set of input values. So we employ indexed load and store
 5174       // pair instructions with arrangement 4S
 5175       for (int i = 0; i < 4; i++) {
 5176         // load v1 32 (8x4S) coefficients relative to first start index
 5177         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 5178         // load v2 32 (8x4S) coefficients relative to second start index
 5179         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 5180         // a0 = v1 + v2 -- n.b. clobbers vq
 5181         vs_addv(vs3, __ T4S, vs1, vs2);
 5182         // a1 = v1 - v2
 5183         vs_subv(vs1, __ T4S, vs1, vs2);
 5184         // save a0 relative to first start index
 5185         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 5186         // load constants q, qinv each iteration as they get clobbered above
 5187         vs_ldpq(vq, dilithiumConsts); // qInv, q
 5188         // load b next 32 (8x4S) inputs
 5189         vs_ldpq_post(vs2, zetas);
 5190         // a = a1 montmul b
 5191         vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 5192         // save a relative to second start index
 5193         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 5194 
 5195         int k = 4 * level + i;
 5196 
 5197         if (k < 24) {
 5198           startIncr = 256;
 5199         } else if (k == 25) {
 5200           startIncr = 384;
 5201         } else {
 5202           startIncr = 128;
 5203         }
 5204 
 5205         c1Start += startIncr;
 5206         c2Start += startIncr;
 5207       }
 5208 
 5209       c2 *= 2;
 5210     }
 5211   }
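
      // For reference, the Java inverse-NTT loop being vectorized above
      // performs Gentleman-Sande butterflies, roughly as in the sketch below
      // (montMul as in the scalar sketch above; l doubles from level to level):
      //
      //   for (int j = start; j < start + l; j++) {
      //     int a = coeffs[j];
      //     coeffs[j]     = a + coeffs[j + l];
      //     coeffs[j + l] = montMul(a - coeffs[j + l], zeta);
      //   }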
 5212 
 5213   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 5214   // Implements the method
 5215   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 5216   // the sun.security.provider.ML_DSA class.
 5217   //
 5218   // coeffs (int[256]) = c_rarg0
 5219   // zetas (int[256]) = c_rarg1
 5220   address generate_dilithiumAlmostInverseNtt() {
 5221 
 5222     __ align(CodeEntryAlignment);
 5223     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id;
 5224     StubCodeMark mark(this, stub_id);
 5225     address start = __ pc();
 5226     __ enter();
 5227 
 5228     const Register coeffs = c_rarg0;
 5229     const Register zetas = c_rarg1;
 5230 
 5231     const Register tmpAddr = r9;
 5232     const Register dilithiumConsts = r10;
 5233     const Register result = r11;
 5234     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5235     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 5236     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5237     int offsets[4] = { 0, 32, 64, 96 };
 5238     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5239     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 5240 
 5241     __ add(result, coeffs, 0);
 5242     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5243 
 5244     // Each level represents one iteration of the outer for loop of the Java version
 5246 
 5247     // level 0
 5248     // At level 0 we need to interleave adjacent quartets of
 5249     // coefficients before we multiply and add/sub by the next 16
 5250     // zetas just as we did for level 7 in the multiply code. So we
 5251     // load and store the values using an ld2/st2 with arrangement 4S
 5252     for (int i = 0; i < 1024; i += 128) {
 5253       // load constants q, qinv
 5254       // n.b. this can be moved out of the loop as they do not get
 5255       // clobbered by first two loops
 5256       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5257       // a0/a1 load interleaved 32 (8x4S) coefficients
 5258       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 5259       // b load next 32 (8x4S) inputs
 5260       vs_ldpq_post(vs_front(vs2), zetas);
 5261       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 5262       // n.b. second half of vs2 provides temporary register storage
 5263       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 5264                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 5265       // a0/a1 store interleaved 32 (8x4S) coefficients
 5266       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 5267     }
 5268 
 5269     // level 1
 5270     // At level 1 we need to interleave pairs of adjacent pairs of
 5271     // coefficients before we multiply by the next 16 zetas just as we
 5272     // did for level 6 in the multiply code. So we load and store the
 5273     // values an ld2/st2 with arrangement 2D
 5274     for (int i = 0; i < 1024; i += 128) {
 5275       // a0/a1 load interleaved 32 (8x2D) coefficients
 5276       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 5277       // b load next 16 (4x4S) inputs
 5278       vs_ldpq_post(vs_front(vs2), zetas);
 5279       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 5280       // n.b. second half of vs2 provides temporary register storage
 5281       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 5282                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 5283       // a0/a1 store interleaved 32 (8x2D) coefficients
 5284       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 5285     }
 5286 
 5287     // level 2
 5288     // At level 2 coefficients come in blocks of 4. So, we load 4
 5289     // adjacent coefficients at 8 distinct offsets for both the first
 5290     // and second coefficient sequences, using an ldr with register
 5291     // variant Q then combine them with next set of 32 zetas. Likewise
 5292     // we store the results using an str with register variant Q.
 5293     for (int i = 0; i < 1024; i += 256) {
 5294       // c0 load 32 (8x4S) coefficients via first offsets
 5295       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 5296       // c1 load 32 (8x4S) coefficients via second offsets
 5297       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 5298       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 5299       vs_addv(vs3, __ T4S, vs1, vs2);
 5300       // c = c0 - c1
 5301       vs_subv(vs1, __ T4S, vs1, vs2);
 5302       // store a0 32 (8x4S) coefficients via first offsets
 5303       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 5304       // b load 32 (8x4S) next inputs
 5305       vs_ldpq_post(vs2, zetas);
 5306       // reload constants q, qinv -- they were clobbered earlier
 5307       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5308       // compute a1 = b montmul c
 5309       vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 5310       // store a1 32 (8x4S) coefficients via second offsets
 5311       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 5312     }
 5313 
 5314     // level 3-7
 5315     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 5316 
 5317     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5318     __ mov(r0, zr); // return 0
 5319     __ ret(lr);
 5320 
 5321     return start;
 5322 
 5323   }
 5324 
 5325   // Dilithium multiply polynomials in the NTT domain.
 5326   // Straightforward implementation of the method
 5327   // static int implDilithiumNttMult(
 5328   //              int[] result, int[] ntta, int[] nttb) {} of
 5329   // the sun.security.provider.ML_DSA class.
 5330   //
 5331   // result (int[256]) = c_rarg0
 5332   // poly1 (int[256]) = c_rarg1
 5333   // poly2 (int[256]) = c_rarg2
 5334   address generate_dilithiumNttMult() {
 5335 
 5336     __ align(CodeEntryAlignment);
 5337     StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id;
 5338     StubCodeMark mark(this, stub_id);
 5339     address start = __ pc();
 5340     __ enter();
 5341 
 5342     Label L_loop;
 5343 
 5344     const Register result = c_rarg0;
 5345     const Register poly1 = c_rarg1;
 5346     const Register poly2 = c_rarg2;
 5347 
 5348     const Register dilithiumConsts = r10;
 5349     const Register len = r11;
 5350 
 5351     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5352     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 5353     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5354     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 5355 
 5356     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5357 
 5358     // load constants q, qinv
 5359     vs_ldpq(vq, dilithiumConsts); // qInv, q
 5360     // load constant rSquare into v29
 5361     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 5362 
 5363     __ mov(len, zr);
 5364     __ add(len, len, 1024);
 5365 
 5366     __ BIND(L_loop);
 5367 
 5368     // b load 32 (8x4S) next inputs from poly1
 5369     vs_ldpq_post(vs1, poly1);
 5370     // c load 32 (8x4S) next inputs from poly2
 5371     vs_ldpq_post(vs2, poly2);
 5372     // compute a = b montmul c
 5373     vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 5374     // compute a = rsquare montmul a
 5375     vs_montmul32(vs2, vrsquare, vs2, vtmp, vq);
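          //   n.b. assuming rSquare == 2^64 mod q, the second multiply removes the
          //   extra Montgomery factor: montMul(rSquare, montMul(b, c))
          //     = (b*c*2^-32) * 2^64 * 2^-32 = b*c (mod q)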
 5376     // save a 32 (8x4S) results
 5377     vs_stpq_post(vs2, result);
 5378 
 5379     __ sub(len, len, 128);
 5380     __ cmp(len, (u1)128);
 5381     __ br(Assembler::GE, L_loop);
 5382 
 5383     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5384     __ mov(r0, zr); // return 0
 5385     __ ret(lr);
 5386 
 5387     return start;
 5388 
 5389   }
 5390 
 5391   // Dilithium Montgomery multiply an array by a constant.
 5392   // A straightforward implementation of the method
 5393   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  5394   // of the sun.security.provider.ML_DSA class.
 5395   //
 5396   // coeffs (int[256]) = c_rarg0
 5397   // constant (int) = c_rarg1
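         //
         // Scalar equivalent of each loop iteration below (illustrative sketch):
         //
         //   coeffs[i] = montMul(constant, coeffs[i]);   // Montgomery product mod q
         //
         // The result is written back in place (the result pointer is aliased to coeffs).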
 5398   address generate_dilithiumMontMulByConstant() {
 5399 
 5400     __ align(CodeEntryAlignment);
 5401     StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id;
 5402     StubCodeMark mark(this, stub_id);
 5403     address start = __ pc();
 5404     __ enter();
 5405 
 5406     Label L_loop;
 5407 
 5408     const Register coeffs = c_rarg0;
 5409     const Register constant = c_rarg1;
 5410 
 5411     const Register dilithiumConsts = r10;
 5412     const Register result = r11;
 5413     const Register len = r12;
 5414 
 5415     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5416     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 5417     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5418     VSeq<8> vconst(29, 0);             // for montmul by constant
 5419 
 5420     // results track inputs
 5421     __ add(result, coeffs, 0);
 5422     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5423 
  5424     // load constants q, qinv -- they are not clobbered by the loop below
 5425     vs_ldpq(vq, dilithiumConsts); // qInv, q
 5426     // copy caller supplied constant across vconst
 5427     __ dup(vconst[0], __ T4S, constant);
 5428     __ mov(len, zr);
 5429     __ add(len, len, 1024);
 5430 
 5431     __ BIND(L_loop);
 5432 
 5433     // load next 32 inputs
 5434     vs_ldpq_post(vs2, coeffs);
 5435     // mont mul by constant
 5436     vs_montmul32(vs2, vconst, vs2, vtmp, vq);
 5437     // write next 32 results
 5438     vs_stpq_post(vs2, result);
 5439 
 5440     __ sub(len, len, 128);
 5441     __ cmp(len, (u1)128);
 5442     __ br(Assembler::GE, L_loop);
 5443 
 5444     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5445     __ mov(r0, zr); // return 0
 5446     __ ret(lr);
 5447 
 5448     return start;
 5449 
 5450   }
 5451 
 5452   // Dilithium decompose poly.
 5453   // Implements the method
  5454   // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {}
 5455   // of the sun.security.provider.ML_DSA class
 5456   //
 5457   // input (int[256]) = c_rarg0
 5458   // lowPart (int[256]) = c_rarg1
 5459   // highPart (int[256]) = c_rarg2
 5460   // twoGamma2  (int) = c_rarg3
 5461   // multiplier (int) = c_rarg4
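         //
         // Per coefficient this follows the ML-DSA Decompose step (rough outline;
         // the exact scalar statements are repeated as comments in the loop body):
         // reduce r mod q, split it as r == r1 * twoGamma2 + r0 with r0 in
         // (-gamma2, gamma2], and in the corner case r - r0 == q - 1 force r1 = 0
         // and decrement r0; lowPart[i] receives r0 and highPart[i] receives r1.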
 5462   address generate_dilithiumDecomposePoly() {
 5463 
 5464     __ align(CodeEntryAlignment);
 5465     StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id;
 5466     StubCodeMark mark(this, stub_id);
 5467     address start = __ pc();
 5468     Label L_loop;
 5469 
 5470     const Register input = c_rarg0;
 5471     const Register lowPart = c_rarg1;
 5472     const Register highPart = c_rarg2;
 5473     const Register twoGamma2 = c_rarg3;
 5474     const Register multiplier = c_rarg4;
 5475 
 5476     const Register len = r9;
 5477     const Register dilithiumConsts = r10;
 5478     const Register tmp = r11;
 5479 
 5480     VSeq<4> vs1(0), vs2(4), vs3(8); // 6 independent sets of 4x4s values
 5481     VSeq<4> vs4(12), vs5(16), vtmp(20);
 5482     VSeq<4> one(25, 0);            // 7 constants for cross-multiplying
 5483     VSeq<4> qminus1(26, 0);
 5484     VSeq<4> g2(27, 0);
 5485     VSeq<4> twog2(28, 0);
 5486     VSeq<4> mult(29, 0);
 5487     VSeq<4> q(30, 0);
 5488     VSeq<4> qadd(31, 0);
 5489 
 5490     __ enter();
 5491 
 5492     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5493 
 5494     // save callee-saved registers
 5495     __ stpd(v8, v9, __ pre(sp, -64));
 5496     __ stpd(v10, v11, Address(sp, 16));
 5497     __ stpd(v12, v13, Address(sp, 32));
 5498     __ stpd(v14, v15, Address(sp, 48));
 5499 
 5500     // populate constant registers
 5501     __ mov(tmp, zr);
 5502     __ add(tmp, tmp, 1);
 5503     __ dup(one[0], __ T4S, tmp); // 1
 5504     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 5505     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 5506     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 5507     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 5508     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 5509     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 5510 
 5511     __ mov(len, zr);
 5512     __ add(len, len, 1024);
 5513 
 5514     __ BIND(L_loop);
 5515 
 5516     // load next 4x4S inputs interleaved: rplus --> vs1
 5517     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 5518 
 5519     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 5520     vs_addv(vtmp, __ T4S, vs1, qadd);
 5521     vs_sshr(vtmp, __ T4S, vtmp, 23);
 5522     vs_mulv(vtmp, __ T4S, vtmp, q);
 5523     vs_subv(vs1, __ T4S, vs1, vtmp);
 5524 
 5525     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 5526     vs_sshr(vtmp, __ T4S, vs1, 31);
 5527     vs_andr(vtmp, vtmp, q);
 5528     vs_addv(vs1, __ T4S, vs1, vtmp);
 5529 
 5530     // quotient --> vs2
 5531     // int quotient = (rplus * multiplier) >> 22;
 5532     vs_mulv(vtmp, __ T4S, vs1, mult);
 5533     vs_sshr(vs2, __ T4S, vtmp, 22);
 5534 
 5535     // r0 --> vs3
 5536     // int r0 = rplus - quotient * twoGamma2;
 5537     vs_mulv(vtmp, __ T4S, vs2, twog2);
 5538     vs_subv(vs3, __ T4S, vs1, vtmp);
 5539 
 5540     // mask --> vs4
 5541     // int mask = (twoGamma2 - r0) >> 22;
 5542     vs_subv(vtmp, __ T4S, twog2, vs3);
 5543     vs_sshr(vs4, __ T4S, vtmp, 22);
 5544 
 5545     // r0 -= (mask & twoGamma2);
 5546     vs_andr(vtmp, vs4, twog2);
 5547     vs_subv(vs3, __ T4S, vs3, vtmp);
 5548 
 5549     //  quotient += (mask & 1);
 5550     vs_andr(vtmp, vs4, one);
 5551     vs_addv(vs2, __ T4S, vs2, vtmp);
 5552 
 5553     // mask = (twoGamma2 / 2 - r0) >> 31;
 5554     vs_subv(vtmp, __ T4S, g2, vs3);
 5555     vs_sshr(vs4, __ T4S, vtmp, 31);
 5556 
 5557     // r0 -= (mask & twoGamma2);
 5558     vs_andr(vtmp, vs4, twog2);
 5559     vs_subv(vs3, __ T4S, vs3, vtmp);
 5560 
 5561     // quotient += (mask & 1);
 5562     vs_andr(vtmp, vs4, one);
 5563     vs_addv(vs2, __ T4S, vs2, vtmp);
 5564 
 5565     // r1 --> vs5
 5566     // int r1 = rplus - r0 - (dilithium_q - 1);
 5567     vs_subv(vtmp, __ T4S, vs1, vs3);
 5568     vs_subv(vs5, __ T4S, vtmp, qminus1);
 5569 
 5570     // r1 --> vs1 (overwriting rplus)
 5571     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 5572     vs_negr(vtmp, __ T4S, vs5);
 5573     vs_orr(vtmp, vs5, vtmp);
 5574     vs_sshr(vs1, __ T4S, vtmp, 31);
 5575 
 5576     // r0 += ~r1;
 5577     vs_notr(vtmp, vs1);
 5578     vs_addv(vs3, __ T4S, vs3, vtmp);
 5579 
 5580     // r1 = r1 & quotient;
 5581     vs_andr(vs1, vs2, vs1);
 5582 
  5583     // store results interleaved
 5584     // lowPart[m] = r0;
 5585     // highPart[m] = r1;
 5586     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 5587     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 5588 
 5589 
 5590     __ sub(len, len, 64);
 5591     __ cmp(len, (u1)64);
 5592     __ br(Assembler::GE, L_loop);
 5593 
 5594     // restore callee-saved vector registers
 5595     __ ldpd(v14, v15, Address(sp, 48));
 5596     __ ldpd(v12, v13, Address(sp, 32));
 5597     __ ldpd(v10, v11, Address(sp, 16));
 5598     __ ldpd(v8, v9, __ post(sp, 64));
 5599 
 5600     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5601     __ mov(r0, zr); // return 0
 5602     __ ret(lr);
 5603 
 5604     return start;
 5605 
 5606   }
 5607 
 5608   /**
 5609    *  Arguments:
 5610    *
 5611    * Inputs:
 5612    *   c_rarg0   - int crc
 5613    *   c_rarg1   - byte* buf
 5614    *   c_rarg2   - int length
 5615    *   c_rarg3   - int* table
 5616    *
 5617    * Output:
 5618    *       r0   - int crc result
 5619    */
 5620   address generate_updateBytesCRC32C() {
 5621     assert(UseCRC32CIntrinsics, "what are we doing here?");
 5622 
 5623     __ align(CodeEntryAlignment);
 5624     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id;
 5625     StubCodeMark mark(this, stub_id);
 5626 
 5627     address start = __ pc();
 5628 
 5629     const Register crc   = c_rarg0;  // crc
 5630     const Register buf   = c_rarg1;  // source java byte array address
 5631     const Register len   = c_rarg2;  // length
 5632     const Register table0 = c_rarg3; // crc_table address
 5633     const Register table1 = c_rarg4;
 5634     const Register table2 = c_rarg5;
 5635     const Register table3 = c_rarg6;
 5636     const Register tmp3 = c_rarg7;
 5637 
 5638     BLOCK_COMMENT("Entry:");
 5639     __ enter(); // required for proper stackwalking of RuntimeStub frame
 5640 
 5641     __ kernel_crc32c(crc, buf, len,
 5642               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 5643 
 5644     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5645     __ ret(lr);
 5646 
 5647     return start;
 5648   }
 5649 
  5650   /**
 5651    *  Arguments:
 5652    *
 5653    *  Inputs:
 5654    *   c_rarg0   - int   adler
 5655    *   c_rarg1   - byte* buff
 5656    *   c_rarg2   - int   len
 5657    *
 5658    * Output:
 5659    *   c_rarg0   - int adler result
 5660    */
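         // Reference definition of Adler-32 (illustrative sketch):
         //
         //   s1 = adler & 0xffff;  s2 = (adler >> 16) & 0xffff;
         //   for each input byte b (treated as unsigned) {
         //     s1 = (s1 + b) % 65521;  s2 = (s2 + s1) % 65521;
         //   }
         //   return (s2 << 16) | s1;
         //
         // The code below keeps s1/s2 unreduced for up to NMAX bytes and only then
         // performs the (more expensive) reductions modulo BASE = 65521.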
 5661   address generate_updateBytesAdler32() {
 5662     __ align(CodeEntryAlignment);
 5663     StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
 5664     StubCodeMark mark(this, stub_id);
 5665     address start = __ pc();
 5666 
 5667     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 5668 
 5669     // Aliases
 5670     Register adler  = c_rarg0;
 5671     Register s1     = c_rarg0;
 5672     Register s2     = c_rarg3;
 5673     Register buff   = c_rarg1;
 5674     Register len    = c_rarg2;
 5675     Register nmax  = r4;
 5676     Register base  = r5;
 5677     Register count = r6;
 5678     Register temp0 = rscratch1;
 5679     Register temp1 = rscratch2;
 5680     FloatRegister vbytes = v0;
 5681     FloatRegister vs1acc = v1;
 5682     FloatRegister vs2acc = v2;
 5683     FloatRegister vtable = v3;
 5684 
 5685     // Max number of bytes we can process before having to take the mod
 5686     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 5687     uint64_t BASE = 0xfff1;
 5688     uint64_t NMAX = 0x15B0;
 5689 
 5690     __ mov(base, BASE);
 5691     __ mov(nmax, NMAX);
 5692 
 5693     // Load accumulation coefficients for the upper 16 bits
 5694     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 5695     __ ld1(vtable, __ T16B, Address(temp0));
 5696 
 5697     // s1 is initialized to the lower 16 bits of adler
 5698     // s2 is initialized to the upper 16 bits of adler
 5699     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 5700     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 5701 
  5702     // The pipelined loop needs at least 16 elements for one iteration.
  5703     // It checks this itself, but it is more efficient to skip straight to the cleanup loop here.
 5704     __ cmp(len, (u1)16);
 5705     __ br(Assembler::HS, L_nmax);
 5706     __ cbz(len, L_combine);
 5707 
 5708     __ bind(L_simple_by1_loop);
 5709     __ ldrb(temp0, Address(__ post(buff, 1)));
 5710     __ add(s1, s1, temp0);
 5711     __ add(s2, s2, s1);
 5712     __ subs(len, len, 1);
 5713     __ br(Assembler::HI, L_simple_by1_loop);
 5714 
 5715     // s1 = s1 % BASE
 5716     __ subs(temp0, s1, base);
 5717     __ csel(s1, temp0, s1, Assembler::HS);
 5718 
 5719     // s2 = s2 % BASE
 5720     __ lsr(temp0, s2, 16);
 5721     __ lsl(temp1, temp0, 4);
 5722     __ sub(temp1, temp1, temp0);
 5723     __ add(s2, temp1, s2, ext::uxth);
 5724 
 5725     __ subs(temp0, s2, base);
 5726     __ csel(s2, temp0, s2, Assembler::HS);
 5727 
 5728     __ b(L_combine);
 5729 
 5730     __ bind(L_nmax);
 5731     __ subs(len, len, nmax);
 5732     __ sub(count, nmax, 16);
 5733     __ br(Assembler::LO, L_by16);
 5734 
 5735     __ bind(L_nmax_loop);
 5736 
 5737     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 5738                                       vbytes, vs1acc, vs2acc, vtable);
 5739 
 5740     __ subs(count, count, 16);
 5741     __ br(Assembler::HS, L_nmax_loop);
 5742 
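           // The reductions below use the identity 2^16 == 15 (mod BASE): folding
           // x -> 15 * (x >> 16) + (x & 0xffff), computed below as
           // (x >> 16) * 16 - (x >> 16) + (x & 0xffff), preserves x mod BASE while
           // shrinking it; two folds plus a final conditional subtract bring the
           // accumulator into [0, BASE).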
 5743     // s1 = s1 % BASE
 5744     __ lsr(temp0, s1, 16);
 5745     __ lsl(temp1, temp0, 4);
 5746     __ sub(temp1, temp1, temp0);
 5747     __ add(temp1, temp1, s1, ext::uxth);
 5748 
 5749     __ lsr(temp0, temp1, 16);
 5750     __ lsl(s1, temp0, 4);
 5751     __ sub(s1, s1, temp0);
  5752     __ add(s1, s1, temp1, ext::uxth);
 5753 
 5754     __ subs(temp0, s1, base);
 5755     __ csel(s1, temp0, s1, Assembler::HS);
 5756 
 5757     // s2 = s2 % BASE
 5758     __ lsr(temp0, s2, 16);
 5759     __ lsl(temp1, temp0, 4);
 5760     __ sub(temp1, temp1, temp0);
 5761     __ add(temp1, temp1, s2, ext::uxth);
 5762 
 5763     __ lsr(temp0, temp1, 16);
 5764     __ lsl(s2, temp0, 4);
 5765     __ sub(s2, s2, temp0);
  5766     __ add(s2, s2, temp1, ext::uxth);
 5767 
 5768     __ subs(temp0, s2, base);
 5769     __ csel(s2, temp0, s2, Assembler::HS);
 5770 
 5771     __ subs(len, len, nmax);
 5772     __ sub(count, nmax, 16);
 5773     __ br(Assembler::HS, L_nmax_loop);
 5774 
 5775     __ bind(L_by16);
 5776     __ adds(len, len, count);
 5777     __ br(Assembler::LO, L_by1);
 5778 
 5779     __ bind(L_by16_loop);
 5780 
 5781     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 5782                                       vbytes, vs1acc, vs2acc, vtable);
 5783 
 5784     __ subs(len, len, 16);
 5785     __ br(Assembler::HS, L_by16_loop);
 5786 
 5787     __ bind(L_by1);
 5788     __ adds(len, len, 15);
 5789     __ br(Assembler::LO, L_do_mod);
 5790 
 5791     __ bind(L_by1_loop);
 5792     __ ldrb(temp0, Address(__ post(buff, 1)));
 5793     __ add(s1, temp0, s1);
 5794     __ add(s2, s2, s1);
 5795     __ subs(len, len, 1);
 5796     __ br(Assembler::HS, L_by1_loop);
 5797 
 5798     __ bind(L_do_mod);
 5799     // s1 = s1 % BASE
 5800     __ lsr(temp0, s1, 16);
 5801     __ lsl(temp1, temp0, 4);
 5802     __ sub(temp1, temp1, temp0);
 5803     __ add(temp1, temp1, s1, ext::uxth);
 5804 
 5805     __ lsr(temp0, temp1, 16);
 5806     __ lsl(s1, temp0, 4);
 5807     __ sub(s1, s1, temp0);
  5808     __ add(s1, s1, temp1, ext::uxth);
 5809 
 5810     __ subs(temp0, s1, base);
 5811     __ csel(s1, temp0, s1, Assembler::HS);
 5812 
 5813     // s2 = s2 % BASE
 5814     __ lsr(temp0, s2, 16);
 5815     __ lsl(temp1, temp0, 4);
 5816     __ sub(temp1, temp1, temp0);
 5817     __ add(temp1, temp1, s2, ext::uxth);
 5818 
 5819     __ lsr(temp0, temp1, 16);
 5820     __ lsl(s2, temp0, 4);
 5821     __ sub(s2, s2, temp0);
  5822     __ add(s2, s2, temp1, ext::uxth);
 5823 
 5824     __ subs(temp0, s2, base);
 5825     __ csel(s2, temp0, s2, Assembler::HS);
 5826 
 5827     // Combine lower bits and higher bits
 5828     __ bind(L_combine);
 5829     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 5830 
 5831     __ ret(lr);
 5832 
 5833     return start;
 5834   }
 5835 
 5836   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 5837           Register temp0, Register temp1, FloatRegister vbytes,
 5838           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 5839     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 5840     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 5841     // In non-vectorized code, we update s1 and s2 as:
 5842     //   s1 <- s1 + b1
 5843     //   s2 <- s2 + s1
 5844     //   s1 <- s1 + b2
  5845     //   s2 <- s2 + s1
 5846     //   ...
 5847     //   s1 <- s1 + b16
 5848     //   s2 <- s2 + s1
 5849     // Putting above assignments together, we have:
 5850     //   s1_new = s1 + b1 + b2 + ... + b16
 5851     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 5852     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 5853     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
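           //
           // Scalar equivalent of one 16-byte step (illustrative sketch):
           //
           //   s2 += 16 * s1;
           //   for (int i = 0; i < 16; i++) { s1 += b[i]; s2 += (16 - i) * b[i]; }
           //
           // The (16, 15, ..., 1) weights are the accumulation coefficients held in
           // vtable, loaded from _adler_table by the caller.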
 5854     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 5855 
 5856     // s2 = s2 + s1 * 16
 5857     __ add(s2, s2, s1, Assembler::LSL, 4);
 5858 
 5859     // vs1acc = b1 + b2 + b3 + ... + b16
 5860     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 5861     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 5862     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 5863     __ uaddlv(vs1acc, __ T16B, vbytes);
 5864     __ uaddlv(vs2acc, __ T8H, vs2acc);
 5865 
 5866     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 5867     __ fmovd(temp0, vs1acc);
 5868     __ fmovd(temp1, vs2acc);
 5869     __ add(s1, s1, temp0);
 5870     __ add(s2, s2, temp1);
 5871   }
 5872 
 5873   /**
 5874    *  Arguments:
 5875    *
 5876    *  Input:
 5877    *    c_rarg0   - x address
 5878    *    c_rarg1   - x length
 5879    *    c_rarg2   - y address
 5880    *    c_rarg3   - y length
 5881    *    c_rarg4   - z address
 5882    */
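         //
         // Note: as with BigInteger.multiplyToLen, z is expected to provide room
         // for xlen + ylen ints, i.e. the full double-length product.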
 5883   address generate_multiplyToLen() {
 5884     __ align(CodeEntryAlignment);
 5885     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
 5886     StubCodeMark mark(this, stub_id);
 5887 
 5888     address start = __ pc();
 5889     const Register x     = r0;
 5890     const Register xlen  = r1;
 5891     const Register y     = r2;
 5892     const Register ylen  = r3;
 5893     const Register z     = r4;
 5894 
 5895     const Register tmp0  = r5;
 5896     const Register tmp1  = r10;
 5897     const Register tmp2  = r11;
 5898     const Register tmp3  = r12;
 5899     const Register tmp4  = r13;
 5900     const Register tmp5  = r14;
 5901     const Register tmp6  = r15;
 5902     const Register tmp7  = r16;
 5903 
 5904     BLOCK_COMMENT("Entry:");
 5905     __ enter(); // required for proper stackwalking of RuntimeStub frame
 5906     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 5907     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5908     __ ret(lr);
 5909 
 5910     return start;
 5911   }
 5912 
 5913   address generate_squareToLen() {
  5914     // The squareToLen algorithm for sizes 1..127, as implemented in the Java
  5915     // code, runs faster than multiply_to_len on some CPUs and slower on others,
  5916     // but multiply_to_len gives slightly better results overall.
 5917     __ align(CodeEntryAlignment);
 5918     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
 5919     StubCodeMark mark(this, stub_id);
 5920     address start = __ pc();
 5921 
 5922     const Register x     = r0;
 5923     const Register xlen  = r1;
 5924     const Register z     = r2;
 5925     const Register y     = r4; // == x
 5926     const Register ylen  = r5; // == xlen
 5927 
 5928     const Register tmp0  = r3;
 5929     const Register tmp1  = r10;
 5930     const Register tmp2  = r11;
 5931     const Register tmp3  = r12;
 5932     const Register tmp4  = r13;
 5933     const Register tmp5  = r14;
 5934     const Register tmp6  = r15;
 5935     const Register tmp7  = r16;
 5936 
 5937     RegSet spilled_regs = RegSet::of(y, ylen);
 5938     BLOCK_COMMENT("Entry:");
 5939     __ enter();
 5940     __ push(spilled_regs, sp);
 5941     __ mov(y, x);
 5942     __ mov(ylen, xlen);
 5943     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 5944     __ pop(spilled_regs, sp);
 5945     __ leave();
 5946     __ ret(lr);
 5947     return start;
 5948   }
 5949 
 5950   address generate_mulAdd() {
 5951     __ align(CodeEntryAlignment);
 5952     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
 5953     StubCodeMark mark(this, stub_id);
 5954 
 5955     address start = __ pc();
 5956 
 5957     const Register out     = r0;
 5958     const Register in      = r1;
 5959     const Register offset  = r2;
 5960     const Register len     = r3;
 5961     const Register k       = r4;
 5962 
 5963     BLOCK_COMMENT("Entry:");
 5964     __ enter();
 5965     __ mul_add(out, in, offset, len, k);
 5966     __ leave();
 5967     __ ret(lr);
 5968 
 5969     return start;
 5970   }
 5971 
 5972   // Arguments:
 5973   //
 5974   // Input:
 5975   //   c_rarg0   - newArr address
 5976   //   c_rarg1   - oldArr address
 5977   //   c_rarg2   - newIdx
 5978   //   c_rarg3   - shiftCount
 5979   //   c_rarg4   - numIter
 5980   //
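         // In scalar terms each output word is roughly
         //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
         //                      | (oldArr[i] << (32 - shiftCount));
         // (sketch inferred from the code below), processed from the high index
         // downwards: 4 words per SIMD iteration, then 2-word and 1-word tails.
         //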
 5981   address generate_bigIntegerRightShift() {
 5982     __ align(CodeEntryAlignment);
 5983     StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
 5984     StubCodeMark mark(this, stub_id);
 5985     address start = __ pc();
 5986 
 5987     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 5988 
 5989     Register newArr        = c_rarg0;
 5990     Register oldArr        = c_rarg1;
 5991     Register newIdx        = c_rarg2;
 5992     Register shiftCount    = c_rarg3;
 5993     Register numIter       = c_rarg4;
 5994     Register idx           = numIter;
 5995 
 5996     Register newArrCur     = rscratch1;
 5997     Register shiftRevCount = rscratch2;
 5998     Register oldArrCur     = r13;
 5999     Register oldArrNext    = r14;
 6000 
 6001     FloatRegister oldElem0        = v0;
 6002     FloatRegister oldElem1        = v1;
 6003     FloatRegister newElem         = v2;
 6004     FloatRegister shiftVCount     = v3;
 6005     FloatRegister shiftVRevCount  = v4;
 6006 
 6007     __ cbz(idx, Exit);
 6008 
 6009     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 6010 
 6011     // left shift count
 6012     __ movw(shiftRevCount, 32);
 6013     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 6014 
  6015     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
 6016     __ cmp(numIter, (u1)4);
 6017     __ br(Assembler::LT, ShiftThree);
 6018 
 6019     __ dup(shiftVCount,    __ T4S, shiftCount);
 6020     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 6021     __ negr(shiftVCount,   __ T4S, shiftVCount);
 6022 
 6023     __ BIND(ShiftSIMDLoop);
 6024 
 6025     // Calculate the load addresses
 6026     __ sub(idx, idx, 4);
 6027     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 6028     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 6029     __ add(oldArrCur,  oldArrNext, 4);
 6030 
 6031     // Load 4 words and process
 6032     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 6033     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 6034     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 6035     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 6036     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 6037     __ st1(newElem,   __ T4S,  Address(newArrCur));
 6038 
 6039     __ cmp(idx, (u1)4);
 6040     __ br(Assembler::LT, ShiftTwoLoop);
 6041     __ b(ShiftSIMDLoop);
 6042 
 6043     __ BIND(ShiftTwoLoop);
 6044     __ cbz(idx, Exit);
 6045     __ cmp(idx, (u1)1);
 6046     __ br(Assembler::EQ, ShiftOne);
 6047 
 6048     // Calculate the load addresses
 6049     __ sub(idx, idx, 2);
 6050     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 6051     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 6052     __ add(oldArrCur,  oldArrNext, 4);
 6053 
 6054     // Load 2 words and process
 6055     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 6056     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 6057     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 6058     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 6059     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 6060     __ st1(newElem,   __ T2S, Address(newArrCur));
 6061     __ b(ShiftTwoLoop);
 6062 
 6063     __ BIND(ShiftThree);
 6064     __ tbz(idx, 1, ShiftOne);
 6065     __ tbz(idx, 0, ShiftTwo);
 6066     __ ldrw(r10,  Address(oldArr, 12));
 6067     __ ldrw(r11,  Address(oldArr, 8));
 6068     __ lsrvw(r10, r10, shiftCount);
 6069     __ lslvw(r11, r11, shiftRevCount);
 6070     __ orrw(r12,  r10, r11);
 6071     __ strw(r12,  Address(newArr, 8));
 6072 
 6073     __ BIND(ShiftTwo);
 6074     __ ldrw(r10,  Address(oldArr, 8));
 6075     __ ldrw(r11,  Address(oldArr, 4));
 6076     __ lsrvw(r10, r10, shiftCount);
 6077     __ lslvw(r11, r11, shiftRevCount);
 6078     __ orrw(r12,  r10, r11);
 6079     __ strw(r12,  Address(newArr, 4));
 6080 
 6081     __ BIND(ShiftOne);
 6082     __ ldrw(r10,  Address(oldArr, 4));
 6083     __ ldrw(r11,  Address(oldArr));
 6084     __ lsrvw(r10, r10, shiftCount);
 6085     __ lslvw(r11, r11, shiftRevCount);
 6086     __ orrw(r12,  r10, r11);
 6087     __ strw(r12,  Address(newArr));
 6088 
 6089     __ BIND(Exit);
 6090     __ ret(lr);
 6091 
 6092     return start;
 6093   }
 6094 
 6095   // Arguments:
 6096   //
 6097   // Input:
 6098   //   c_rarg0   - newArr address
 6099   //   c_rarg1   - oldArr address
 6100   //   c_rarg2   - newIdx
 6101   //   c_rarg3   - shiftCount
 6102   //   c_rarg4   - numIter
 6103   //
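         // In scalar terms each output word is roughly
         //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
         //                      | (oldArr[i + 1] >>> (32 - shiftCount));
         // (sketch inferred from the code below), processed from the low index
         // upwards: 4 words per SIMD iteration, then 2-word and 1-word tails.
         //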
 6104   address generate_bigIntegerLeftShift() {
 6105     __ align(CodeEntryAlignment);
 6106     StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
 6107     StubCodeMark mark(this, stub_id);
 6108     address start = __ pc();
 6109 
 6110     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 6111 
 6112     Register newArr        = c_rarg0;
 6113     Register oldArr        = c_rarg1;
 6114     Register newIdx        = c_rarg2;
 6115     Register shiftCount    = c_rarg3;
 6116     Register numIter       = c_rarg4;
 6117 
 6118     Register shiftRevCount = rscratch1;
 6119     Register oldArrNext    = rscratch2;
 6120 
 6121     FloatRegister oldElem0        = v0;
 6122     FloatRegister oldElem1        = v1;
 6123     FloatRegister newElem         = v2;
 6124     FloatRegister shiftVCount     = v3;
 6125     FloatRegister shiftVRevCount  = v4;
 6126 
 6127     __ cbz(numIter, Exit);
 6128 
 6129     __ add(oldArrNext, oldArr, 4);
 6130     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 6131 
 6132     // right shift count
 6133     __ movw(shiftRevCount, 32);
 6134     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 6135 
  6136     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
 6137     __ cmp(numIter, (u1)4);
 6138     __ br(Assembler::LT, ShiftThree);
 6139 
 6140     __ dup(shiftVCount,     __ T4S, shiftCount);
 6141     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 6142     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 6143 
 6144     __ BIND(ShiftSIMDLoop);
 6145 
 6146     // load 4 words and process
 6147     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 6148     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 6149     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 6150     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 6151     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 6152     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 6153     __ sub(numIter,   numIter, 4);
 6154 
 6155     __ cmp(numIter, (u1)4);
 6156     __ br(Assembler::LT, ShiftTwoLoop);
 6157     __ b(ShiftSIMDLoop);
 6158 
 6159     __ BIND(ShiftTwoLoop);
 6160     __ cbz(numIter, Exit);
 6161     __ cmp(numIter, (u1)1);
 6162     __ br(Assembler::EQ, ShiftOne);
 6163 
 6164     // load 2 words and process
 6165     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 6166     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 6167     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 6168     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 6169     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 6170     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 6171     __ sub(numIter,   numIter, 2);
 6172     __ b(ShiftTwoLoop);
 6173 
 6174     __ BIND(ShiftThree);
 6175     __ ldrw(r10,  __ post(oldArr, 4));
 6176     __ ldrw(r11,  __ post(oldArrNext, 4));
 6177     __ lslvw(r10, r10, shiftCount);
 6178     __ lsrvw(r11, r11, shiftRevCount);
 6179     __ orrw(r12,  r10, r11);
 6180     __ strw(r12,  __ post(newArr, 4));
 6181     __ tbz(numIter, 1, Exit);
 6182     __ tbz(numIter, 0, ShiftOne);
 6183 
 6184     __ BIND(ShiftTwo);
 6185     __ ldrw(r10,  __ post(oldArr, 4));
 6186     __ ldrw(r11,  __ post(oldArrNext, 4));
 6187     __ lslvw(r10, r10, shiftCount);
 6188     __ lsrvw(r11, r11, shiftRevCount);
 6189     __ orrw(r12,  r10, r11);
 6190     __ strw(r12,  __ post(newArr, 4));
 6191 
 6192     __ BIND(ShiftOne);
 6193     __ ldrw(r10,  Address(oldArr));
 6194     __ ldrw(r11,  Address(oldArrNext));
 6195     __ lslvw(r10, r10, shiftCount);
 6196     __ lsrvw(r11, r11, shiftRevCount);
 6197     __ orrw(r12,  r10, r11);
 6198     __ strw(r12,  Address(newArr));
 6199 
 6200     __ BIND(Exit);
 6201     __ ret(lr);
 6202 
 6203     return start;
 6204   }
 6205 
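         // Arguments:
         //   r1 (ary1)   - byte array address
         //   r2 (len)    - array length
         //   r0 (result) - on entry, a copy of len (see the precondition note below)
         //
         // Returns, in r0, the number of leading bytes whose sign bit is clear.
         // When a negative byte is detected inside a block, the count of bytes
         // processed before that block is returned, so the result is a
         // guaranteed-positive prefix length rather than necessarily the exact
         // index of the first negative byte. A second entry point
         // (count_positives_long) is exported via the reference parameter; it
         // enters directly at the LEN_OVER_15 path.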
 6206   address generate_count_positives(address &count_positives_long) {
 6207     const u1 large_loop_size = 64;
 6208     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 6209     int dcache_line = VM_Version::dcache_line_size();
 6210 
 6211     Register ary1 = r1, len = r2, result = r0;
 6212 
 6213     __ align(CodeEntryAlignment);
 6214 
 6215     StubGenStubId stub_id = StubGenStubId::count_positives_id;
 6216     StubCodeMark mark(this, stub_id);
 6217 
 6218     address entry = __ pc();
 6219 
 6220     __ enter();
 6221     // precondition: a copy of len is already in result
 6222     // __ mov(result, len);
 6223 
 6224   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 6225         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 6226 
 6227   __ cmp(len, (u1)15);
 6228   __ br(Assembler::GT, LEN_OVER_15);
  6229   // The only case in which execution falls into this code is when the pointer
  6230   // is near the end of a memory page and we have to avoid reading past it
 6231   __ add(ary1, ary1, len);
 6232   __ subs(len, len, 8);
 6233   __ br(Assembler::GT, LEN_OVER_8);
 6234   __ ldr(rscratch2, Address(ary1, -8));
 6235   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 6236   __ lsrv(rscratch2, rscratch2, rscratch1);
 6237   __ tst(rscratch2, UPPER_BIT_MASK);
 6238   __ csel(result, zr, result, Assembler::NE);
 6239   __ leave();
 6240   __ ret(lr);
 6241   __ bind(LEN_OVER_8);
 6242   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  6243   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
 6244   __ tst(rscratch2, UPPER_BIT_MASK);
 6245   __ br(Assembler::NE, RET_NO_POP);
 6246   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 6247   __ lsrv(rscratch1, rscratch1, rscratch2);
 6248   __ tst(rscratch1, UPPER_BIT_MASK);
 6249   __ bind(RET_NO_POP);
 6250   __ csel(result, zr, result, Assembler::NE);
 6251   __ leave();
 6252   __ ret(lr);
 6253 
 6254   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 6255   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 6256 
 6257   count_positives_long = __ pc(); // 2nd entry point
 6258 
 6259   __ enter();
 6260 
 6261   __ bind(LEN_OVER_15);
 6262     __ push(spilled_regs, sp);
 6263     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 6264     __ cbz(rscratch2, ALIGNED);
 6265     __ ldp(tmp6, tmp1, Address(ary1));
 6266     __ mov(tmp5, 16);
 6267     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 6268     __ add(ary1, ary1, rscratch1);
 6269     __ orr(tmp6, tmp6, tmp1);
 6270     __ tst(tmp6, UPPER_BIT_MASK);
 6271     __ br(Assembler::NE, RET_ADJUST);
 6272     __ sub(len, len, rscratch1);
 6273 
 6274   __ bind(ALIGNED);
 6275     __ cmp(len, large_loop_size);
 6276     __ br(Assembler::LT, CHECK_16);
  6277     // Perform a 16-byte load in the pre-loop as an early return, to handle the
  6278     // case where an initially aligned large array has negative values in its
  6279     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
  6280     // worst case, which is slower. Cases with negative bytes further ahead are
  6281     // not affected much; in fact they get faster thanks to the early loads and
  6282     // the fewer instructions and branches in LARGE_LOOP.
 6283     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 6284     __ sub(len, len, 16);
 6285     __ orr(tmp6, tmp6, tmp1);
 6286     __ tst(tmp6, UPPER_BIT_MASK);
 6287     __ br(Assembler::NE, RET_ADJUST_16);
 6288     __ cmp(len, large_loop_size);
 6289     __ br(Assembler::LT, CHECK_16);
 6290 
 6291     if (SoftwarePrefetchHintDistance >= 0
 6292         && SoftwarePrefetchHintDistance >= dcache_line) {
 6293       // initial prefetch
 6294       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 6295     }
 6296   __ bind(LARGE_LOOP);
 6297     if (SoftwarePrefetchHintDistance >= 0) {
 6298       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 6299     }
  6300     // Issue the load instructions first, since this can save a few CPU/MEM
  6301     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
  6302     // (one per ldp), it is better to generate 7 * orr(...) + 1 andr(...) + 1
  6303     // cbnz(...), which saves 3 instructions per iteration and has fewer branches;
  6304     // the downside is that early return is disabled, so all 64 bytes are loaded and checked every time.
 6305     __ ldp(tmp2, tmp3, Address(ary1));
 6306     __ ldp(tmp4, tmp5, Address(ary1, 16));
 6307     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 6308     __ ldp(tmp6, tmp1, Address(ary1, 48));
 6309     __ add(ary1, ary1, large_loop_size);
 6310     __ sub(len, len, large_loop_size);
 6311     __ orr(tmp2, tmp2, tmp3);
 6312     __ orr(tmp4, tmp4, tmp5);
 6313     __ orr(rscratch1, rscratch1, rscratch2);
 6314     __ orr(tmp6, tmp6, tmp1);
 6315     __ orr(tmp2, tmp2, tmp4);
 6316     __ orr(rscratch1, rscratch1, tmp6);
 6317     __ orr(tmp2, tmp2, rscratch1);
 6318     __ tst(tmp2, UPPER_BIT_MASK);
 6319     __ br(Assembler::NE, RET_ADJUST_LONG);
 6320     __ cmp(len, large_loop_size);
 6321     __ br(Assembler::GE, LARGE_LOOP);
 6322 
 6323   __ bind(CHECK_16); // small 16-byte load pre-loop
 6324     __ cmp(len, (u1)16);
 6325     __ br(Assembler::LT, POST_LOOP16);
 6326 
 6327   __ bind(LOOP16); // small 16-byte load loop
 6328     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 6329     __ sub(len, len, 16);
 6330     __ orr(tmp2, tmp2, tmp3);
 6331     __ tst(tmp2, UPPER_BIT_MASK);
 6332     __ br(Assembler::NE, RET_ADJUST_16);
 6333     __ cmp(len, (u1)16);
 6334     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 6335 
 6336   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 6337     __ cmp(len, (u1)8);
 6338     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 6339     __ ldr(tmp3, Address(__ post(ary1, 8)));
 6340     __ tst(tmp3, UPPER_BIT_MASK);
 6341     __ br(Assembler::NE, RET_ADJUST);
 6342     __ sub(len, len, 8);
 6343 
 6344   __ bind(POST_LOOP16_LOAD_TAIL);
 6345     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 6346     __ ldr(tmp1, Address(ary1));
 6347     __ mov(tmp2, 64);
 6348     __ sub(tmp4, tmp2, len, __ LSL, 3);
 6349     __ lslv(tmp1, tmp1, tmp4);
 6350     __ tst(tmp1, UPPER_BIT_MASK);
 6351     __ br(Assembler::NE, RET_ADJUST);
 6352     // Fallthrough
 6353 
 6354   __ bind(RET_LEN);
 6355     __ pop(spilled_regs, sp);
 6356     __ leave();
 6357     __ ret(lr);
 6358 
  6359     // The difference result - len is the count of bytes guaranteed to be
  6360     // positive
 6361 
 6362   __ bind(RET_ADJUST_LONG);
 6363     __ add(len, len, (u1)(large_loop_size - 16));
 6364   __ bind(RET_ADJUST_16);
 6365     __ add(len, len, 16);
 6366   __ bind(RET_ADJUST);
 6367     __ pop(spilled_regs, sp);
 6368     __ leave();
 6369     __ sub(result, result, len);
 6370     __ ret(lr);
 6371 
 6372     return entry;
 6373   }
 6374 
 6375   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 6376         bool usePrefetch, Label &NOT_EQUAL) {
 6377     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6378         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 6379         tmp7 = r12, tmp8 = r13;
 6380     Label LOOP;
 6381 
 6382     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6383     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6384     __ bind(LOOP);
 6385     if (usePrefetch) {
 6386       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 6387       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 6388     }
 6389     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 6390     __ eor(tmp1, tmp1, tmp2);
 6391     __ eor(tmp3, tmp3, tmp4);
 6392     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 6393     __ orr(tmp1, tmp1, tmp3);
 6394     __ cbnz(tmp1, NOT_EQUAL);
 6395     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6396     __ eor(tmp5, tmp5, tmp6);
 6397     __ eor(tmp7, tmp7, tmp8);
 6398     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6399     __ orr(tmp5, tmp5, tmp7);
 6400     __ cbnz(tmp5, NOT_EQUAL);
 6401     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 6402     __ eor(tmp1, tmp1, tmp2);
 6403     __ eor(tmp3, tmp3, tmp4);
 6404     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 6405     __ orr(tmp1, tmp1, tmp3);
 6406     __ cbnz(tmp1, NOT_EQUAL);
 6407     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6408     __ eor(tmp5, tmp5, tmp6);
 6409     __ sub(cnt1, cnt1, 8 * wordSize);
 6410     __ eor(tmp7, tmp7, tmp8);
 6411     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6412     // tmp6 is not used. MacroAssembler::subs is used here (rather than
 6413     // cmp) because subs allows an unlimited range of immediate operand.
 6414     __ subs(tmp6, cnt1, loopThreshold);
 6415     __ orr(tmp5, tmp5, tmp7);
 6416     __ cbnz(tmp5, NOT_EQUAL);
 6417     __ br(__ GE, LOOP);
 6418     // post-loop
 6419     __ eor(tmp1, tmp1, tmp2);
 6420     __ eor(tmp3, tmp3, tmp4);
 6421     __ orr(tmp1, tmp1, tmp3);
 6422     __ sub(cnt1, cnt1, 2 * wordSize);
 6423     __ cbnz(tmp1, NOT_EQUAL);
 6424   }
 6425 
 6426   void generate_large_array_equals_loop_simd(int loopThreshold,
 6427         bool usePrefetch, Label &NOT_EQUAL) {
 6428     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6429         tmp2 = rscratch2;
 6430     Label LOOP;
 6431 
 6432     __ bind(LOOP);
 6433     if (usePrefetch) {
 6434       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 6435       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 6436     }
 6437     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 6438     __ sub(cnt1, cnt1, 8 * wordSize);
 6439     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 6440     __ subs(tmp1, cnt1, loopThreshold);
 6441     __ eor(v0, __ T16B, v0, v4);
 6442     __ eor(v1, __ T16B, v1, v5);
 6443     __ eor(v2, __ T16B, v2, v6);
 6444     __ eor(v3, __ T16B, v3, v7);
 6445     __ orr(v0, __ T16B, v0, v1);
 6446     __ orr(v1, __ T16B, v2, v3);
 6447     __ orr(v0, __ T16B, v0, v1);
 6448     __ umov(tmp1, v0, __ D, 0);
 6449     __ umov(tmp2, v0, __ D, 1);
 6450     __ orr(tmp1, tmp1, tmp2);
 6451     __ cbnz(tmp1, NOT_EQUAL);
 6452     __ br(__ GE, LOOP);
 6453   }
 6454 
 6455   // a1 = r1 - array1 address
 6456   // a2 = r2 - array2 address
 6457   // result = r0 - return value. Already contains "false"
 6458   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 6459   // r3-r5 are reserved temporary registers
 6460   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
 6461   address generate_large_array_equals() {
 6462     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6463         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 6464         tmp7 = r12, tmp8 = r13;
 6465     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 6466         SMALL_LOOP, POST_LOOP;
 6467     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
  6468     // loop threshold chosen so that at least 32 of the prefetched bytes are actually used
 6469     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 6470     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 6471     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 6472     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 6473         tmp5, tmp6, tmp7, tmp8);
 6474 
 6475     __ align(CodeEntryAlignment);
 6476 
 6477     StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
 6478     StubCodeMark mark(this, stub_id);
 6479 
 6480     address entry = __ pc();
 6481     __ enter();
 6482     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 6483     // also advance pointers to use post-increment instead of pre-increment
 6484     __ add(a1, a1, wordSize);
 6485     __ add(a2, a2, wordSize);
 6486     if (AvoidUnalignedAccesses) {
  6487       // Both implementations (SIMD/non-SIMD) use relatively large load
  6488       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
  6489       // time) on some CPUs when the address is not at least 16-byte aligned.
  6490       // Arrays are currently 8-byte aligned, so, if needed, do one additional
  6491       // 8-byte load to make the first address 16-byte aligned.
 6492       Label ALIGNED16;
 6493       __ tbz(a1, 3, ALIGNED16);
 6494       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 6495       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 6496       __ sub(cnt1, cnt1, wordSize);
 6497       __ eor(tmp1, tmp1, tmp2);
 6498       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 6499       __ bind(ALIGNED16);
 6500     }
 6501     if (UseSIMDForArrayEquals) {
 6502       if (SoftwarePrefetchHintDistance >= 0) {
 6503         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 6504         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 6505         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 6506             /* prfm = */ true, NOT_EQUAL);
 6507         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 6508         __ br(__ LT, TAIL);
 6509       }
 6510       __ bind(NO_PREFETCH_LARGE_LOOP);
 6511       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 6512           /* prfm = */ false, NOT_EQUAL);
 6513     } else {
 6514       __ push(spilled_regs, sp);
 6515       if (SoftwarePrefetchHintDistance >= 0) {
 6516         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 6517         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 6518         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 6519             /* prfm = */ true, NOT_EQUAL);
 6520         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 6521         __ br(__ LT, TAIL);
 6522       }
 6523       __ bind(NO_PREFETCH_LARGE_LOOP);
 6524       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 6525           /* prfm = */ false, NOT_EQUAL);
 6526     }
 6527     __ bind(TAIL);
 6528       __ cbz(cnt1, EQUAL);
 6529       __ subs(cnt1, cnt1, wordSize);
 6530       __ br(__ LE, POST_LOOP);
 6531     __ bind(SMALL_LOOP);
 6532       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 6533       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 6534       __ subs(cnt1, cnt1, wordSize);
 6535       __ eor(tmp1, tmp1, tmp2);
 6536       __ cbnz(tmp1, NOT_EQUAL);
 6537       __ br(__ GT, SMALL_LOOP);
 6538     __ bind(POST_LOOP);
 6539       __ ldr(tmp1, Address(a1, cnt1));
 6540       __ ldr(tmp2, Address(a2, cnt1));
 6541       __ eor(tmp1, tmp1, tmp2);
 6542       __ cbnz(tmp1, NOT_EQUAL);
 6543     __ bind(EQUAL);
 6544       __ mov(result, true);
 6545     __ bind(NOT_EQUAL);
 6546       if (!UseSIMDForArrayEquals) {
 6547         __ pop(spilled_regs, sp);
 6548       }
 6549     __ bind(NOT_EQUAL_NO_POP);
 6550     __ leave();
 6551     __ ret(lr);
 6552     return entry;
 6553   }
 6554 
 6555   // result = r0 - return value. Contains initial hashcode value on entry.
 6556   // ary = r1 - array address
 6557   // cnt = r2 - elements count
 6558   // Clobbers: v0-v13, rscratch1, rscratch2
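         //
         // Computes the usual Java-style polynomial hash
         //   result = result * 31^cnt + ary[0] * 31^(cnt-1) + ... + ary[cnt-1]
         // (rough characterization), vectorized with vf lanes per register and an
         // unroll factor of 4; powers of 31 are kept in vpow/vpowm and the scalar
         // TAIL handles the remaining cnt % vf elements.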
 6559   address generate_large_arrays_hashcode(BasicType eltype) {
 6560     const Register result = r0, ary = r1, cnt = r2;
 6561     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 6562     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 6563     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 6564     const FloatRegister vpowm = v13;
 6565 
 6566     ARRAYS_HASHCODE_REGISTERS;
 6567 
 6568     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 6569 
 6570     unsigned int vf; // vectorization factor
 6571     bool multiply_by_halves;
 6572     Assembler::SIMD_Arrangement load_arrangement;
 6573     switch (eltype) {
 6574     case T_BOOLEAN:
 6575     case T_BYTE:
 6576       load_arrangement = Assembler::T8B;
 6577       multiply_by_halves = true;
 6578       vf = 8;
 6579       break;
 6580     case T_CHAR:
 6581     case T_SHORT:
 6582       load_arrangement = Assembler::T8H;
 6583       multiply_by_halves = true;
 6584       vf = 8;
 6585       break;
 6586     case T_INT:
 6587       load_arrangement = Assembler::T4S;
 6588       multiply_by_halves = false;
 6589       vf = 4;
 6590       break;
 6591     default:
 6592       ShouldNotReachHere();
 6593     }
 6594 
 6595     // Unroll factor
 6596     const unsigned uf = 4;
 6597 
 6598     // Effective vectorization factor
 6599     const unsigned evf = vf * uf;
 6600 
 6601     __ align(CodeEntryAlignment);
 6602 
 6603     StubGenStubId stub_id;
 6604     switch (eltype) {
 6605     case T_BOOLEAN:
 6606       stub_id = StubGenStubId::large_arrays_hashcode_boolean_id;
 6607       break;
 6608     case T_BYTE:
 6609       stub_id = StubGenStubId::large_arrays_hashcode_byte_id;
 6610       break;
 6611     case T_CHAR:
 6612       stub_id = StubGenStubId::large_arrays_hashcode_char_id;
 6613       break;
 6614     case T_SHORT:
 6615       stub_id = StubGenStubId::large_arrays_hashcode_short_id;
 6616       break;
 6617     case T_INT:
 6618       stub_id = StubGenStubId::large_arrays_hashcode_int_id;
 6619       break;
 6620     default:
 6621       stub_id = StubGenStubId::NO_STUBID;
 6622       ShouldNotReachHere();
 6623     };
 6624 
 6625     StubCodeMark mark(this, stub_id);
 6626 
 6627     address entry = __ pc();
 6628     __ enter();
 6629 
  6630     // Put the 0th-3rd powers of 31 together into a single SIMD register. The
  6631     // register will be used in the SMALL and LARGE LOOPs' epilogues. The
  6632     // initialization is hoisted here and the register's value shouldn't change throughout both loops.
 6633     __ movw(rscratch1, intpow(31U, 3));
 6634     __ mov(vpow, Assembler::S, 0, rscratch1);
 6635     __ movw(rscratch1, intpow(31U, 2));
 6636     __ mov(vpow, Assembler::S, 1, rscratch1);
 6637     __ movw(rscratch1, intpow(31U, 1));
 6638     __ mov(vpow, Assembler::S, 2, rscratch1);
 6639     __ movw(rscratch1, intpow(31U, 0));
 6640     __ mov(vpow, Assembler::S, 3, rscratch1);
 6641 
 6642     __ mov(vmul0, Assembler::T16B, 0);
 6643     __ mov(vmul0, Assembler::S, 3, result);
 6644 
 6645     __ andr(rscratch2, cnt, (uf - 1) * vf);
 6646     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 6647 
 6648     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 6649     __ mov(vpowm, Assembler::S, 0, rscratch1);
 6650 
 6651     // SMALL LOOP
 6652     __ bind(SMALL_LOOP);
 6653 
 6654     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 6655     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6656     __ subsw(rscratch2, rscratch2, vf);
 6657 
 6658     if (load_arrangement == Assembler::T8B) {
 6659       // Extend 8B to 8H to be able to use vector multiply
 6660       // instructions
 6661       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 6662       if (is_signed_subword_type(eltype)) {
 6663         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6664       } else {
 6665         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6666       }
 6667     }
 6668 
 6669     switch (load_arrangement) {
 6670     case Assembler::T4S:
 6671       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 6672       break;
 6673     case Assembler::T8B:
 6674     case Assembler::T8H:
 6675       assert(is_subword_type(eltype), "subword type expected");
 6676       if (is_signed_subword_type(eltype)) {
 6677         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6678       } else {
 6679         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6680       }
 6681       break;
 6682     default:
 6683       __ should_not_reach_here();
 6684     }
 6685 
 6686     // Process the upper half of a vector
 6687     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 6688       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6689       if (is_signed_subword_type(eltype)) {
 6690         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6691       } else {
 6692         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6693       }
 6694     }
 6695 
 6696     __ br(Assembler::HI, SMALL_LOOP);
 6697 
  6698     // SMALL LOOP'S EPILOGUE
 6699     __ lsr(rscratch2, cnt, exact_log2(evf));
 6700     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 6701 
 6702     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 6703     __ addv(vmul0, Assembler::T4S, vmul0);
 6704     __ umov(result, vmul0, Assembler::S, 0);
 6705 
 6706     // TAIL
 6707     __ bind(TAIL);
 6708 
  6709     // The andr computes cnt % vf. The subtract (shifted left by 3) backs the branch
  6710     // target off BR_BASE by cnt % vf load + madd pairs, i.e. only cnt % vf such pairs are executed.
 6711     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 6712     __ andr(rscratch2, cnt, vf - 1);
 6713     __ bind(TAIL_SHORTCUT);
 6714     __ adr(rscratch1, BR_BASE);
 6715     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
 6716     __ movw(rscratch2, 0x1f);
 6717     __ br(rscratch1);
 6718 
 6719     for (size_t i = 0; i < vf - 1; ++i) {
 6720       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 6721                                    eltype);
 6722       __ maddw(result, result, rscratch2, rscratch1);
 6723     }
 6724     __ bind(BR_BASE);
 6725 
 6726     __ leave();
 6727     __ ret(lr);
 6728 
 6729     // LARGE LOOP
 6730     __ bind(LARGE_LOOP_PREHEADER);
 6731 
 6732     __ lsr(rscratch2, cnt, exact_log2(evf));
 6733 
 6734     if (multiply_by_halves) {
 6735       // 31^4 - multiplier between lower and upper parts of a register
 6736       __ movw(rscratch1, intpow(31U, vf / 2));
 6737       __ mov(vpowm, Assembler::S, 1, rscratch1);
  6738       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 6739       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 6740       __ mov(vpowm, Assembler::S, 0, rscratch1);
 6741     } else {
 6742       // 31^16
 6743       __ movw(rscratch1, intpow(31U, evf));
 6744       __ mov(vpowm, Assembler::S, 0, rscratch1);
 6745     }
 6746 
 6747     __ mov(vmul3, Assembler::T16B, 0);
 6748     __ mov(vmul2, Assembler::T16B, 0);
 6749     __ mov(vmul1, Assembler::T16B, 0);
 6750 
 6751     __ bind(LARGE_LOOP);
 6752 
 6753     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 6754     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 6755     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 6756     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6757 
 6758     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 6759            Address(__ post(ary, evf * type2aelembytes(eltype))));
 6760 
 6761     if (load_arrangement == Assembler::T8B) {
 6762       // Extend 8B to 8H to be able to use vector multiply
 6763       // instructions
 6764       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 6765       if (is_signed_subword_type(eltype)) {
 6766         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 6767         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 6768         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 6769         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6770       } else {
 6771         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 6772         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 6773         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 6774         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6775       }
 6776     }
 6777 
 6778     switch (load_arrangement) {
 6779     case Assembler::T4S:
 6780       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 6781       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 6782       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 6783       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 6784       break;
 6785     case Assembler::T8B:
 6786     case Assembler::T8H:
 6787       assert(is_subword_type(eltype), "subword type expected");
 6788       if (is_signed_subword_type(eltype)) {
 6789         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 6790         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 6791         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 6792         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6793       } else {
 6794         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 6795         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 6796         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 6797         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6798       }
 6799       break;
 6800     default:
 6801       __ should_not_reach_here();
 6802     }
 6803 
 6804     // Process the upper half of a vector
 6805     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 6806       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 6807       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 6808       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 6809       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 6810       if (is_signed_subword_type(eltype)) {
 6811         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 6812         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 6813         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 6814         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6815       } else {
 6816         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 6817         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 6818         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 6819         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6820       }
 6821     }
 6822 
 6823     __ subsw(rscratch2, rscratch2, 1);
 6824     __ br(Assembler::HI, LARGE_LOOP);
 6825 
 6826     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 6827     __ addv(vmul3, Assembler::T4S, vmul3);
 6828     __ umov(result, vmul3, Assembler::S, 0);
 6829 
 6830     __ mov(rscratch2, intpow(31U, vf));
 6831 
 6832     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 6833     __ addv(vmul2, Assembler::T4S, vmul2);
 6834     __ umov(rscratch1, vmul2, Assembler::S, 0);
 6835     __ maddw(result, result, rscratch2, rscratch1);
 6836 
 6837     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 6838     __ addv(vmul1, Assembler::T4S, vmul1);
 6839     __ umov(rscratch1, vmul1, Assembler::S, 0);
 6840     __ maddw(result, result, rscratch2, rscratch1);
 6841 
 6842     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 6843     __ addv(vmul0, Assembler::T4S, vmul0);
 6844     __ umov(rscratch1, vmul0, Assembler::S, 0);
 6845     __ maddw(result, result, rscratch2, rscratch1);
 6846 
 6847     __ andr(rscratch2, cnt, vf - 1);
 6848     __ cbnz(rscratch2, TAIL_SHORTCUT);
 6849 
 6850     __ leave();
 6851     __ ret(lr);
 6852 
 6853     return entry;
 6854   }
 6855 
 6856   address generate_dsin_dcos(bool isCos) {
 6857     __ align(CodeEntryAlignment);
 6858     StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id);
 6859     StubCodeMark mark(this, stub_id);
 6860     address start = __ pc();
 6861     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 6862         (address)StubRoutines::aarch64::_two_over_pi,
 6863         (address)StubRoutines::aarch64::_pio2,
 6864         (address)StubRoutines::aarch64::_dsin_coef,
 6865         (address)StubRoutines::aarch64::_dcos_coef);
 6866     return start;
 6867   }
 6868 
  // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
 6870   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 6871       Label &DIFF2) {
 6872     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 6873     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 6874 
 6875     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 6876     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 6877     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 6878     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 6879 
 6880     __ fmovd(tmpL, vtmp3);
 6881     __ eor(rscratch2, tmp3, tmpL);
 6882     __ cbnz(rscratch2, DIFF2);
 6883 
 6884     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6885     __ umov(tmpL, vtmp3, __ D, 1);
 6886     __ eor(rscratch2, tmpU, tmpL);
 6887     __ cbnz(rscratch2, DIFF1);
 6888 
 6889     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 6890     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 6891     __ fmovd(tmpL, vtmp);
 6892     __ eor(rscratch2, tmp3, tmpL);
 6893     __ cbnz(rscratch2, DIFF2);
 6894 
 6895     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6896     __ umov(tmpL, vtmp, __ D, 1);
 6897     __ eor(rscratch2, tmpU, tmpL);
 6898     __ cbnz(rscratch2, DIFF1);
 6899   }
 6900 
 6901   // r0  = result
 6902   // r1  = str1
 6903   // r2  = cnt1
 6904   // r3  = str2
 6905   // r4  = cnt2
 6906   // r10 = tmp1
 6907   // r11 = tmp2
 6908   address generate_compare_long_string_different_encoding(bool isLU) {
 6909     __ align(CodeEntryAlignment);
 6910     StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id);
 6911     StubCodeMark mark(this, stub_id);
 6912     address entry = __ pc();
 6913     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 6914         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 6915         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 6916     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 6917         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 6918     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 6919     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 6920 
 6921     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 6922 
 6923     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the first 4 symbols that are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
 6926     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 6927     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 6928     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 6929     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 6930     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
 6931     __ eor(rscratch2, tmp1, tmp2);
 6932     __ mov(rscratch1, tmp2);
 6933     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 6934     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 6935              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 6936     __ push(spilled_regs, sp);
 6937     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 6938     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 6939 
 6940     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6941 
 6942     if (SoftwarePrefetchHintDistance >= 0) {
 6943       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 6944       __ br(__ LT, NO_PREFETCH);
 6945       __ bind(LARGE_LOOP_PREFETCH);
 6946         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 6947         __ mov(tmp4, 2);
 6948         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 6949         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 6950           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6951           __ subs(tmp4, tmp4, 1);
 6952           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 6953           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 6954           __ mov(tmp4, 2);
 6955         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 6956           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6957           __ subs(tmp4, tmp4, 1);
 6958           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 6959           __ sub(cnt2, cnt2, 64);
 6960           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 6961           __ br(__ GE, LARGE_LOOP_PREFETCH);
 6962     }
 6963     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 6964     __ bind(NO_PREFETCH);
 6965     __ subs(cnt2, cnt2, 16);
 6966     __ br(__ LT, TAIL);
 6967     __ align(OptoLoopAlignment);
 6968     __ bind(SMALL_LOOP); // smaller loop
 6969       __ subs(cnt2, cnt2, 16);
 6970       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6971       __ br(__ GE, SMALL_LOOP);
 6972       __ cmn(cnt2, (u1)16);
 6973       __ br(__ EQ, LOAD_LAST);
 6974     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 6975       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 6976       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 6977       __ ldr(tmp3, Address(cnt1, -8));
 6978       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 6979       __ b(LOAD_LAST);
 6980     __ bind(DIFF2);
 6981       __ mov(tmpU, tmp3);
 6982     __ bind(DIFF1);
 6983       __ pop(spilled_regs, sp);
 6984       __ b(CALCULATE_DIFFERENCE);
 6985     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
      // No need to load them again
 6988       __ mov(tmpU, tmp3);
 6989       __ pop(spilled_regs, sp);
 6990 
 6991       // tmp2 points to the address of the last 4 Latin1 characters right now
 6992       __ ldrs(vtmp, Address(tmp2));
 6993       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 6994       __ fmovd(tmpL, vtmp);
 6995 
 6996       __ eor(rscratch2, tmpU, tmpL);
 6997       __ cbz(rscratch2, DONE);
 6998 
 6999     // Find the first different characters in the longwords and
 7000     // compute their difference.
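    // The XOR of the two longwords is non-zero. rev + clz yield the bit
    // offset of the first differing byte (counted from the least significant
    // end); rounding it down to a multiple of 16 (andr with -16) aligns it to
    // a UTF-16 character, and lsrv/uxthw then extract that character from
    // each operand so the two can be subtracted.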
 7001     __ bind(CALCULATE_DIFFERENCE);
 7002       __ rev(rscratch2, rscratch2);
 7003       __ clz(rscratch2, rscratch2);
 7004       __ andr(rscratch2, rscratch2, -16);
 7005       __ lsrv(tmp1, tmp1, rscratch2);
 7006       __ uxthw(tmp1, tmp1);
 7007       __ lsrv(rscratch1, rscratch1, rscratch2);
 7008       __ uxthw(rscratch1, rscratch1);
 7009       __ subw(result, tmp1, rscratch1);
 7010     __ bind(DONE);
 7011       __ ret(lr);
 7012     return entry;
 7013   }
 7014 
 7015   // r0 = input (float16)
 7016   // v0 = result (float)
 7017   // v1 = temporary float register
 7018   address generate_float16ToFloat() {
 7019     __ align(CodeEntryAlignment);
 7020     StubGenStubId stub_id = StubGenStubId::hf2f_id;
 7021     StubCodeMark mark(this, stub_id);
 7022     address entry = __ pc();
 7023     BLOCK_COMMENT("Entry:");
 7024     __ flt16_to_flt(v0, r0, v1);
 7025     __ ret(lr);
 7026     return entry;
 7027   }
 7028 
 7029   // v0 = input (float)
 7030   // r0 = result (float16)
 7031   // v1 = temporary float register
 7032   address generate_floatToFloat16() {
 7033     __ align(CodeEntryAlignment);
 7034     StubGenStubId stub_id = StubGenStubId::f2hf_id;
 7035     StubCodeMark mark(this, stub_id);
 7036     address entry = __ pc();
 7037     BLOCK_COMMENT("Entry:");
 7038     __ flt_to_flt16(r0, v0, v1);
 7039     __ ret(lr);
 7040     return entry;
 7041   }
 7042 
 7043   address generate_method_entry_barrier() {
 7044     __ align(CodeEntryAlignment);
 7045     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
 7046     StubCodeMark mark(this, stub_id);
 7047 
 7048     Label deoptimize_label;
 7049 
 7050     address start = __ pc();
 7051 
 7052     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 7053 
 7054     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 7055       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 7056       // We can get here despite the nmethod being good, if we have not
 7057       // yet applied our cross modification fence (or data fence).
 7058       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 7059       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 7060       __ ldrw(rscratch2, rscratch2);
 7061       __ strw(rscratch2, thread_epoch_addr);
 7062       __ isb();
 7063       __ membar(__ LoadLoad);
 7064     }
 7065 
 7066     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 7067 
 7068     __ enter();
 7069     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 7070 
 7071     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
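    // These four words are only consumed on the slow path: deoptimize_label
    // below reloads them as {new sp, fp} and {lr, new pc}.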
 7072 
 7073     __ push_call_clobbered_registers();
 7074 
 7075     __ mov(c_rarg0, rscratch2);
 7076     __ call_VM_leaf
 7077          (CAST_FROM_FN_PTR
 7078           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 7079 
 7080     __ reset_last_Java_frame(true);
 7081 
 7082     __ mov(rscratch1, r0);
 7083 
 7084     __ pop_call_clobbered_registers();
 7085 
 7086     __ cbnz(rscratch1, deoptimize_label);
 7087 
 7088     __ leave();
 7089     __ ret(lr);
 7090 
 7091     __ BIND(deoptimize_label);
 7092 
 7093     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 7094     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 7095 
 7096     __ mov(sp, rscratch1);
 7097     __ br(rscratch2);
 7098 
 7099     return start;
 7100   }
 7101 
 7102   // r0  = result
 7103   // r1  = str1
 7104   // r2  = cnt1
 7105   // r3  = str2
 7106   // r4  = cnt2
 7107   // r10 = tmp1
 7108   // r11 = tmp2
 7109   address generate_compare_long_string_same_encoding(bool isLL) {
 7110     __ align(CodeEntryAlignment);
 7111     StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id);
 7112     StubCodeMark mark(this, stub_id);
 7113     address entry = __ pc();
 7114     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 7115         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 7116 
 7117     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 7118 
    // exit from the large loop when fewer than 64 bytes are left to read or we
    // are about to prefetch memory beyond the array boundary
 7121     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 7122 
    // 8 bytes from each string were already pre-loaded before jumping to this stub, so compare them directly
 7124     __ eor(rscratch2, tmp1, tmp2);
 7125     __ cbnz(rscratch2, CAL_DIFFERENCE);
 7126 
 7127     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
    // advance the pointers past the bytes already read
 7129     __ add(str1, str1, wordSize);
 7130     __ add(str2, str2, wordSize);
 7131     if (SoftwarePrefetchHintDistance >= 0) {
 7132       __ align(OptoLoopAlignment);
 7133       __ bind(LARGE_LOOP_PREFETCH);
 7134         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 7135         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 7136 
 7137         for (int i = 0; i < 4; i++) {
 7138           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 7139           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 7140           __ cmp(tmp1, tmp2);
 7141           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7142           __ br(Assembler::NE, DIFF);
 7143         }
 7144         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 7145         __ add(str1, str1, 64);
 7146         __ add(str2, str2, 64);
 7147         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 7148         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 7149         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 7150     }
 7151 
 7152     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 7153     __ br(Assembler::LE, LESS16);
 7154     __ align(OptoLoopAlignment);
 7155     __ bind(LOOP_COMPARE16);
 7156       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 7157       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 7158       __ cmp(tmp1, tmp2);
 7159       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7160       __ br(Assembler::NE, DIFF);
 7161       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 7162       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 7163       __ br(Assembler::LT, LESS16);
 7164 
 7165       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 7166       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 7167       __ cmp(tmp1, tmp2);
 7168       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7169       __ br(Assembler::NE, DIFF);
 7170       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 7171       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 7172       __ br(Assembler::GE, LOOP_COMPARE16);
 7173       __ cbz(cnt2, LENGTH_DIFF);
 7174 
 7175     __ bind(LESS16);
      // compare 8 bytes at a time
 7177       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 7178       __ br(Assembler::LE, LESS8);
 7179       __ ldr(tmp1, Address(__ post(str1, 8)));
 7180       __ ldr(tmp2, Address(__ post(str2, 8)));
 7181       __ eor(rscratch2, tmp1, tmp2);
 7182       __ cbnz(rscratch2, CAL_DIFFERENCE);
 7183       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 7184 
 7185     __ bind(LESS8); // directly load last 8 bytes
 7186       if (!isLL) {
 7187         __ add(cnt2, cnt2, cnt2);
 7188       }
 7189       __ ldr(tmp1, Address(str1, cnt2));
 7190       __ ldr(tmp2, Address(str2, cnt2));
 7191       __ eor(rscratch2, tmp1, tmp2);
 7192       __ cbz(rscratch2, LENGTH_DIFF);
 7193       __ b(CAL_DIFFERENCE);
 7194 
 7195     __ bind(DIFF);
 7196       __ cmp(tmp1, tmp2);
 7197       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 7198       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 7199       // reuse rscratch2 register for the result of eor instruction
 7200       __ eor(rscratch2, tmp1, tmp2);
 7201 
 7202     __ bind(CAL_DIFFERENCE);
 7203       __ rev(rscratch2, rscratch2);
 7204       __ clz(rscratch2, rscratch2);
 7205       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 7206       __ lsrv(tmp1, tmp1, rscratch2);
 7207       __ lsrv(tmp2, tmp2, rscratch2);
 7208       if (isLL) {
 7209         __ uxtbw(tmp1, tmp1);
 7210         __ uxtbw(tmp2, tmp2);
 7211       } else {
 7212         __ uxthw(tmp1, tmp1);
 7213         __ uxthw(tmp2, tmp2);
 7214       }
 7215       __ subw(result, tmp1, tmp2);
 7216 
 7217     __ bind(LENGTH_DIFF);
 7218       __ ret(lr);
 7219     return entry;
 7220   }
 7221 
 7222   enum string_compare_mode {
 7223     LL,
 7224     LU,
 7225     UL,
 7226     UU,
 7227   };
 7228 
 7229   // The following registers are declared in aarch64.ad
 7230   // r0  = result
 7231   // r1  = str1
 7232   // r2  = cnt1
 7233   // r3  = str2
 7234   // r4  = cnt2
 7235   // r10 = tmp1
 7236   // r11 = tmp2
 7237   // z0  = ztmp1
 7238   // z1  = ztmp2
 7239   // p0  = pgtmp1
 7240   // p1  = pgtmp2
 7241   address generate_compare_long_string_sve(string_compare_mode mode) {
 7242     StubGenStubId stub_id;
 7243     switch (mode) {
 7244       case LL: stub_id = StubGenStubId::compare_long_string_LL_id;  break;
 7245       case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break;
 7246       case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break;
 7247       case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break;
 7248       default: ShouldNotReachHere();
 7249     }
 7250 
 7251     __ align(CodeEntryAlignment);
 7252     address entry = __ pc();
 7253     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 7254              tmp1 = r10, tmp2 = r11;
 7255 
 7256     Label LOOP, DONE, MISMATCH;
 7257     Register vec_len = tmp1;
 7258     Register idx = tmp2;
 7259     // The minimum of the string lengths has been stored in cnt2.
 7260     Register cnt = cnt2;
 7261     FloatRegister ztmp1 = z0, ztmp2 = z1;
 7262     PRegister pgtmp1 = p0, pgtmp2 = p1;
 7263 
 7264 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 7265     switch (mode) {                                                            \
 7266       case LL:                                                                 \
 7267         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 7268         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 7269         break;                                                                 \
 7270       case LU:                                                                 \
 7271         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 7272         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 7273         break;                                                                 \
 7274       case UL:                                                                 \
 7275         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 7276         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 7277         break;                                                                 \
 7278       case UU:                                                                 \
 7279         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 7280         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 7281         break;                                                                 \
 7282       default:                                                                 \
 7283         ShouldNotReachHere();                                                  \
 7284     }
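    // Note: the src1/src2 macro arguments are placeholders only; the macro
    // body loads directly from str1 and str2.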
 7285 
 7286     StubCodeMark mark(this, stub_id);
 7287 
 7288     __ mov(idx, 0);
 7289     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 7290 
 7291     if (mode == LL) {
 7292       __ sve_cntb(vec_len);
 7293     } else {
 7294       __ sve_cnth(vec_len);
 7295     }
 7296 
 7297     __ sub(rscratch1, cnt, vec_len);
 7298 
 7299     __ bind(LOOP);
 7300 
 7301       // main loop
 7302       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 7303       __ add(idx, idx, vec_len);
 7304       // Compare strings.
 7305       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 7306       __ br(__ NE, MISMATCH);
 7307       __ cmp(idx, rscratch1);
 7308       __ br(__ LT, LOOP);
 7309 
 7310     // post loop, last iteration
 7311     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 7312 
 7313     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 7314     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 7315     __ br(__ EQ, DONE);
 7316 
 7317     __ bind(MISMATCH);
 7318 
 7319     // Crop the vector to find its location.
 7320     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 7321     // Extract the first different characters of each string.
 7322     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 7323     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 7324 
 7325     // Compute the difference of the first different characters.
 7326     __ sub(result, rscratch1, rscratch2);
 7327 
 7328     __ bind(DONE);
 7329     __ ret(lr);
 7330 #undef LOAD_PAIR
 7331     return entry;
 7332   }
 7333 
 7334   void generate_compare_long_strings() {
 7335     if (UseSVE == 0) {
 7336       StubRoutines::aarch64::_compare_long_string_LL
 7337           = generate_compare_long_string_same_encoding(true);
 7338       StubRoutines::aarch64::_compare_long_string_UU
 7339           = generate_compare_long_string_same_encoding(false);
 7340       StubRoutines::aarch64::_compare_long_string_LU
 7341           = generate_compare_long_string_different_encoding(true);
 7342       StubRoutines::aarch64::_compare_long_string_UL
 7343           = generate_compare_long_string_different_encoding(false);
 7344     } else {
 7345       StubRoutines::aarch64::_compare_long_string_LL
 7346           = generate_compare_long_string_sve(LL);
 7347       StubRoutines::aarch64::_compare_long_string_UU
 7348           = generate_compare_long_string_sve(UU);
 7349       StubRoutines::aarch64::_compare_long_string_LU
 7350           = generate_compare_long_string_sve(LU);
 7351       StubRoutines::aarch64::_compare_long_string_UL
 7352           = generate_compare_long_string_sve(UL);
 7353     }
 7354   }
 7355 
 7356   // R0 = result
 7357   // R1 = str2
 7358   // R2 = cnt1
 7359   // R3 = str1
 7360   // R4 = cnt2
 7361   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 7362   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the first register of the pattern (since
  // length >= 8) in order to skip the initial load (this helps on systems
  // with a single load pipeline)
  // 2) we use a "fast" algorithm for finding the first pattern character,
  // with fewer branches (one branch per loaded register instead of one per
  // symbol); this is where constants such as 0x0101...01, 0x00010001...0001,
  // 0x7f7f...7f and 0x7fff7fff...7fff come from (see the scalar sketch
  // following this comment)
  // 3) after the first register of the source string has been loaded and
  // analyzed, it can be reused to search for every occurrence of the first
  // character, saving a few loads compared to a "simpler-but-slower"
  // implementation
  // 4) to avoid lots of push/pop operations, the code below heavily reuses,
  // re-initializes and compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations
  // are issued during loads or branches, so the penalty is minimal
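  //
  // For reference, the scalar equivalent of the match detection in (2),
  // applied to x = loaded_chunk ^ (first_char replicated into every byte or
  // halfword), is:
  //   matched = (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f)
  // which is non-zero iff some byte of x is zero, i.e. some position of the
  // chunk equals the first pattern character. The 0x0001...0001 and
  // 0x7fff...7fff constants are the 16-bit (UTF-16) variant of the same test.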
 7377   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 7378     StubGenStubId stub_id;
 7379     if (str1_isL) {
 7380       if (str2_isL) {
 7381         stub_id = StubGenStubId::string_indexof_linear_ll_id;
 7382       } else {
 7383         stub_id = StubGenStubId::string_indexof_linear_ul_id;
 7384       }
 7385     } else {
 7386       if (str2_isL) {
 7387         ShouldNotReachHere();
 7388       } else {
 7389         stub_id = StubGenStubId::string_indexof_linear_uu_id;
 7390       }
 7391     }
 7392     __ align(CodeEntryAlignment);
 7393     StubCodeMark mark(this, stub_id);
 7394     address entry = __ pc();
 7395 
 7396     int str1_chr_size = str1_isL ? 1 : 2;
 7397     int str2_chr_size = str2_isL ? 1 : 2;
 7398     int str1_chr_shift = str1_isL ? 0 : 1;
 7399     int str2_chr_shift = str2_isL ? 0 : 1;
 7400     bool isL = str1_isL && str2_isL;
    // parameters
 7402     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 7403     // temporary registers
 7404     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 7405     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 7406     // redefinitions
 7407     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 7408 
 7409     __ push(spilled_regs, sp);
 7410     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 7411         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 7412         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 7413         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 7414         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 7415         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1. This is safe because length >= 8 here
    __ ldr(ch1, Address(str1));
    // Read a whole register from str2. This is safe because length >= 8 here
 7419     __ ldr(ch2, Address(str2));
 7420     __ sub(cnt2, cnt2, cnt1);
 7421     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 7422     if (str1_isL != str2_isL) {
 7423       __ eor(v0, __ T16B, v0, v0);
 7424     }
 7425     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 7426     __ mul(first, first, tmp1);
 7427     // check if we have less than 1 register to check
 7428     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 7429     if (str1_isL != str2_isL) {
 7430       __ fmovd(v1, ch1);
 7431     }
 7432     __ br(__ LE, L_SMALL);
 7433     __ eor(ch2, first, ch2);
 7434     if (str1_isL != str2_isL) {
 7435       __ zip1(v1, __ T16B, v1, v0);
 7436     }
 7437     __ sub(tmp2, ch2, tmp1);
 7438     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7439     __ bics(tmp2, tmp2, ch2);
 7440     if (str1_isL != str2_isL) {
 7441       __ fmovd(ch1, v1);
 7442     }
 7443     __ br(__ NE, L_HAS_ZERO);
 7444     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 7445     __ add(result, result, wordSize/str2_chr_size);
 7446     __ add(str2, str2, wordSize);
 7447     __ br(__ LT, L_POST_LOOP);
 7448     __ BIND(L_LOOP);
 7449       __ ldr(ch2, Address(str2));
 7450       __ eor(ch2, first, ch2);
 7451       __ sub(tmp2, ch2, tmp1);
 7452       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7453       __ bics(tmp2, tmp2, ch2);
 7454       __ br(__ NE, L_HAS_ZERO);
 7455     __ BIND(L_LOOP_PROCEED);
 7456       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 7457       __ add(str2, str2, wordSize);
 7458       __ add(result, result, wordSize/str2_chr_size);
 7459       __ br(__ GE, L_LOOP);
 7460     __ BIND(L_POST_LOOP);
 7461       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 7462       __ br(__ LE, NOMATCH);
 7463       __ ldr(ch2, Address(str2));
 7464       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 7465       __ eor(ch2, first, ch2);
 7466       __ sub(tmp2, ch2, tmp1);
 7467       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7468       __ mov(tmp4, -1); // all bits set
 7469       __ b(L_SMALL_PROCEED);
 7470     __ align(OptoLoopAlignment);
 7471     __ BIND(L_SMALL);
 7472       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 7473       __ eor(ch2, first, ch2);
 7474       if (str1_isL != str2_isL) {
 7475         __ zip1(v1, __ T16B, v1, v0);
 7476       }
 7477       __ sub(tmp2, ch2, tmp1);
 7478       __ mov(tmp4, -1); // all bits set
 7479       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7480       if (str1_isL != str2_isL) {
 7481         __ fmovd(ch1, v1); // move converted 4 symbols
 7482       }
 7483     __ BIND(L_SMALL_PROCEED);
 7484       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 7485       __ bic(tmp2, tmp2, ch2);
 7486       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 7487       __ rbit(tmp2, tmp2);
 7488       __ br(__ EQ, NOMATCH);
 7489     __ BIND(L_SMALL_HAS_ZERO_LOOP);
 7490       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
 7491       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 7492       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 7493       if (str2_isL) { // LL
 7494         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 7495         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 7496         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 7497         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 7498         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7499       } else {
 7500         __ mov(ch2, 0xE); // all bits in byte set except last one
 7501         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7502         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7503         __ lslv(tmp2, tmp2, tmp4);
 7504         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7505         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7506         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7507         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7508       }
 7509       __ cmp(ch1, ch2);
 7510       __ mov(tmp4, wordSize/str2_chr_size);
 7511       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7512     __ BIND(L_SMALL_CMP_LOOP);
 7513       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 7514                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 7515       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 7516                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 7517       __ add(tmp4, tmp4, 1);
 7518       __ cmp(tmp4, cnt1);
 7519       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 7520       __ cmp(first, ch2);
 7521       __ br(__ EQ, L_SMALL_CMP_LOOP);
 7522     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 7523       __ cbz(tmp2, NOMATCH); // no more matches. exit
 7524       __ clz(tmp4, tmp2);
 7525       __ add(result, result, 1); // advance index
 7526       __ add(str2, str2, str2_chr_size); // advance pointer
 7527       __ b(L_SMALL_HAS_ZERO_LOOP);
 7528     __ align(OptoLoopAlignment);
 7529     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 7530       __ cmp(first, ch2);
 7531       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7532       __ b(DONE);
 7533     __ align(OptoLoopAlignment);
 7534     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 7535       if (str2_isL) { // LL
 7536         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 7537         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 7538         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 7539         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 7540         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7541       } else {
 7542         __ mov(ch2, 0xE); // all bits in byte set except last one
 7543         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7544         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7545         __ lslv(tmp2, tmp2, tmp4);
 7546         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7547         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7548         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7549         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7550       }
 7551       __ cmp(ch1, ch2);
 7552       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7553       __ b(DONE);
 7554     __ align(OptoLoopAlignment);
 7555     __ BIND(L_HAS_ZERO);
 7556       __ rbit(tmp2, tmp2);
 7557       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
      // Now, compress the counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and are not changed in this loop;
      // they are simply restored on exit, so cnt1 can be re-used in this loop.
 7561       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 7562       __ sub(result, result, 1);
 7563     __ BIND(L_HAS_ZERO_LOOP);
 7564       __ mov(cnt1, wordSize/str2_chr_size);
 7565       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 7566       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 7567       if (str2_isL) {
 7568         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 7569         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7570         __ lslv(tmp2, tmp2, tmp4);
 7571         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7572         __ add(tmp4, tmp4, 1);
 7573         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7574         __ lsl(tmp2, tmp2, 1);
 7575         __ mov(tmp4, wordSize/str2_chr_size);
 7576       } else {
 7577         __ mov(ch2, 0xE);
 7578         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7579         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7580         __ lslv(tmp2, tmp2, tmp4);
 7581         __ add(tmp4, tmp4, 1);
 7582         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7583         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 7584         __ lsl(tmp2, tmp2, 1);
 7585         __ mov(tmp4, wordSize/str2_chr_size);
 7586         __ sub(str2, str2, str2_chr_size);
 7587       }
 7588       __ cmp(ch1, ch2);
 7589       __ mov(tmp4, wordSize/str2_chr_size);
 7590       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7591     __ BIND(L_CMP_LOOP);
 7592       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 7593                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 7594       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 7595                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 7596       __ add(tmp4, tmp4, 1);
 7597       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 7598       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 7599       __ cmp(cnt1, ch2);
 7600       __ br(__ EQ, L_CMP_LOOP);
 7601     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at the current position
 7603       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 7604       __ clz(tmp4, tmp2);
 7605       __ add(str2, str2, str2_chr_size); // advance pointer
 7606       __ b(L_HAS_ZERO_LOOP);
 7607     __ align(OptoLoopAlignment);
 7608     __ BIND(L_CMP_LOOP_LAST_CMP);
 7609       __ cmp(cnt1, ch2);
 7610       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7611       __ b(DONE);
 7612     __ align(OptoLoopAlignment);
 7613     __ BIND(L_CMP_LOOP_LAST_CMP2);
 7614       if (str2_isL) {
 7615         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 7616         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7617         __ lslv(tmp2, tmp2, tmp4);
 7618         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7619         __ add(tmp4, tmp4, 1);
 7620         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7621         __ lsl(tmp2, tmp2, 1);
 7622       } else {
 7623         __ mov(ch2, 0xE);
 7624         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7625         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7626         __ lslv(tmp2, tmp2, tmp4);
 7627         __ add(tmp4, tmp4, 1);
 7628         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7629         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 7630         __ lsl(tmp2, tmp2, 1);
 7631         __ sub(str2, str2, str2_chr_size);
 7632       }
 7633       __ cmp(ch1, ch2);
 7634       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7635       __ b(DONE);
 7636     __ align(OptoLoopAlignment);
 7637     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
      // until the L_HAS_ZERO block. A byte octet was analyzed in L_HAS_ZERO_LOOP,
      // so result was increased by at most wordSize/str2_chr_size - 1 and the
      // respective high bits were not changed. L_LOOP_PROCEED will increase
      // result by the number of analyzed characters, so we can simply reset the
      // lower bits of result here: clear the 2 lower bits for UU/UL and 3 bits
      // for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the index
      // of the last analyzed substring inside the current octet, so str2 is at
      // the respective start address; we need to advance it to the next octet.
 7648       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 7649       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 7650       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 7651       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 7652       __ movw(cnt2, cnt2);
 7653       __ b(L_LOOP_PROCEED);
 7654     __ align(OptoLoopAlignment);
 7655     __ BIND(NOMATCH);
 7656       __ mov(result, -1);
 7657     __ BIND(DONE);
 7658       __ pop(spilled_regs, sp);
 7659       __ ret(lr);
 7660     return entry;
 7661   }
 7662 
 7663   void generate_string_indexof_stubs() {
 7664     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 7665     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 7666     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 7667   }
 7668 
 7669   void inflate_and_store_2_fp_registers(bool generatePrfm,
 7670       FloatRegister src1, FloatRegister src2) {
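    // v0 is expected to be zero on entry (see the register comments of the
    // caller), so zip1/zip2 interleave each source byte with a zero byte,
    // widening Latin1 bytes to UTF-16 characters; zip1 produces the low half
    // and zip2 the high half of each 16-byte source register.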
 7671     Register dst = r1;
 7672     __ zip1(v1, __ T16B, src1, v0);
 7673     __ zip2(v2, __ T16B, src1, v0);
 7674     if (generatePrfm) {
 7675       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 7676     }
 7677     __ zip1(v3, __ T16B, src2, v0);
 7678     __ zip2(v4, __ T16B, src2, v0);
 7679     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 7680   }
 7681 
 7682   // R0 = src
 7683   // R1 = dst
 7684   // R2 = len
 7685   // R3 = len >> 3
 7686   // V0 = 0
 7687   // v1 = loaded 8 bytes
 7688   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 7689   address generate_large_byte_array_inflate() {
 7690     __ align(CodeEntryAlignment);
 7691     StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id;
 7692     StubCodeMark mark(this, stub_id);
 7693     address entry = __ pc();
 7694     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 7695     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 7696     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 7697 
    // do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets a single store instruction be used
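    // The first 8 source bytes are already in v1 (see the register comments
    // above); together with the 8 bytes loaded into v2 here they form the
    // single 32-byte store below.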
 7700     __ ldrd(v2, __ post(src, 8));
 7701     __ sub(octetCounter, octetCounter, 2);
 7702     __ zip1(v1, __ T16B, v1, v0);
 7703     __ zip1(v2, __ T16B, v2, v0);
 7704     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 7705     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7706     __ subs(rscratch1, octetCounter, large_loop_threshold);
 7707     __ br(__ LE, LOOP_START);
 7708     __ b(LOOP_PRFM_START);
 7709     __ bind(LOOP_PRFM);
 7710       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7711     __ bind(LOOP_PRFM_START);
 7712       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 7713       __ sub(octetCounter, octetCounter, 8);
 7714       __ subs(rscratch1, octetCounter, large_loop_threshold);
 7715       inflate_and_store_2_fp_registers(true, v3, v4);
 7716       inflate_and_store_2_fp_registers(true, v5, v6);
 7717       __ br(__ GT, LOOP_PRFM);
 7718       __ cmp(octetCounter, (u1)8);
 7719       __ br(__ LT, DONE);
 7720     __ bind(LOOP);
 7721       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7722       __ bind(LOOP_START);
 7723       __ sub(octetCounter, octetCounter, 8);
 7724       __ cmp(octetCounter, (u1)8);
 7725       inflate_and_store_2_fp_registers(false, v3, v4);
 7726       inflate_and_store_2_fp_registers(false, v5, v6);
 7727       __ br(__ GE, LOOP);
 7728     __ bind(DONE);
 7729       __ ret(lr);
 7730     return entry;
 7731   }
 7732 
 7733   /**
 7734    *  Arguments:
 7735    *
 7736    *  Input:
 7737    *  c_rarg0   - current state address
 7738    *  c_rarg1   - H key address
 7739    *  c_rarg2   - data address
 7740    *  c_rarg3   - number of blocks
 7741    *
 7742    *  Output:
 7743    *  Updated state at c_rarg0
 7744    */
 7745   address generate_ghash_processBlocks() {
 7746     // Bafflingly, GCM uses little-endian for the byte order, but
 7747     // big-endian for the bit order.  For example, the polynomial 1 is
 7748     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 7749     //
 7750     // So, we must either reverse the bytes in each word and do
 7751     // everything big-endian or reverse the bits in each byte and do
 7752     // it little-endian.  On AArch64 it's more idiomatic to reverse
 7753     // the bits in each byte (we have an instruction, RBIT, to do
 7754     // that) and keep the data in little-endian bit order through the
 7755     // calculation, bit-reversing the inputs and outputs.
 7756 
 7757     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
 7758     StubCodeMark mark(this, stub_id);
 7759     __ align(wordSize * 2);
 7760     address p = __ pc();
 7761     __ emit_int64(0x87);  // The low-order bits of the field
 7762                           // polynomial (i.e. p = z^7+z^2+z+1)
 7763                           // repeated in the low and high parts of a
 7764                           // 128-bit vector
 7765     __ emit_int64(0x87);
 7766 
 7767     __ align(CodeEntryAlignment);
 7768     address start = __ pc();
 7769 
 7770     Register state   = c_rarg0;
 7771     Register subkeyH = c_rarg1;
 7772     Register data    = c_rarg2;
 7773     Register blocks  = c_rarg3;
 7774 
 7775     FloatRegister vzr = v30;
 7776     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 7777 
 7778     __ ldrq(v24, p);    // The field polynomial
 7779 
 7780     __ ldrq(v0, Address(state));
 7781     __ ldrq(v1, Address(subkeyH));
 7782 
 7783     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 7784     __ rbit(v0, __ T16B, v0);
 7785     __ rev64(v1, __ T16B, v1);
 7786     __ rbit(v1, __ T16B, v1);
 7787 
 7788     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 7789     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 7790 
 7791     {
 7792       Label L_ghash_loop;
 7793       __ bind(L_ghash_loop);
 7794 
 7795       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 7796                                                  // reversing each byte
 7797       __ rbit(v2, __ T16B, v2);
 7798       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 7799 
 7800       // Multiply state in v2 by subkey in v1
 7801       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 7802                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 7803                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 7804       // Reduce v7:v5 by the field polynomial
 7805       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 7806 
 7807       __ sub(blocks, blocks, 1);
 7808       __ cbnz(blocks, L_ghash_loop);
 7809     }
 7810 
 7811     // The bit-reversed result is at this point in v0
 7812     __ rev64(v0, __ T16B, v0);
 7813     __ rbit(v0, __ T16B, v0);
 7814 
 7815     __ st1(v0, __ T16B, state);
 7816     __ ret(lr);
 7817 
 7818     return start;
 7819   }
 7820 
 7821   address generate_ghash_processBlocks_wide() {
 7822     address small = generate_ghash_processBlocks();
 7823 
 7824     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id;
 7825     StubCodeMark mark(this, stub_id);
 7826     __ align(wordSize * 2);
 7827     address p = __ pc();
 7828     __ emit_int64(0x87);  // The low-order bits of the field
 7829                           // polynomial (i.e. p = z^7+z^2+z+1)
 7830                           // repeated in the low and high parts of a
 7831                           // 128-bit vector
 7832     __ emit_int64(0x87);
 7833 
 7834     __ align(CodeEntryAlignment);
 7835     address start = __ pc();
 7836 
 7837     Register state   = c_rarg0;
 7838     Register subkeyH = c_rarg1;
 7839     Register data    = c_rarg2;
 7840     Register blocks  = c_rarg3;
 7841 
 7842     const int unroll = 4;
 7843 
 7844     __ cmp(blocks, (unsigned char)(unroll * 2));
 7845     __ br(__ LT, small);
 7846 
 7847     if (unroll > 1) {
      // Save state before entering routine
 7849       __ sub(sp, sp, 4 * 16);
 7850       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 7851       __ sub(sp, sp, 4 * 16);
 7852       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 7853     }
 7854 
 7855     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
 7856 
 7857     if (unroll > 1) {
 7858       // And restore state
 7859       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 7860       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 7861     }
 7862 
 7863     __ cmp(blocks, (unsigned char)0);
 7864     __ br(__ GT, small);
 7865 
 7866     __ ret(lr);
 7867 
 7868     return start;
 7869   }
 7870 
 7871   void generate_base64_encode_simdround(Register src, Register dst,
 7872         FloatRegister codec, u8 size) {
 7873 
 7874     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 7875     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 7876     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 7877 
 7878     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 7879 
 7880     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 7881 
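    // Each group of three source bytes (b0, b1, b2) is split into four 6-bit
    // codec indices, computed below with per-lane shifts and ORs:
    //   ind0 = b0 >> 2
    //   ind1 = ((b0 & 0x3) << 4) | (b1 >> 4)
    //   ind2 = ((b1 & 0xf) << 2) | (b2 >> 6)
    //   ind3 = b2 & 0x3f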
 7882     __ ushr(ind0, arrangement, in0,  2);
 7883 
 7884     __ ushr(ind1, arrangement, in1,  2);
 7885     __ shl(in0,   arrangement, in0,  6);
 7886     __ orr(ind1,  arrangement, ind1, in0);
 7887     __ ushr(ind1, arrangement, ind1, 2);
 7888 
 7889     __ ushr(ind2, arrangement, in2,  4);
 7890     __ shl(in1,   arrangement, in1,  4);
 7891     __ orr(ind2,  arrangement, in1,  ind2);
 7892     __ ushr(ind2, arrangement, ind2, 2);
 7893 
 7894     __ shl(ind3,  arrangement, in2,  2);
 7895     __ ushr(ind3, arrangement, ind3, 2);
 7896 
 7897     __ tbl(out0,  arrangement, codec,  4, ind0);
 7898     __ tbl(out1,  arrangement, codec,  4, ind1);
 7899     __ tbl(out2,  arrangement, codec,  4, ind2);
 7900     __ tbl(out3,  arrangement, codec,  4, ind3);
 7901 
 7902     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 7903   }
 7904 
 7905    /**
 7906    *  Arguments:
 7907    *
 7908    *  Input:
 7909    *  c_rarg0   - src_start
 7910    *  c_rarg1   - src_offset
 7911    *  c_rarg2   - src_length
 7912    *  c_rarg3   - dest_start
 7913    *  c_rarg4   - dest_offset
 7914    *  c_rarg5   - isURL
 7915    *
 7916    */
 7917   address generate_base64_encodeBlock() {
 7918 
 7919     static const char toBase64[64] = {
 7920       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 7921       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 7922       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 7923       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 7924       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 7925     };
 7926 
 7927     static const char toBase64URL[64] = {
 7928       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 7929       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 7930       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 7931       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 7932       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 7933     };
 7934 
 7935     __ align(CodeEntryAlignment);
 7936     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
 7937     StubCodeMark mark(this, stub_id);
 7938     address start = __ pc();
 7939 
 7940     Register src   = c_rarg0;  // source array
 7941     Register soff  = c_rarg1;  // source start offset
 7942     Register send  = c_rarg2;  // source end offset
 7943     Register dst   = c_rarg3;  // dest array
 7944     Register doff  = c_rarg4;  // position for writing to dest array
 7945     Register isURL = c_rarg5;  // Base64 or URL character set
 7946 
 7947     // c_rarg6 and c_rarg7 are free to use as temps
 7948     Register codec  = c_rarg6;
 7949     Register length = c_rarg7;
 7950 
 7951     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 7952 
 7953     __ add(src, src, soff);
 7954     __ add(dst, dst, doff);
 7955     __ sub(length, send, soff);
 7956 
 7957     // load the codec base address
 7958     __ lea(codec, ExternalAddress((address) toBase64));
 7959     __ cbz(isURL, ProcessData);
 7960     __ lea(codec, ExternalAddress((address) toBase64URL));
 7961 
 7962     __ BIND(ProcessData);
 7963 
    // too short to form a SIMD loop; fall back to the scalar path
 7965     __ cmp(length, (u1)24);
 7966     __ br(Assembler::LT, Process3B);
 7967 
 7968     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 7969 
 7970     __ BIND(Process48B);
 7971     __ cmp(length, (u1)48);
 7972     __ br(Assembler::LT, Process24B);
 7973     generate_base64_encode_simdround(src, dst, v0, 16);
 7974     __ sub(length, length, 48);
 7975     __ b(Process48B);
 7976 
 7977     __ BIND(Process24B);
 7978     __ cmp(length, (u1)24);
 7979     __ br(Assembler::LT, SIMDExit);
 7980     generate_base64_encode_simdround(src, dst, v0, 8);
 7981     __ sub(length, length, 24);
 7982 
 7983     __ BIND(SIMDExit);
 7984     __ cbz(length, Exit);
 7985 
 7986     __ BIND(Process3B);
 7987     //  3 src bytes, 24 bits
 7988     __ ldrb(r10, __ post(src, 1));
 7989     __ ldrb(r11, __ post(src, 1));
 7990     __ ldrb(r12, __ post(src, 1));
 7991     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 7992     __ orrw(r12, r12, r11, Assembler::LSL, 8);
 7993     // codec index
 7994     __ ubfmw(r15, r12, 18, 23);
 7995     __ ubfmw(r14, r12, 12, 17);
 7996     __ ubfmw(r13, r12, 6,  11);
 7997     __ andw(r12,  r12, 63);
 7998     // get the code based on the codec
 7999     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 8000     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 8001     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 8002     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 8003     __ strb(r15, __ post(dst, 1));
 8004     __ strb(r14, __ post(dst, 1));
 8005     __ strb(r13, __ post(dst, 1));
 8006     __ strb(r12, __ post(dst, 1));
 8007     __ sub(length, length, 3);
 8008     __ cbnz(length, Process3B);
 8009 
 8010     __ BIND(Exit);
 8011     __ ret(lr);
 8012 
 8013     return start;
 8014   }
 8015 
 8016   void generate_base64_decode_simdround(Register src, Register dst,
 8017         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 8018 
 8019     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 8020     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 8021 
 8022     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 8023     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 8024 
 8025     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 8026 
 8027     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 8028 
 8029     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 8030 
    // we need an unsigned saturating subtract to make sure that all input
    // values in the range [0, 63] map to 0 in the higher-half lookup
 8033     __ uqsubv(decH0, __ T16B, in0, v27);
 8034     __ uqsubv(decH1, __ T16B, in1, v27);
 8035     __ uqsubv(decH2, __ T16B, in2, v27);
 8036     __ uqsubv(decH3, __ T16B, in3, v27);
 8037 
 8038     // lower half lookup
 8039     __ tbl(decL0, arrangement, codecL, 4, in0);
 8040     __ tbl(decL1, arrangement, codecL, 4, in1);
 8041     __ tbl(decL2, arrangement, codecL, 4, in2);
 8042     __ tbl(decL3, arrangement, codecL, 4, in3);
 8043 
 8044     // higher half lookup
 8045     __ tbx(decH0, arrangement, codecH, 4, decH0);
 8046     __ tbx(decH1, arrangement, codecH, 4, decH1);
 8047     __ tbx(decH2, arrangement, codecH, 4, decH2);
 8048     __ tbx(decH3, arrangement, codecH, 4, decH3);
 8049 
 8050     // combine lower and higher
 8051     __ orr(decL0, arrangement, decL0, decH0);
 8052     __ orr(decL1, arrangement, decL1, decH1);
 8053     __ orr(decL2, arrangement, decL2, decH2);
 8054     __ orr(decL3, arrangement, decL3, decH3);
 8055 
    // check for illegal inputs: any value larger than 63 (the maximum for 6 bits)
 8057     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 8058     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 8059     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 8060     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 8061     __ orr(in0, arrangement, decH0, decH1);
 8062     __ orr(in1, arrangement, decH2, decH3);
 8063     __ orr(in2, arrangement, in0,   in1);
 8064     __ umaxv(in3, arrangement, in2);
 8065     __ umov(rscratch2, in3, __ B, 0);
 8066 
 8067     // get the data to output
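    // Repack the four 6-bit values (decL0..decL3) into three output bytes:
    //   out0 = (decL0 << 2) | (decL1 >> 4)
    //   out1 = (decL1 << 4) | (decL2 >> 2)
    //   out2 = (decL2 << 6) |  decL3
    // (all operations are per 8-bit lane, so shifted-out bits are dropped)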
 8068     __ shl(out0,  arrangement, decL0, 2);
 8069     __ ushr(out1, arrangement, decL1, 4);
 8070     __ orr(out0,  arrangement, out0,  out1);
 8071     __ shl(out1,  arrangement, decL1, 4);
 8072     __ ushr(out2, arrangement, decL2, 2);
 8073     __ orr(out1,  arrangement, out1,  out2);
 8074     __ shl(out2,  arrangement, decL2, 6);
 8075     __ orr(out2,  arrangement, out2,  decL3);
 8076 
 8077     __ cbz(rscratch2, NoIllegalData);
 8078 
 8079     // handle illegal input
 8080     __ umov(r10, in2, __ D, 0);
 8081     if (size == 16) {
 8082       __ cbnz(r10, ErrorInLowerHalf);
 8083 
 8084       // illegal input is in higher half, store the lower half now.
 8085       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 8086 
 8087       __ umov(r10, in2,  __ D, 1);
 8088       __ umov(r11, out0, __ D, 1);
 8089       __ umov(r12, out1, __ D, 1);
 8090       __ umov(r13, out2, __ D, 1);
 8091       __ b(StoreLegalData);
 8092 
 8093       __ BIND(ErrorInLowerHalf);
 8094     }
 8095     __ umov(r11, out0, __ D, 0);
 8096     __ umov(r12, out1, __ D, 0);
 8097     __ umov(r13, out2, __ D, 0);
 8098 
 8099     __ BIND(StoreLegalData);
 8100     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
 8101     __ strb(r11, __ post(dst, 1));
 8102     __ strb(r12, __ post(dst, 1));
 8103     __ strb(r13, __ post(dst, 1));
 8104     __ lsr(r10, r10, 8);
 8105     __ lsr(r11, r11, 8);
 8106     __ lsr(r12, r12, 8);
 8107     __ lsr(r13, r13, 8);
 8108     __ b(StoreLegalData);
 8109 
 8110     __ BIND(NoIllegalData);
 8111     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
 8112   }
 8113 
 8114 
 8115    /**
 8116    *  Arguments:
 8117    *
 8118    *  Input:
 8119    *  c_rarg0   - src_start
 8120    *  c_rarg1   - src_offset
 8121    *  c_rarg2   - src_length
 8122    *  c_rarg3   - dest_start
 8123    *  c_rarg4   - dest_offset
 8124    *  c_rarg5   - isURL
 8125    *  c_rarg6   - isMIME
 8126    *
 8127    */
 8128   address generate_base64_decodeBlock() {
 8129 
 8130     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
 8131     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
 8132     // titled "Base64 decoding".
 8133 
    // Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
    // except that the trailing character '=' is also treated as an illegal value in this intrinsic.
    // That is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
 8137     static const uint8_t fromBase64ForNoSIMD[256] = {
 8138       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8139       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8140       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 8141        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8142       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 8143        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
 8144       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 8145        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 8146       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8147       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8148       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8149       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8150       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8151       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8152       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8153       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8154     };
 8155 
 8156     static const uint8_t fromBase64URLForNoSIMD[256] = {
 8157       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8158       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8159       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 8160        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8161       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 8162        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
 8163       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 8164        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 8165       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8166       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8167       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8168       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8169       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8170       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8171       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8172       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8173     };
 8174 
    // A legal base64 code value is in the range [0, 127].  We need two table
    // lookups with tbl/tbx and combine the results to get the decoded data.
    // The 1st vector table lookup uses tbl: out-of-range indices are set to 0
    // in the destination.  The 2nd vector table lookup uses tbx: out-of-range
    // indices leave the destination unchanged.  Input [64..126] is mapped to
    // index [65, 127] in the second lookup.  The value at index 64 is set to
    // 0, so that we know we already got the decoded data with the 1st lookup.
 8182     static const uint8_t fromBase64ForSIMD[128] = {
 8183       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8184       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8185       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 8186        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8187         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 8188        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 8189       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 8190        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 8191     };
 8192 
 8193     static const uint8_t fromBase64URLForSIMD[128] = {
 8194       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8195       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8196       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 8197        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8198         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 8199        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 8200        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 8201        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 8202     };
 8203 
 8204     __ align(CodeEntryAlignment);
 8205     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
 8206     StubCodeMark mark(this, stub_id);
 8207     address start = __ pc();
 8208 
 8209     Register src    = c_rarg0;  // source array
 8210     Register soff   = c_rarg1;  // source start offset
 8211     Register send   = c_rarg2;  // source end offset
 8212     Register dst    = c_rarg3;  // dest array
 8213     Register doff   = c_rarg4;  // position for writing to dest array
 8214     Register isURL  = c_rarg5;  // Base64 or URL character set
 8215     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
 8216 
 8217     Register length = send;    // reuse send as length of source data to process
 8218 
 8219     Register simd_codec   = c_rarg6;
 8220     Register nosimd_codec = c_rarg7;
 8221 
 8222     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
 8223 
 8224     __ enter();
 8225 
 8226     __ add(src, src, soff);
 8227     __ add(dst, dst, doff);
 8228 
    __ mov(doff, dst);        // save the original dst so the number of bytes written can be returned at Exit
 8230 
 8231     __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1); // length &= ~3: decode whole 4-byte groups only
 8233 
 8234     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
 8235     __ cbz(isURL, ProcessData);
 8236     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
 8237 
 8238     __ BIND(ProcessData);
 8239     __ mov(rscratch1, length);
 8240     __ cmp(length, (u1)144); // 144 = 80 + 64
 8241     __ br(Assembler::LT, Process4B);
 8242 
 8243     // In the MIME case, the line length cannot be more than 76
 8244     // bytes (see RFC 2045). This is too short a block for SIMD
 8245     // to be worthwhile, so we use non-SIMD here.
 8246     __ movw(rscratch1, 79);
 8247 
 8248     __ BIND(Process4B);
 8249     __ ldrw(r14, __ post(src, 4));
 8250     __ ubfxw(r10, r14, 0,  8);
 8251     __ ubfxw(r11, r14, 8,  8);
 8252     __ ubfxw(r12, r14, 16, 8);
 8253     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
 8255     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
 8256     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
 8257     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
 8258     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
 8259     // error detection, 255u indicates an illegal input
 8260     __ orrw(r14, r10, r11);
 8261     __ orrw(r15, r12, r13);
 8262     __ orrw(r14, r14, r15);
 8263     __ tbnz(r14, 7, Exit);
 8264     // recover the data
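    // In C, approximately, with sextets a = r10, b = r11, c = r12, d = r13:
    //   out[0] = (a << 2) | (b >> 4);
    //   out[1] = (b << 4) | (c >> 2);
    //   out[2] = (c << 6) | d;
    // The lsl/bfi/bfm/rev16 sequence below assembles out[0..1] into r14
    // (stored as a halfword) and out[2] into r13.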
 8265     __ lslw(r14, r10, 10);
 8266     __ bfiw(r14, r11, 4, 6);
 8267     __ bfmw(r14, r12, 2, 5);
 8268     __ rev16w(r14, r14);
 8269     __ bfiw(r13, r12, 6, 2);
 8270     __ strh(r14, __ post(dst, 2));
 8271     __ strb(r13, __ post(dst, 1));
 8272     // non-simd loop
 8273     __ subsw(rscratch1, rscratch1, 4);
 8274     __ br(Assembler::GT, Process4B);
 8275 
 8276     // if exiting from PreProcess80B, rscratch1 == -1;
 8277     // otherwise, rscratch1 == 0.
 8278     __ cbzw(rscratch1, Exit);
 8279     __ sub(length, length, 80);
 8280 
 8281     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
 8282     __ cbz(isURL, SIMDEnter);
 8283     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
 8284 
 8285     __ BIND(SIMDEnter);
 8286     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
 8287     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
 8288     __ mov(rscratch1, 63);
 8289     __ dup(v27, __ T16B, rscratch1);
 8290 
 8291     __ BIND(Process64B);
 8292     __ cmp(length, (u1)64);
 8293     __ br(Assembler::LT, Process32B);
 8294     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
 8295     __ sub(length, length, 64);
 8296     __ b(Process64B);
 8297 
 8298     __ BIND(Process32B);
 8299     __ cmp(length, (u1)32);
 8300     __ br(Assembler::LT, SIMDExit);
 8301     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
 8302     __ sub(length, length, 32);
 8303     __ b(Process32B);
 8304 
 8305     __ BIND(SIMDExit);
 8306     __ cbz(length, Exit);
 8307     __ movw(rscratch1, length);
 8308     __ b(Process4B);
 8309 
 8310     __ BIND(Exit);
    __ sub(c_rarg0, dst, doff); // return the number of bytes written
 8312 
 8313     __ leave();
 8314     __ ret(lr);
 8315 
 8316     return start;
 8317   }
 8318 
 8319   // Support for spin waits.
 8320   address generate_spin_wait() {
 8321     __ align(CodeEntryAlignment);
 8322     StubGenStubId stub_id = StubGenStubId::spin_wait_id;
 8323     StubCodeMark mark(this, stub_id);
 8324     address start = __ pc();
 8325 
 8326     __ spin_wait();
 8327     __ ret(lr);
 8328 
 8329     return start;
 8330   }
 8331 
 8332   void generate_lookup_secondary_supers_table_stub() {
 8333     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
 8334     StubCodeMark mark(this, stub_id);
 8335 
 8336     const Register
 8337       r_super_klass  = r0,
 8338       r_array_base   = r1,
 8339       r_array_length = r2,
 8340       r_array_index  = r3,
 8341       r_sub_klass    = r4,
 8342       r_bitmap       = rscratch2,
 8343       result         = r5;
 8344     const FloatRegister
 8345       vtemp          = v0;
 8346 
 8347     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
 8348       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
 8349       Label L_success;
 8350       __ enter();
 8351       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
 8352                                              r_array_base, r_array_length, r_array_index,
 8353                                              vtemp, result, slot,
 8354                                              /*stub_is_near*/true);
 8355       __ leave();
 8356       __ ret(lr);
 8357     }
 8358   }
 8359 
 8360   // Slow path implementation for UseSecondarySupersTable.
 8361   address generate_lookup_secondary_supers_table_slow_path_stub() {
 8362     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
 8363     StubCodeMark mark(this, stub_id);
 8364 
 8365     address start = __ pc();
 8366     const Register
 8367       r_super_klass  = r0,        // argument
 8368       r_array_base   = r1,        // argument
 8369       temp1          = r2,        // temp
 8370       r_array_index  = r3,        // argument
 8371       r_bitmap       = rscratch2, // argument
 8372       result         = r5;        // argument
 8373 
 8374     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
 8375     __ ret(lr);
 8376 
 8377     return start;
 8378   }
 8379 
 8380 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 8381 
 8382   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
 8383   //
 8384   // If LSE is in use, generate LSE versions of all the stubs. The
 8385   // non-LSE versions are in atomic_aarch64.S.
 8386 
 8387   // class AtomicStubMark records the entry point of a stub and the
 8388   // stub pointer which will point to it. The stub pointer is set to
 8389   // the entry point when ~AtomicStubMark() is called, which must be
 8390   // after ICache::invalidate_range. This ensures safe publication of
 8391   // the generated code.
 8392   class AtomicStubMark {
 8393     address _entry_point;
 8394     aarch64_atomic_stub_t *_stub;
 8395     MacroAssembler *_masm;
 8396   public:
 8397     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
 8398       _masm = masm;
 8399       __ align(32);
 8400       _entry_point = __ pc();
 8401       _stub = stub;
 8402     }
 8403     ~AtomicStubMark() {
 8404       *_stub = (aarch64_atomic_stub_t)_entry_point;
 8405     }
 8406   };
 8407 
 8408   // NB: For memory_order_conservative we need a trailing membar after
 8409   // LSE atomic operations but not a leading membar.
 8410   //
 8411   // We don't need a leading membar because a clause in the Arm ARM
 8412   // says:
 8413   //
 8414   //   Barrier-ordered-before
 8415   //
 8416   //   Barrier instructions order prior Memory effects before subsequent
 8417   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
 8421   //   instruction with both Acquire and Release semantics.
 8422   //
 8423   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
 8424   // and Release semantics, therefore we don't need a leading
 8425   // barrier. However, there is no corresponding Barrier-ordered-after
 8426   // relationship, therefore we need a trailing membar to prevent a
 8427   // later store or load from being reordered with the store in an
 8428   // atomic instruction.
 8429   //
 8430   // This was checked by using the herd7 consistency model simulator
 8431   // (http://diy.inria.fr/) with this test case:
 8432   //
 8433   // AArch64 LseCas
 8434   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
 8435   // P0 | P1;
 8436   // LDR W4, [X2] | MOV W3, #0;
 8437   // DMB LD       | MOV W4, #1;
 8438   // LDR W3, [X1] | CASAL W3, W4, [X1];
 8439   //              | DMB ISH;
 8440   //              | STR W4, [X2];
 8441   // exists
 8442   // (0:X3=0 /\ 0:X4=1)
 8443   //
 8444   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
 8445   // with the store to x in P1. Without the DMB in P1 this may happen.
 8446   //
 8447   // At the time of writing we don't know of any AArch64 hardware that
 8448   // reorders stores in this way, but the Reference Manual permits it.
 8449 
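  // Each CAS stub implements, atomically and with the requested memory
  // ordering, approximately:
  //
  //   T cmpxchg(T *ptr, T compare_val, T exchange_val) {
  //     T prev = *ptr;
  //     if (prev == compare_val) *ptr = exchange_val;
  //     return prev;    // returned in r0
  //   }
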
 8450   void gen_cas_entry(Assembler::operand_size size,
 8451                      atomic_memory_order order) {
 8452     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
 8453       exchange_val = c_rarg2;
 8454     bool acquire, release;
 8455     switch (order) {
 8456       case memory_order_relaxed:
 8457         acquire = false;
 8458         release = false;
 8459         break;
 8460       case memory_order_release:
 8461         acquire = false;
 8462         release = true;
 8463         break;
 8464       default:
 8465         acquire = true;
 8466         release = true;
 8467         break;
 8468     }
 8469     __ mov(prev, compare_val);
 8470     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
 8471     if (order == memory_order_conservative) {
 8472       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8473     }
 8474     if (size == Assembler::xword) {
 8475       __ mov(r0, prev);
 8476     } else {
 8477       __ movw(r0, prev);
 8478     }
 8479     __ ret(lr);
 8480   }
 8481 
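  // Fetch-and-add: atomically adds incr to *addr and returns the previous
  // value in r0.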
 8482   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
 8483     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 8484     // If not relaxed, then default to conservative.  Relaxed is the only
 8485     // case we use enough to be worth specializing.
 8486     if (order == memory_order_relaxed) {
 8487       __ ldadd(size, incr, prev, addr);
 8488     } else {
 8489       __ ldaddal(size, incr, prev, addr);
 8490       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8491     }
 8492     if (size == Assembler::xword) {
 8493       __ mov(r0, prev);
 8494     } else {
 8495       __ movw(r0, prev);
 8496     }
 8497     __ ret(lr);
 8498   }
 8499 
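  // Atomic exchange: atomically stores incr to *addr and returns the previous
  // value in r0, with conservative ordering.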
 8500   void gen_swpal_entry(Assembler::operand_size size) {
 8501     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 8502     __ swpal(size, incr, prev, addr);
 8503     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8504     if (size == Assembler::xword) {
 8505       __ mov(r0, prev);
 8506     } else {
 8507       __ movw(r0, prev);
 8508     }
 8509     __ ret(lr);
 8510   }
 8511 
 8512   void generate_atomic_entry_points() {
 8513     if (! UseLSE) {
 8514       return;
 8515     }
 8516     __ align(CodeEntryAlignment);
 8517     StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id;
 8518     StubCodeMark mark(this, stub_id);
 8519     address first_entry = __ pc();
 8520 
 8521     // ADD, memory_order_conservative
 8522     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
 8523     gen_ldadd_entry(Assembler::word, memory_order_conservative);
 8524     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
 8525     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
 8526 
 8527     // ADD, memory_order_relaxed
 8528     AtomicStubMark mark_fetch_add_4_relaxed
 8529       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
 8530     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
 8531     AtomicStubMark mark_fetch_add_8_relaxed
 8532       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
 8533     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
 8534 
 8535     // XCHG, memory_order_conservative
 8536     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
 8537     gen_swpal_entry(Assembler::word);
 8538     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
 8539     gen_swpal_entry(Assembler::xword);
 8540 
 8541     // CAS, memory_order_conservative
 8542     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
 8543     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
 8544     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
 8545     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
 8546     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
 8547     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
 8548 
 8549     // CAS, memory_order_relaxed
 8550     AtomicStubMark mark_cmpxchg_1_relaxed
 8551       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
 8552     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
 8553     AtomicStubMark mark_cmpxchg_4_relaxed
 8554       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
 8555     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
 8556     AtomicStubMark mark_cmpxchg_8_relaxed
 8557       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
 8558     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
 8559 
 8560     AtomicStubMark mark_cmpxchg_4_release
 8561       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
 8562     gen_cas_entry(MacroAssembler::word, memory_order_release);
 8563     AtomicStubMark mark_cmpxchg_8_release
 8564       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
 8565     gen_cas_entry(MacroAssembler::xword, memory_order_release);
 8566 
 8567     AtomicStubMark mark_cmpxchg_4_seq_cst
 8568       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
 8569     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
 8570     AtomicStubMark mark_cmpxchg_8_seq_cst
 8571       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
 8572     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
 8573 
 8574     ICache::invalidate_range(first_entry, __ pc() - first_entry);
 8575   }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
 8577 
 8578   address generate_cont_thaw(Continuation::thaw_kind kind) {
 8579     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
 8580     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
 8581 
 8582     address start = __ pc();
 8583 
 8584     if (return_barrier) {
 8585       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
 8586       __ mov(sp, rscratch1);
 8587     }
 8588     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
 8589 
 8590     if (return_barrier) {
 8591       // preserve possible return value from a method returning to the return barrier
 8592       __ fmovd(rscratch1, v0);
 8593       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
 8594     }
 8595 
 8596     __ movw(c_rarg1, (return_barrier ? 1 : 0));
 8597     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
 8598     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
 8599 
 8600     if (return_barrier) {
 8601       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
 8602       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
 8603       __ fmovd(v0, rscratch1);
 8604     }
 8605     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
 8606 
 8607 
 8608     Label thaw_success;
 8609     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
 8610     __ cbnz(rscratch2, thaw_success);
 8611     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
 8612     __ br(rscratch1);
 8613     __ bind(thaw_success);
 8614 
 8615     // make room for the thawed frames
 8616     __ sub(rscratch1, sp, rscratch2);
 8617     __ andr(rscratch1, rscratch1, -16); // align
 8618     __ mov(sp, rscratch1);
 8619 
 8620     if (return_barrier) {
 8621       // save original return value -- again
 8622       __ fmovd(rscratch1, v0);
 8623       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
 8624     }
 8625 
 8626     // If we want, we can templatize thaw by kind, and have three different entries
 8627     __ movw(c_rarg1, (uint32_t)kind);
 8628 
 8629     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
 8630     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
 8631 
 8632     if (return_barrier) {
 8633       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
 8634       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
 8635       __ fmovd(v0, rscratch1);
 8636     } else {
 8637       __ mov(r0, zr); // return 0 (success) from doYield
 8638     }
 8639 
    // we're now on the yield frame (which is at an address above us b/c sp has been pushed down)
 8641     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
 8642     __ mov(rfp, sp);
 8643 
 8644     if (return_barrier_exception) {
 8645       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
 8646       __ authenticate_return_address(c_rarg1);
 8647       __ verify_oop(r0);
 8648       // save return value containing the exception oop in callee-saved R19
 8649       __ mov(r19, r0);
 8650 
 8651       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
 8652 
 8653       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
 8654       // __ reinitialize_ptrue();
 8655 
 8656       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
 8657 
 8658       __ mov(r1, r0); // the exception handler
 8659       __ mov(r0, r19); // restore return value containing the exception oop
 8660       __ verify_oop(r0);
 8661 
 8662       __ leave();
 8663       __ mov(r3, lr);
 8664       __ br(r1); // the exception handler
 8665     } else {
 8666       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
 8667       __ leave();
 8668       __ ret(lr);
 8669     }
 8670 
 8671     return start;
 8672   }
 8673 
 8674   address generate_cont_thaw() {
 8675     if (!Continuations::enabled()) return nullptr;
 8676 
 8677     StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
 8678     StubCodeMark mark(this, stub_id);
 8679     address start = __ pc();
 8680     generate_cont_thaw(Continuation::thaw_top);
 8681     return start;
 8682   }
 8683 
 8684   address generate_cont_returnBarrier() {
 8685     if (!Continuations::enabled()) return nullptr;
 8686 
 8687     // TODO: will probably need multiple return barriers depending on return type
 8688     StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
 8689     StubCodeMark mark(this, stub_id);
 8690     address start = __ pc();
 8691 
 8692     generate_cont_thaw(Continuation::thaw_return_barrier);
 8693 
 8694     return start;
 8695   }
 8696 
 8697   address generate_cont_returnBarrier_exception() {
 8698     if (!Continuations::enabled()) return nullptr;
 8699 
 8700     StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
 8701     StubCodeMark mark(this, stub_id);
 8702     address start = __ pc();
 8703 
 8704     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
 8705 
 8706     return start;
 8707   }
 8708 
 8709   address generate_cont_preempt_stub() {
 8710     if (!Continuations::enabled()) return nullptr;
 8711     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
 8712     StubCodeMark mark(this, stub_id);
 8713     address start = __ pc();
 8714 
 8715     __ reset_last_Java_frame(true);
 8716 
 8717     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
 8718     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
 8719     __ mov(sp, rscratch2);
 8720 
 8721     Label preemption_cancelled;
 8722     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
 8723     __ cbnz(rscratch1, preemption_cancelled);
 8724 
 8725     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
 8726     SharedRuntime::continuation_enter_cleanup(_masm);
 8727     __ leave();
 8728     __ ret(lr);
 8729 
 8730     // We acquired the monitor after freezing the frames so call thaw to continue execution.
 8731     __ bind(preemption_cancelled);
 8732     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
 8733     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
 8734     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
 8735     __ ldr(rscratch1, Address(rscratch1));
 8736     __ br(rscratch1);
 8737 
 8738     return start;
 8739   }
 8740 
 8741   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
 8742   // are represented as long[5], with BITS_PER_LIMB = 26.
 8743   // Pack five 26-bit limbs into three 64-bit registers.
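  // In C, approximately, with l[0..4] the five 26-bit limbs at src:
  //   dest0 = l[0] | (l[1] << 26) | (l[2] << 52);           // bits   0..63
  //   dest1 = (l[2] >> 12) | (l[3] << 14) | (l[4] << 40);   // bits  64..127
  //   dest2 = l[4] >> 24;                                   // bits 128..129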
 8744   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
 8745     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
 8746     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
 8747     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
 8748     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
 8749 
 8750     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
 8751     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
 8752     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
 8753     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
 8754 
 8755     if (dest2->is_valid()) {
 8756       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
 8757     } else {
 8758 #ifdef ASSERT
 8759       Label OK;
 8760       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
 8761       __ br(__ EQ, OK);
 8762       __ stop("high bits of Poly1305 integer should be zero");
 8763       __ should_not_reach_here();
 8764       __ bind(OK);
 8765 #endif
 8766     }
 8767   }
 8768 
 8769   // As above, but return only a 128-bit integer, packed into two
 8770   // 64-bit registers.
 8771   void pack_26(Register dest0, Register dest1, Register src) {
 8772     pack_26(dest0, dest1, noreg, src);
 8773   }
 8774 
 8775   // Multiply and multiply-accumulate unsigned 64-bit registers.
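  // In C, approximately:
  //   wide_mul:  (prod_hi:prod_lo)  = (unsigned __int128)n * m;
  //   wide_madd: (sum_hi:sum_lo)   += (unsigned __int128)n * m;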
 8776   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
 8777     __ mul(prod_lo, n, m);
 8778     __ umulh(prod_hi, n, m);
 8779   }
 8780   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
 8781     wide_mul(rscratch1, rscratch2, n, m);
 8782     __ adds(sum_lo, sum_lo, rscratch1);
 8783     __ adc(sum_hi, sum_hi, rscratch2);
 8784   }
 8785 
 8786   // Poly1305, RFC 7539
 8787 
 8788   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
 8789   // description of the tricks used to simplify and accelerate this
 8790   // computation.
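  //
  // In pseudocode, each 16-byte block m is absorbed into the accumulator as
  //
  //   acc = ((acc + m + 2^128) * r) mod (2^130 - 5)
  //
  // where r is the key multiplier passed in via r_start. The code below keeps
  // acc and r packed in 64-bit registers rather than the 26-bit limbs used by
  // the caller's representation.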
 8791 
 8792   address generate_poly1305_processBlocks() {
 8793     __ align(CodeEntryAlignment);
 8794     StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
 8795     StubCodeMark mark(this, stub_id);
 8796     address start = __ pc();
 8797     Label here;
 8798     __ enter();
 8799     RegSet callee_saved = RegSet::range(r19, r28);
 8800     __ push(callee_saved, sp);
 8801 
 8802     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
 8803 
 8804     // Arguments
 8805     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
 8806 
 8807     // R_n is the 128-bit randomly-generated key, packed into two
 8808     // registers.  The caller passes this key to us as long[5], with
 8809     // BITS_PER_LIMB = 26.
 8810     const Register R_0 = *++regs, R_1 = *++regs;
 8811     pack_26(R_0, R_1, r_start);
 8812 
 8813     // RR_n is (R_n >> 2) * 5
 8814     const Register RR_0 = *++regs, RR_1 = *++regs;
 8815     __ lsr(RR_0, R_0, 2);
 8816     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
 8817     __ lsr(RR_1, R_1, 2);
 8818     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
 8819 
 8820     // U_n is the current checksum
 8821     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
 8822     pack_26(U_0, U_1, U_2, acc_start);
 8823 
 8824     static constexpr int BLOCK_LENGTH = 16;
 8825     Label DONE, LOOP;
 8826 
 8827     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
 8828     __ br(Assembler::LT, DONE); {
 8829       __ bind(LOOP);
 8830 
 8831       // S_n is to be the sum of U_n and the next block of data
 8832       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
 8833       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
 8834       __ adds(S_0, U_0, S_0);
 8835       __ adcs(S_1, U_1, S_1);
 8836       __ adc(S_2, U_2, zr);
 8837       __ add(S_2, S_2, 1);
 8838 
 8839       const Register U_0HI = *++regs, U_1HI = *++regs;
 8840 
 8841       // NB: this logic depends on some of the special properties of
 8842       // Poly1305 keys. In particular, because we know that the top
 8843       // four bits of R_0 and R_1 are zero, we can add together
 8844       // partial products without any risk of needing to propagate a
 8845       // carry out.
 8846       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
 8847       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
 8848       __ andr(U_2, R_0, 3);
 8849       __ mul(U_2, S_2, U_2);
 8850 
 8851       // Recycle registers S_0, S_1, S_2
 8852       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
 8853 
 8854       // Partial reduction mod 2**130 - 5
 8855       __ adds(U_1, U_0HI, U_1);
 8856       __ adc(U_2, U_1HI, U_2);
 8857       // Sum now in U_2:U_1:U_0.
 8858       // Dead: U_0HI, U_1HI.
 8859       regs = (regs.remaining() + U_0HI + U_1HI).begin();
 8860 
 8861       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
 8862 
 8863       // First, U_2:U_1:U_0 += (U_2 >> 2)
 8864       __ lsr(rscratch1, U_2, 2);
 8865       __ andr(U_2, U_2, (u8)3);
 8866       __ adds(U_0, U_0, rscratch1);
 8867       __ adcs(U_1, U_1, zr);
 8868       __ adc(U_2, U_2, zr);
 8869       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
 8870       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
 8871       __ adcs(U_1, U_1, zr);
 8872       __ adc(U_2, U_2, zr);
 8873 
 8874       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
 8875       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
 8876       __ br(~ Assembler::LT, LOOP);
 8877     }
 8878 
 8879     // Further reduce modulo 2^130 - 5
 8880     __ lsr(rscratch1, U_2, 2);
 8881     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
 8882     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
 8883     __ adcs(U_1, U_1, zr);
 8884     __ andr(U_2, U_2, (u1)3);
 8885     __ adc(U_2, U_2, zr);
 8886 
 8887     // Unpack the sum into five 26-bit limbs and write to memory.
 8888     __ ubfiz(rscratch1, U_0, 0, 26);
 8889     __ ubfx(rscratch2, U_0, 26, 26);
 8890     __ stp(rscratch1, rscratch2, Address(acc_start));
 8891     __ ubfx(rscratch1, U_0, 52, 12);
 8892     __ bfi(rscratch1, U_1, 12, 14);
 8893     __ ubfx(rscratch2, U_1, 14, 26);
 8894     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
 8895     __ ubfx(rscratch1, U_1, 40, 24);
 8896     __ bfi(rscratch1, U_2, 24, 3);
 8897     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
 8898 
 8899     __ bind(DONE);
 8900     __ pop(callee_saved, sp);
 8901     __ leave();
 8902     __ ret(lr);
 8903 
 8904     return start;
 8905   }
 8906 
 8907   // exception handler for upcall stubs
 8908   address generate_upcall_stub_exception_handler() {
 8909     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
 8910     StubCodeMark mark(this, stub_id);
 8911     address start = __ pc();
 8912 
    // The native caller has no idea how to handle exceptions,
    // so we just crash here. It is up to the callee to catch exceptions.
 8915     __ verify_oop(r0);
 8916     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
 8917     __ blr(rscratch1);
 8918     __ should_not_reach_here();
 8919 
 8920     return start;
 8921   }
 8922 
 8923   // load Method* target of MethodHandle
 8924   // j_rarg0 = jobject receiver
 8925   // rmethod = result
 8926   address generate_upcall_stub_load_target() {
 8927     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
 8928     StubCodeMark mark(this, stub_id);
 8929     address start = __ pc();
 8930 
 8931     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
    // Load target method from receiver
 8933     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
 8934     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
 8935     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
 8936     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
 8937                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
 8938                       noreg, noreg);
 8939     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
 8940 
 8941     __ ret(lr);
 8942 
 8943     return start;
 8944   }
 8945 
 8946 #undef __
 8947 #define __ masm->
 8948 
 8949   class MontgomeryMultiplyGenerator : public MacroAssembler {
 8950 
 8951     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
 8952       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
 8953 
 8954     RegSet _toSave;
 8955     bool _squaring;
 8956 
 8957   public:
 8958     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
 8959       : MacroAssembler(as->code()), _squaring(squaring) {
 8960 
 8961       // Register allocation
 8962 
 8963       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
 8964       Pa_base = *regs;       // Argument registers
 8965       if (squaring)
 8966         Pb_base = Pa_base;
 8967       else
 8968         Pb_base = *++regs;
 8969       Pn_base = *++regs;
 8970       Rlen= *++regs;
 8971       inv = *++regs;
 8972       Pm_base = *++regs;
 8973 
 8974                           // Working registers:
 8975       Ra =  *++regs;        // The current digit of a, b, n, and m.
 8976       Rb =  *++regs;
 8977       Rm =  *++regs;
 8978       Rn =  *++regs;
 8979 
 8980       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
 8981       Pb =  *++regs;
 8982       Pm =  *++regs;
 8983       Pn =  *++regs;
 8984 
 8985       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
 8987       t2 =  *++regs;
 8988 
 8989       Ri =  *++regs;        // Inner and outer loop indexes.
 8990       Rj =  *++regs;
 8991 
 8992       Rhi_ab = *++regs;     // Product registers: low and high parts
 8993       Rlo_ab = *++regs;     // of a*b and m*n.
 8994       Rhi_mn = *++regs;
 8995       Rlo_mn = *++regs;
 8996 
 8997       // r19 and up are callee-saved.
 8998       _toSave = RegSet::range(r19, *regs) + Pm_base;
 8999     }
 9000 
 9001   private:
 9002     void save_regs() {
 9003       push(_toSave, sp);
 9004     }
 9005 
 9006     void restore_regs() {
 9007       pop(_toSave, sp);
 9008     }
 9009 
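    // Run `block` exactly `count` times (count may be zero), with the loop
    // body unrolled two times per iteration.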
 9010     template <typename T>
 9011     void unroll_2(Register count, T block) {
 9012       Label loop, end, odd;
 9013       tbnz(count, 0, odd);
 9014       cbz(count, end);
 9015       align(16);
 9016       bind(loop);
 9017       (this->*block)();
 9018       bind(odd);
 9019       (this->*block)();
 9020       subs(count, count, 2);
 9021       br(Assembler::GT, loop);
 9022       bind(end);
 9023     }
 9024 
 9025     template <typename T>
 9026     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
 9027       Label loop, end, odd;
 9028       tbnz(count, 0, odd);
 9029       cbz(count, end);
 9030       align(16);
 9031       bind(loop);
 9032       (this->*block)(d, s, tmp);
 9033       bind(odd);
 9034       (this->*block)(d, s, tmp);
 9035       subs(count, count, 2);
 9036       br(Assembler::GT, loop);
 9037       bind(end);
 9038     }
 9039 
 9040     void pre1(RegisterOrConstant i) {
 9041       block_comment("pre1");
 9042       // Pa = Pa_base;
 9043       // Pb = Pb_base + i;
 9044       // Pm = Pm_base;
 9045       // Pn = Pn_base + i;
 9046       // Ra = *Pa;
 9047       // Rb = *Pb;
 9048       // Rm = *Pm;
 9049       // Rn = *Pn;
 9050       ldr(Ra, Address(Pa_base));
 9051       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
 9052       ldr(Rm, Address(Pm_base));
 9053       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9054       lea(Pa, Address(Pa_base));
 9055       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
 9056       lea(Pm, Address(Pm_base));
 9057       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9058 
 9059       // Zero the m*n result.
 9060       mov(Rhi_mn, zr);
 9061       mov(Rlo_mn, zr);
 9062     }
 9063 
 9064     // The core multiply-accumulate step of a Montgomery
 9065     // multiplication.  The idea is to schedule operations as a
 9066     // pipeline so that instructions with long latencies (loads and
 9067     // multiplies) have time to complete before their results are
    // used.  This benefits in-order implementations of the architecture
    // most, but out-of-order ones also benefit.
 9070     void step() {
 9071       block_comment("step");
 9072       // MACC(Ra, Rb, t0, t1, t2);
 9073       // Ra = *++Pa;
 9074       // Rb = *--Pb;
 9075       umulh(Rhi_ab, Ra, Rb);
 9076       mul(Rlo_ab, Ra, Rb);
 9077       ldr(Ra, pre(Pa, wordSize));
 9078       ldr(Rb, pre(Pb, -wordSize));
 9079       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
 9080                                        // previous iteration.
 9081       // MACC(Rm, Rn, t0, t1, t2);
 9082       // Rm = *++Pm;
 9083       // Rn = *--Pn;
 9084       umulh(Rhi_mn, Rm, Rn);
 9085       mul(Rlo_mn, Rm, Rn);
 9086       ldr(Rm, pre(Pm, wordSize));
 9087       ldr(Rn, pre(Pn, -wordSize));
 9088       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9089     }
 9090 
 9091     void post1() {
 9092       block_comment("post1");
 9093 
 9094       // MACC(Ra, Rb, t0, t1, t2);
 9095       // Ra = *++Pa;
 9096       // Rb = *--Pb;
 9097       umulh(Rhi_ab, Ra, Rb);
 9098       mul(Rlo_ab, Ra, Rb);
 9099       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9100       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9101 
 9102       // *Pm = Rm = t0 * inv;
 9103       mul(Rm, t0, inv);
 9104       str(Rm, Address(Pm));
 9105 
 9106       // MACC(Rm, Rn, t0, t1, t2);
 9107       // t0 = t1; t1 = t2; t2 = 0;
 9108       umulh(Rhi_mn, Rm, Rn);
 9109 
 9110 #ifndef PRODUCT
 9111       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
 9112       {
 9113         mul(Rlo_mn, Rm, Rn);
 9114         add(Rlo_mn, t0, Rlo_mn);
 9115         Label ok;
 9116         cbz(Rlo_mn, ok); {
 9117           stop("broken Montgomery multiply");
 9118         } bind(ok);
 9119       }
 9120 #endif
 9121       // We have very carefully set things up so that
 9122       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
 9123       // the lower half of Rm * Rn because we know the result already:
 9124       // it must be -t0.  t0 + (-t0) must generate a carry iff
 9125       // t0 != 0.  So, rather than do a mul and an adds we just set
 9126       // the carry flag iff t0 is nonzero.
 9127       //
 9128       // mul(Rlo_mn, Rm, Rn);
 9129       // adds(zr, t0, Rlo_mn);
 9130       subs(zr, t0, 1); // Set carry iff t0 is nonzero
 9131       adcs(t0, t1, Rhi_mn);
 9132       adc(t1, t2, zr);
 9133       mov(t2, zr);
 9134     }
 9135 
 9136     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
 9137       block_comment("pre2");
 9138       // Pa = Pa_base + i-len;
 9139       // Pb = Pb_base + len;
 9140       // Pm = Pm_base + i-len;
 9141       // Pn = Pn_base + len;
 9142 
 9143       if (i.is_register()) {
 9144         sub(Rj, i.as_register(), len);
 9145       } else {
 9146         mov(Rj, i.as_constant());
 9147         sub(Rj, Rj, len);
 9148       }
 9149       // Rj == i-len
 9150 
 9151       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
 9152       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
 9153       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
 9154       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
 9155 
 9156       // Ra = *++Pa;
 9157       // Rb = *--Pb;
 9158       // Rm = *++Pm;
 9159       // Rn = *--Pn;
 9160       ldr(Ra, pre(Pa, wordSize));
 9161       ldr(Rb, pre(Pb, -wordSize));
 9162       ldr(Rm, pre(Pm, wordSize));
 9163       ldr(Rn, pre(Pn, -wordSize));
 9164 
 9165       mov(Rhi_mn, zr);
 9166       mov(Rlo_mn, zr);
 9167     }
 9168 
 9169     void post2(RegisterOrConstant i, RegisterOrConstant len) {
 9170       block_comment("post2");
 9171       if (i.is_constant()) {
 9172         mov(Rj, i.as_constant()-len.as_constant());
 9173       } else {
 9174         sub(Rj, i.as_register(), len);
 9175       }
 9176 
 9177       adds(t0, t0, Rlo_mn); // The pending m*n, low part
 9178 
 9179       // As soon as we know the least significant digit of our result,
 9180       // store it.
 9181       // Pm_base[i-len] = t0;
 9182       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
 9183 
 9184       // t0 = t1; t1 = t2; t2 = 0;
 9185       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
 9186       adc(t1, t2, zr);
 9187       mov(t2, zr);
 9188     }
 9189 
 9190     // A carry in t0 after Montgomery multiplication means that we
 9191     // should subtract multiples of n from our result in m.  We'll
 9192     // keep doing that until there is no carry.
 9193     void normalize(RegisterOrConstant len) {
 9194       block_comment("normalize");
 9195       // while (t0)
 9196       //   t0 = sub(Pm_base, Pn_base, t0, len);
 9197       Label loop, post, again;
 9198       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
 9199       cbz(t0, post); {
 9200         bind(again); {
 9201           mov(i, zr);
 9202           mov(cnt, len);
 9203           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9204           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9205           subs(zr, zr, zr); // set carry flag, i.e. no borrow
 9206           align(16);
 9207           bind(loop); {
 9208             sbcs(Rm, Rm, Rn);
 9209             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9210             add(i, i, 1);
 9211             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9212             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9213             sub(cnt, cnt, 1);
 9214           } cbnz(cnt, loop);
 9215           sbc(t0, t0, zr);
 9216         } cbnz(t0, again);
 9217       } bind(post);
 9218     }
 9219 
 9220     // Move memory at s to d, reversing words.
 9221     //    Increments d to end of copied memory
 9222     //    Destroys tmp1, tmp2
 9223     //    Preserves len
 9224     //    Leaves s pointing to the address which was in d at start
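    // In C, approximately (viewing s and d as arrays of 2*len jints, and
    // ignoring the pointer updates described above):
    //   for (int j = 0; j < 2*len; j++)
    //     d[j] = s[2*len - 1 - j];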
 9225     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
 9226       assert(tmp1->encoding() < r19->encoding(), "register corruption");
 9227       assert(tmp2->encoding() < r19->encoding(), "register corruption");
 9228 
 9229       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
 9230       mov(tmp1, len);
 9231       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
 9232       sub(s, d, len, ext::uxtw, LogBytesPerWord);
 9233     }
 9234     // where
 9235     void reverse1(Register d, Register s, Register tmp) {
 9236       ldr(tmp, pre(s, -wordSize));
 9237       ror(tmp, tmp, 32);
 9238       str(tmp, post(d, wordSize));
 9239     }
 9240 
 9241     void step_squaring() {
 9242       // An extra ACC
 9243       step();
 9244       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9245     }
 9246 
 9247     void last_squaring(RegisterOrConstant i) {
 9248       Label dont;
 9249       // if ((i & 1) == 0) {
 9250       tbnz(i.as_register(), 0, dont); {
 9251         // MACC(Ra, Rb, t0, t1, t2);
 9252         // Ra = *++Pa;
 9253         // Rb = *--Pb;
 9254         umulh(Rhi_ab, Ra, Rb);
 9255         mul(Rlo_ab, Ra, Rb);
 9256         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9257       } bind(dont);
 9258     }
 9259 
 9260     void extra_step_squaring() {
 9261       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9262 
 9263       // MACC(Rm, Rn, t0, t1, t2);
 9264       // Rm = *++Pm;
 9265       // Rn = *--Pn;
 9266       umulh(Rhi_mn, Rm, Rn);
 9267       mul(Rlo_mn, Rm, Rn);
 9268       ldr(Rm, pre(Pm, wordSize));
 9269       ldr(Rn, pre(Pn, -wordSize));
 9270     }
 9271 
 9272     void post1_squaring() {
 9273       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9274 
 9275       // *Pm = Rm = t0 * inv;
 9276       mul(Rm, t0, inv);
 9277       str(Rm, Address(Pm));
 9278 
 9279       // MACC(Rm, Rn, t0, t1, t2);
 9280       // t0 = t1; t1 = t2; t2 = 0;
 9281       umulh(Rhi_mn, Rm, Rn);
 9282 
 9283 #ifndef PRODUCT
 9284       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
 9285       {
 9286         mul(Rlo_mn, Rm, Rn);
 9287         add(Rlo_mn, t0, Rlo_mn);
 9288         Label ok;
 9289         cbz(Rlo_mn, ok); {
 9290           stop("broken Montgomery multiply");
 9291         } bind(ok);
 9292       }
 9293 #endif
 9294       // We have very carefully set things up so that
 9295       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
 9296       // the lower half of Rm * Rn because we know the result already:
 9297       // it must be -t0.  t0 + (-t0) must generate a carry iff
 9298       // t0 != 0.  So, rather than do a mul and an adds we just set
 9299       // the carry flag iff t0 is nonzero.
 9300       //
 9301       // mul(Rlo_mn, Rm, Rn);
 9302       // adds(zr, t0, Rlo_mn);
 9303       subs(zr, t0, 1); // Set carry iff t0 is nonzero
 9304       adcs(t0, t1, Rhi_mn);
 9305       adc(t1, t2, zr);
 9306       mov(t2, zr);
 9307     }
 9308 
 9309     void acc(Register Rhi, Register Rlo,
 9310              Register t0, Register t1, Register t2) {
 9311       adds(t0, t0, Rlo);
 9312       adcs(t1, t1, Rhi);
 9313       adc(t2, t2, zr);
 9314     }
 9315 
 9316   public:
 9317     /**
 9318      * Fast Montgomery multiplication.  The derivation of the
 9319      * algorithm is in A Cryptographic Library for the Motorola
 9320      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
 9321      *
 9322      * Arguments:
 9323      *
 9324      * Inputs for multiplication:
 9325      *   c_rarg0   - int array elements a
 9326      *   c_rarg1   - int array elements b
 9327      *   c_rarg2   - int array elements n (the modulus)
 9328      *   c_rarg3   - int length
 9329      *   c_rarg4   - int inv
 9330      *   c_rarg5   - int array elements m (the result)
 9331      *
 9332      * Inputs for squaring:
 9333      *   c_rarg0   - int array elements a
 9334      *   c_rarg1   - int array elements n (the modulus)
 9335      *   c_rarg2   - int length
 9336      *   c_rarg3   - int inv
 9337      *   c_rarg4   - int array elements m (the result)
 9338      *
 9339      */
 9340     address generate_multiply() {
 9341       Label argh, nothing;
 9342       bind(argh);
 9343       stop("MontgomeryMultiply total_allocation must be <= 8192");
 9344 
 9345       align(CodeEntryAlignment);
 9346       address entry = pc();
 9347 
 9348       cbzw(Rlen, nothing);
 9349 
 9350       enter();
 9351 
 9352       // Make room.
 9353       cmpw(Rlen, 512);
 9354       br(Assembler::HI, argh);
 9355       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
 9356       andr(sp, Ra, -2 * wordSize);
 9357 
 9358       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
 9359 
 9360       {
 9361         // Copy input args, reversing as we go.  We use Ra as a
 9362         // temporary variable.
 9363         reverse(Ra, Pa_base, Rlen, t0, t1);
 9364         if (!_squaring)
 9365           reverse(Ra, Pb_base, Rlen, t0, t1);
 9366         reverse(Ra, Pn_base, Rlen, t0, t1);
 9367       }
 9368 
 9369       // Push all call-saved registers and also Pm_base which we'll need
 9370       // at the end.
 9371       save_regs();
 9372 
 9373 #ifndef PRODUCT
 9374       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
 9375       {
 9376         ldr(Rn, Address(Pn_base, 0));
 9377         mul(Rlo_mn, Rn, inv);
 9378         subs(zr, Rlo_mn, -1);
 9379         Label ok;
 9380         br(EQ, ok); {
 9381           stop("broken inverse in Montgomery multiply");
 9382         } bind(ok);
 9383       }
 9384 #endif
 9385 
 9386       mov(Pm_base, Ra);
 9387 
 9388       mov(t0, zr);
 9389       mov(t1, zr);
 9390       mov(t2, zr);
 9391 
 9392       block_comment("for (int i = 0; i < len; i++) {");
 9393       mov(Ri, zr); {
 9394         Label loop, end;
 9395         cmpw(Ri, Rlen);
 9396         br(Assembler::GE, end);
 9397 
 9398         bind(loop);
 9399         pre1(Ri);
 9400 
 9401         block_comment("  for (j = i; j; j--) {"); {
 9402           movw(Rj, Ri);
 9403           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
 9404         } block_comment("  } // j");
 9405 
 9406         post1();
 9407         addw(Ri, Ri, 1);
 9408         cmpw(Ri, Rlen);
 9409         br(Assembler::LT, loop);
 9410         bind(end);
 9411         block_comment("} // i");
 9412       }
 9413 
 9414       block_comment("for (int i = len; i < 2*len; i++) {");
 9415       mov(Ri, Rlen); {
 9416         Label loop, end;
 9417         cmpw(Ri, Rlen, Assembler::LSL, 1);
 9418         br(Assembler::GE, end);
 9419 
 9420         bind(loop);
 9421         pre2(Ri, Rlen);
 9422 
 9423         block_comment("  for (j = len*2-i-1; j; j--) {"); {
 9424           lslw(Rj, Rlen, 1);
 9425           subw(Rj, Rj, Ri);
 9426           subw(Rj, Rj, 1);
 9427           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
 9428         } block_comment("  } // j");
 9429 
 9430         post2(Ri, Rlen);
 9431         addw(Ri, Ri, 1);
 9432         cmpw(Ri, Rlen, Assembler::LSL, 1);
 9433         br(Assembler::LT, loop);
 9434         bind(end);
 9435       }
 9436       block_comment("} // i");
 9437 
 9438       normalize(Rlen);
 9439 
 9440       mov(Ra, Pm_base);  // Save Pm_base in Ra
 9441       restore_regs();  // Restore caller's Pm_base
 9442 
 9443       // Copy our result into caller's Pm_base
 9444       reverse(Pm_base, Ra, Rlen, t0, t1);
 9445 
 9446       leave();
 9447       bind(nothing);
 9448       ret(lr);
 9449 
 9450       return entry;
 9451     }
 9452     // In C, approximately:
 9453 
 9454     // void
 9455     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
 9456     //                     julong Pn_base[], julong Pm_base[],
 9457     //                     julong inv, int len) {
 9458     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
 9459     //   julong *Pa, *Pb, *Pn, *Pm;
 9460     //   julong Ra, Rb, Rn, Rm;
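    //   // MACC(A, B, t0, t1, t2) adds the 128-bit product A*B into the
    //   // triple-precision accumulator t2:t1:t0 (cf. acc() above).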
 9461 
 9462     //   int i;
 9463 
 9464     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
 9465 
 9466     //   for (i = 0; i < len; i++) {
 9467     //     int j;
 9468 
 9469     //     Pa = Pa_base;
 9470     //     Pb = Pb_base + i;
 9471     //     Pm = Pm_base;
 9472     //     Pn = Pn_base + i;
 9473 
 9474     //     Ra = *Pa;
 9475     //     Rb = *Pb;
 9476     //     Rm = *Pm;
 9477     //     Rn = *Pn;
 9478 
 9479     //     int iters = i;
 9480     //     for (j = 0; iters--; j++) {
 9481     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
 9482     //       MACC(Ra, Rb, t0, t1, t2);
 9483     //       Ra = *++Pa;
 9484     //       Rb = *--Pb;
 9485     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9486     //       MACC(Rm, Rn, t0, t1, t2);
 9487     //       Rm = *++Pm;
 9488     //       Rn = *--Pn;
 9489     //     }
 9490 
 9491     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
 9492     //     MACC(Ra, Rb, t0, t1, t2);
 9493     //     *Pm = Rm = t0 * inv;
 9494     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
 9495     //     MACC(Rm, Rn, t0, t1, t2);
 9496 
 9497     //     assert(t0 == 0, "broken Montgomery multiply");
 9498 
 9499     //     t0 = t1; t1 = t2; t2 = 0;
 9500     //   }
 9501 
 9502     //   for (i = len; i < 2*len; i++) {
 9503     //     int j;
 9504 
 9505     //     Pa = Pa_base + i-len;
 9506     //     Pb = Pb_base + len;
 9507     //     Pm = Pm_base + i-len;
 9508     //     Pn = Pn_base + len;
 9509 
 9510     //     Ra = *++Pa;
 9511     //     Rb = *--Pb;
 9512     //     Rm = *++Pm;
 9513     //     Rn = *--Pn;
 9514 
 9515     //     int iters = len*2-i-1;
 9516     //     for (j = i-len+1; iters--; j++) {
 9517     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
 9518     //       MACC(Ra, Rb, t0, t1, t2);
 9519     //       Ra = *++Pa;
 9520     //       Rb = *--Pb;
 9521     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9522     //       MACC(Rm, Rn, t0, t1, t2);
 9523     //       Rm = *++Pm;
 9524     //       Rn = *--Pn;
 9525     //     }
 9526 
 9527     //     Pm_base[i-len] = t0;
 9528     //     t0 = t1; t1 = t2; t2 = 0;
 9529     //   }
 9530 
 9531     //   while (t0)
 9532     //     t0 = sub(Pm_base, Pn_base, t0, len);
 9533     // }
 9534 
 9535     /**
 9536      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
 9537      * multiplies than Montgomery multiplication so it should be up to
 9538      * 25% faster.  However, its loop control is more complex and it
 9539      * may actually run slower on some machines.
 9540      *
 9541      * Arguments:
 9542      *
 9543      * Inputs:
 9544      *   c_rarg0   - int array elements a
 9545      *   c_rarg1   - int array elements n (the modulus)
 9546      *   c_rarg2   - int length
 9547      *   c_rarg3   - int inv
 9548      *   c_rarg4   - int array elements m (the result)
 9549      *
 9550      */
 9551     address generate_square() {
 9552       Label argh;
 9553       bind(argh);
 9554       stop("MontgomeryMultiply total_allocation must be <= 8192");
 9555 
 9556       align(CodeEntryAlignment);
 9557       address entry = pc();
 9558 
 9559       enter();
 9560 
 9561       // Make room.
 9562       cmpw(Rlen, 512);
 9563       br(Assembler::HI, argh);
 9564       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
 9565       andr(sp, Ra, -2 * wordSize);
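      // We have just reserved Rlen * 4 * sizeof(jint) bytes of scratch space
      // below sp and re-aligned sp to a 16-byte boundary; the cmpw above caps
      // Rlen at 512, so the allocation is at most 8192 bytes.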
 9566 
 9567       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
 9568 
 9569       {
 9570         // Copy input args, reversing as we go.  We use Ra as a
 9571         // temporary variable.
 9572         reverse(Ra, Pa_base, Rlen, t0, t1);
 9573         reverse(Ra, Pn_base, Rlen, t0, t1);
 9574       }
 9575 
 9576       // Push all call-saved registers and also Pm_base which we'll need
 9577       // at the end.
 9578       save_regs();
 9579 
 9580       mov(Pm_base, Ra);
 9581 
 9582       mov(t0, zr);
 9583       mov(t1, zr);
 9584       mov(t2, zr);
 9585 
 9586       block_comment("for (int i = 0; i < len; i++) {");
 9587       mov(Ri, zr); {
 9588         Label loop, end;
 9589         bind(loop);
 9590         cmp(Ri, Rlen);
 9591         br(Assembler::GE, end);
 9592 
 9593         pre1(Ri);
 9594 
 9595         block_comment("for (j = (i+1)/2; j; j--) {"); {
 9596           add(Rj, Ri, 1);
 9597           lsr(Rj, Rj, 1);
 9598           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
 9599         } block_comment("  } // j");
 9600 
 9601         last_squaring(Ri);
 9602 
 9603         block_comment("  for (j = i/2; j; j--) {"); {
 9604           lsr(Rj, Ri, 1);
 9605           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
 9606         } block_comment("  } // j");
 9607 
 9608         post1_squaring();
 9609         add(Ri, Ri, 1);
 9610         cmp(Ri, Rlen);
 9611         br(Assembler::LT, loop);
 9612 
 9613         bind(end);
 9614         block_comment("} // i");
 9615       }
 9616 
 9617       block_comment("for (int i = len; i < 2*len; i++) {");
 9618       mov(Ri, Rlen); {
 9619         Label loop, end;
 9620         bind(loop);
 9621         cmp(Ri, Rlen, Assembler::LSL, 1);
 9622         br(Assembler::GE, end);
 9623 
 9624         pre2(Ri, Rlen);
 9625 
 9626         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
 9627           lsl(Rj, Rlen, 1);
 9628           sub(Rj, Rj, Ri);
 9629           sub(Rj, Rj, 1);
 9630           lsr(Rj, Rj, 1);
 9631           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
 9632         } block_comment("  } // j");
 9633 
 9634         last_squaring(Ri);
 9635 
 9636         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
 9637           lsl(Rj, Rlen, 1);
 9638           sub(Rj, Rj, Ri);
 9639           lsr(Rj, Rj, 1);
 9640           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
 9641         } block_comment("  } // j");
 9642 
 9643         post2(Ri, Rlen);
 9644         add(Ri, Ri, 1);
 9645         cmp(Ri, Rlen, Assembler::LSL, 1);
 9646 
 9647         br(Assembler::LT, loop);
 9648         bind(end);
 9649         block_comment("} // i");
 9650       }
 9651 
 9652       normalize(Rlen);
 9653 
 9654       mov(Ra, Pm_base);  // Save Pm_base in Ra
 9655       restore_regs();  // Restore caller's Pm_base
 9656 
 9657       // Copy our result into caller's Pm_base
 9658       reverse(Pm_base, Ra, Rlen, t0, t1);
 9659 
 9660       leave();
 9661       ret(lr);
 9662 
 9663       return entry;
 9664     }
 9665     // In C, approximately:
 9666 
 9667     // void
 9668     // montgomery_square(julong Pa_base[], julong Pn_base[],
 9669     //                   julong Pm_base[], julong inv, int len) {
 9670     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
 9671     //   julong *Pa, *Pb, *Pn, *Pm;
 9672     //   julong Ra, Rb, Rn, Rm;
 9673 
 9674     //   int i;
 9675 
 9676     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
 9677 
 9678     //   for (i = 0; i < len; i++) {
 9679     //     int j;
 9680 
 9681     //     Pa = Pa_base;
 9682     //     Pb = Pa_base + i;
 9683     //     Pm = Pm_base;
 9684     //     Pn = Pn_base + i;
 9685 
 9686     //     Ra = *Pa;
 9687     //     Rb = *Pb;
 9688     //     Rm = *Pm;
 9689     //     Rn = *Pn;
 9690 
 9691     //     int iters = (i+1)/2;
 9692     //     for (j = 0; iters--; j++) {
 9693     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
 9694     //       MACC2(Ra, Rb, t0, t1, t2);
 9695     //       Ra = *++Pa;
 9696     //       Rb = *--Pb;
 9697     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9698     //       MACC(Rm, Rn, t0, t1, t2);
 9699     //       Rm = *++Pm;
 9700     //       Rn = *--Pn;
 9701     //     }
 9702     //     if ((i & 1) == 0) {
 9703     //       assert(Ra == Pa_base[j], "must be");
 9704     //       MACC(Ra, Ra, t0, t1, t2);
 9705     //     }
 9706     //     iters = i/2;
 9707     //     assert(iters == i-j, "must be");
 9708     //     for (; iters--; j++) {
 9709     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9710     //       MACC(Rm, Rn, t0, t1, t2);
 9711     //       Rm = *++Pm;
 9712     //       Rn = *--Pn;
 9713     //     }
 9714 
 9715     //     *Pm = Rm = t0 * inv;
 9716     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
 9717     //     MACC(Rm, Rn, t0, t1, t2);
 9718 
 9719     //     assert(t0 == 0, "broken Montgomery multiply");
 9720 
 9721     //     t0 = t1; t1 = t2; t2 = 0;
 9722     //   }
 9723 
 9724     //   for (i = len; i < 2*len; i++) {
 9725     //     int start = i-len+1;
 9726     //     int end = start + (len - start)/2;
 9727     //     int j;
 9728 
 9729     //     Pa = Pa_base + i-len;
 9730     //     Pb = Pa_base + len;
 9731     //     Pm = Pm_base + i-len;
 9732     //     Pn = Pn_base + len;
 9733 
 9734     //     Ra = *++Pa;
 9735     //     Rb = *--Pb;
 9736     //     Rm = *++Pm;
 9737     //     Rn = *--Pn;
 9738 
 9739     //     int iters = (2*len-i-1)/2;
 9740     //     assert(iters == end-start, "must be");
 9741     //     for (j = start; iters--; j++) {
 9742     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
 9743     //       MACC2(Ra, Rb, t0, t1, t2);
 9744     //       Ra = *++Pa;
 9745     //       Rb = *--Pb;
 9746     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9747     //       MACC(Rm, Rn, t0, t1, t2);
 9748     //       Rm = *++Pm;
 9749     //       Rn = *--Pn;
 9750     //     }
 9751     //     if ((i & 1) == 0) {
 9752     //       assert(Ra == Pa_base[j], "must be");
 9753     //       MACC(Ra, Ra, t0, t1, t2);
 9754     //     }
 9755     //     iters =  (2*len-i)/2;
 9756     //     assert(iters == len-j, "must be");
 9757     //     for (; iters--; j++) {
 9758     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9759     //       MACC(Rm, Rn, t0, t1, t2);
 9760     //       Rm = *++Pm;
 9761     //       Rn = *--Pn;
 9762     //     }
 9763     //     Pm_base[i-len] = t0;
 9764     //     t0 = t1; t1 = t2; t2 = 0;
 9765     //   }
 9766 
 9767     //   while (t0)
 9768     //     t0 = sub(Pm_base, Pn_base, t0, len);
 9769     // }
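
    // MACC2(A, B, t0, t1, t2) in the pseudocode above accumulates the doubled
    // product 2*A*B; it covers the off-diagonal terms of the square, which is
    // where the asymptotic 25% saving in multiplies comes from. Semantically it
    // is equivalent to applying MACC twice (again an illustrative sketch only):
    //
    //   #define MACC2(A, B, T0, T1, T2) do { \
    //     MACC(A, B, T0, T1, T2);            \
    //     MACC(A, B, T0, T1, T2);            \
    //   } while (0)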
 9770   };
 9771 
 9772   void generate_vector_math_stubs() {
 9773     // Get native vector math stub routine addresses
 9774     void* libsleef = nullptr;
 9775     char ebuf[1024];
 9776     char dll_name[JVM_MAXPATHLEN];
 9777     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
 9778       libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
 9779     }
 9780     if (libsleef == nullptr) {
 9781       log_info(library)("Failed to load native vector math library, %s!", ebuf);
 9782       return;
 9783     }
 9784     // Method naming convention
 9785     //   All the methods are named as <OP><T><N>_<U><suffix>
 9786     //   Where:
 9787     //     <OP>     is the operation name, e.g. sin
    //     <T>      is optional and indicates the element type:
    //              "f"/"d" for a vector float/double operation
 9790     //     <N>      is the number of elements in the vector
 9791     //              "2/4" for neon, and "x" for sve
 9792     //     <U>      is the precision level
    //              "u10"/"u05" represent 1.0/0.5 ULP error bounds
    //               We use "u10" for all operations by default,
    //               but for functions that do not have u10 support we use "u05" instead
 9796     //     <suffix> indicates neon/sve
 9797     //              "sve/advsimd" for sve/neon implementations
 9798     //     e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions
    //          cosd2_u10advsimd is the method for computing a 2-element vector double cos using NEON instructions
 9800     //
 9801     log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
 9802 
 9803     // Math vector stubs implemented with SVE for scalable vector size.
 9804     if (UseSVE > 0) {
 9805       for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
 9806         int vop = VectorSupport::VECTOR_OP_MATH_START + op;
        // Skip "tanh" because of a performance regression
 9808         if (vop == VectorSupport::VECTOR_OP_TANH) {
 9809           continue;
 9810         }
 9811 
 9812         // The native library does not support u10 level of "hypot".
 9813         const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
 9814 
 9815         snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
 9816         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
 9817 
 9818         snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
 9819         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
 9820       }
 9821     }
 9822 
    // Math vector stubs implemented with NEON for 64/128-bit vector sizes.
 9824     for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
 9825       int vop = VectorSupport::VECTOR_OP_MATH_START + op;
      // Skip "tanh" because of a performance regression
 9827       if (vop == VectorSupport::VECTOR_OP_TANH) {
 9828         continue;
 9829       }
 9830 
 9831       // The native library does not support u10 level of "hypot".
 9832       const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
 9833 
      // The same 4-element float routine serves both the 64-bit and 128-bit
      // float vector slots.
      snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
      StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
      StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
 9839 
 9840       snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
 9841       StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
 9842     }
 9843   }
 9844 
 9845   // Initialization
 9846   void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points
 9848 
    // Entry points that exist on all platforms. Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
 9854 
 9855     StubRoutines::_forward_exception_entry = generate_forward_exception();
 9856 
 9857     StubRoutines::_call_stub_entry =
 9858       generate_call_stub(StubRoutines::_call_stub_return_address);
 9859 
    // This entry is referenced by megamorphic calls.
 9861     StubRoutines::_catch_exception_entry = generate_catch_exception();
 9862 
 9863     // Initialize table for copy memory (arraycopy) check.
 9864     if (UnsafeMemoryAccess::_table == nullptr) {
 9865       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
 9866     }
 9867 
 9868     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
 9870       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
 9871       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
 9872     }
 9873 
 9874     if (UseCRC32CIntrinsics) {
 9875       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
 9876     }
 9877 
 9878     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
 9879       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
 9880     }
 9881 
 9882     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
 9883       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
 9884     }
 9885 
 9886     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
 9887         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
 9888       StubRoutines::_hf2f = generate_float16ToFloat();
 9889       StubRoutines::_f2hf = generate_floatToFloat16();
 9890     }
 9891   }
 9892 
 9893   void generate_continuation_stubs() {
 9894     // Continuation stubs:
 9895     StubRoutines::_cont_thaw          = generate_cont_thaw();
 9896     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
 9897     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
 9898     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
 9899   }
 9900 
 9901   void generate_final_stubs() {
 9902     // support for verify_oop (must happen after universe_init)
 9903     if (VerifyOops) {
 9904       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
 9905     }
 9906 
 9907     // arraycopy stubs used by compilers
 9908     generate_arraycopy_stubs();
 9909 
 9910     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
 9911 
 9912     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
 9913 
 9914     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
 9915     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
 9916 
 9917 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 9918 
 9919     generate_atomic_entry_points();
 9920 
#endif // LINUX && !__ARM_FEATURE_ATOMICS
 9922 
 9923 #ifdef COMPILER2
 9924     if (UseSecondarySupersTable) {
 9925       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
 9926       if (! InlineSecondarySupersTest) {
 9927         generate_lookup_secondary_supers_table_stub();
 9928       }
 9929     }
 9930 #endif
 9931 
    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
 9933   }
 9934 
 9935   void generate_compiler_stubs() {
 9936 #if COMPILER2_OR_JVMCI
 9937 
 9938     if (UseSVE == 0) {
 9939       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
 9940     }
 9941 
 9942     // array equals stub for large arrays.
 9943     if (!UseSimpleArrayEquals) {
 9944       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
 9945     }
 9946 
    // arrays_hashcode stubs for large arrays.
 9948     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
 9949     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
 9950     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
 9951     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
 9952     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
 9953 
 9954     // byte_array_inflate stub for large arrays.
 9955     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
 9956 
 9957     // countPositives stub for large arrays.
 9958     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
 9959 
 9960     generate_compare_long_strings();
 9961 
 9962     generate_string_indexof_stubs();
 9963 
 9964 #ifdef COMPILER2
 9965     if (UseMultiplyToLenIntrinsic) {
 9966       StubRoutines::_multiplyToLen = generate_multiplyToLen();
 9967     }
 9968 
 9969     if (UseSquareToLenIntrinsic) {
 9970       StubRoutines::_squareToLen = generate_squareToLen();
 9971     }
 9972 
 9973     if (UseMulAddIntrinsic) {
 9974       StubRoutines::_mulAdd = generate_mulAdd();
 9975     }
 9976 
 9977     if (UseSIMDForBigIntegerShiftIntrinsics) {
 9978       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
 9979       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
 9980     }
 9981 
 9982     if (UseMontgomeryMultiplyIntrinsic) {
 9983       StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
 9984       StubCodeMark mark(this, stub_id);
 9985       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
 9986       StubRoutines::_montgomeryMultiply = g.generate_multiply();
 9987     }
 9988 
 9989     if (UseMontgomerySquareIntrinsic) {
 9990       StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
 9991       StubCodeMark mark(this, stub_id);
 9992       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
 9993       // We use generate_multiply() rather than generate_square()
 9994       // because it's faster for the sizes of modulus we care about.
 9995       StubRoutines::_montgomerySquare = g.generate_multiply();
 9996     }
 9997 
 9998     generate_vector_math_stubs();
 9999 
10000 #endif // COMPILER2
10001 
10002     if (UseChaCha20Intrinsics) {
10003       StubRoutines::_chacha20Block = generate_chacha20Block_qrpar();
10004     }
10005 
10006     if (UseDilithiumIntrinsics) {
10007       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
10008       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
10009       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
10010       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
10011       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
10012     }
10013 
10014     if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
10017     }
10018 
10019     // data cache line writeback
10020     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
10021     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
10022 
10023     if (UseAESIntrinsics) {
10024       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
10025       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
10026       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
10027       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
10028       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
10029     }
10030     if (UseGHASHIntrinsics) {
10031       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
10032       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
10033     }
10034     if (UseAESIntrinsics && UseGHASHIntrinsics) {
10035       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
10036     }
10037 
10038     if (UseMD5Intrinsics) {
10039       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
10040       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
10041     }
10042     if (UseSHA1Intrinsics) {
10043       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
10044       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
10045     }
10046     if (UseSHA256Intrinsics) {
10047       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
10048       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
10049     }
10050     if (UseSHA512Intrinsics) {
10051       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
10052       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
10053     }
10054     if (UseSHA3Intrinsics) {
10055       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
10056       StubRoutines::_double_keccak         = generate_double_keccak();
10057       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
10058     }
10059 
10060     if (UsePoly1305Intrinsics) {
10061       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
10062     }
10063 
10064     // generate Adler32 intrinsics code
10065     if (UseAdler32Intrinsics) {
10066       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
10067     }
10068 
10069 #endif // COMPILER2_OR_JVMCI
10070   }
10071 
10072  public:
10073   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
10074     switch(blob_id) {
10075     case initial_id:
10076       generate_initial_stubs();
10077       break;
    case continuation_id:
10079       generate_continuation_stubs();
10080       break;
10081     case compiler_id:
10082       generate_compiler_stubs();
10083       break;
10084     case final_id:
10085       generate_final_stubs();
10086       break;
10087     default:
10088       fatal("unexpected blob id: %d", blob_id);
10089       break;
10090     };
10091   }
10092 }; // end class declaration
10093 
10094 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
10095   StubGenerator g(code, blob_id);
10096 }
10097 
10098 
10099 #if defined (LINUX)
10100 
10101 // Define pointers to atomic stubs and initialize them to point to the
10102 // code in atomic_aarch64.S.
10103 
10104 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
10105   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
10106     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
10107   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
10108     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
10109 
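// As an illustration, DEFAULT_ATOMIC_OP(fetch_add, 4, ) below expands to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// that is, a declaration of the out-of-line default implementation and a
// function pointer, initially bound to it, which generate_atomic_entry_points()
// can later repoint at a generated stub.
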
10110 DEFAULT_ATOMIC_OP(fetch_add, 4, )
10111 DEFAULT_ATOMIC_OP(fetch_add, 8, )
10112 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
10113 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
10114 DEFAULT_ATOMIC_OP(xchg, 4, )
10115 DEFAULT_ATOMIC_OP(xchg, 8, )
10116 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
10117 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
10118 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
10119 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
10120 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
10121 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
10122 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
10123 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
10124 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
10125 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
10126 
10127 #undef DEFAULT_ATOMIC_OP
10128 
10129 #endif // LINUX