1 /*
    2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomicAccess.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Constant data definitions
   83 
   84 static const uint32_t _sha256_round_consts[64] = {
   85   0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
   86   0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
   87   0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
   88   0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
   89   0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
   90   0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
   91   0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
   92   0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
   93   0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
   94   0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
   95   0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
   96   0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
   97   0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
   98   0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
   99   0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  100   0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
  101 };
  102 
  103 static const uint64_t _sha512_round_consts[80] = {
  104   0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
  105   0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
  106   0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
  107   0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
  108   0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
  109   0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
  110   0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
  111   0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
  112   0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
  113   0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
  114   0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
  115   0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
  116   0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
  117   0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
  118   0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
  119   0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
  120   0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
  121   0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
  122   0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
  123   0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
  124   0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
  125   0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
  126   0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
  127   0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
  128   0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
  129   0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
  130   0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
  131 };
  132 
  133 static const uint64_t _sha3_round_consts[24] = {
  134   0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
  135   0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
  136   0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
  137   0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
  138   0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
  139   0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
  140   0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
  141   0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
  142 };
  143 
  144 static const uint64_t _double_keccak_round_consts[24] = {
  145   0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
  146   0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
  147   0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
  148   0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
  149   0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
  150   0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
  151   0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
  152   0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
  153 };
  154 
  155 static const char _encodeBlock_toBase64[64] = {
  156   'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  157   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  158   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  159   'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  160   '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
  161 };
  162 
  163 static const char _encodeBlock_toBase64URL[64] = {
  164   'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  165   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  166   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  167   'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  168   '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
  169 };
  170 
// Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
// java.util.Base64, except that the trailing character '=' is also treated as
// an illegal value in this intrinsic. That is, java.util.Base64.fromBase64['=']
// = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
  174 static const uint8_t _decodeBlock_fromBase64ForNoSIMD[256] = {
  175   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  176   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  177   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
  178   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  179   255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
  180   15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
  181   255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
  182   41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
  183   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  184   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  185   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  186   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  187   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  188   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  189   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  190   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  191 };
  192 
  193 static const uint8_t _decodeBlock_fromBase64URLForNoSIMD[256] = {
  194   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  195   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  196   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
  197   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  198   255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
  199   15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
  200   255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
  201   41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
  202   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  203   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  204   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  205   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  206   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  207   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  208   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  209   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  210 };
  211 
// A legal value of base64 code is in range [0, 127].  We need two lookups
// with tbl/tbx and combine them to get the decoded data. The 1st table vector
// lookup uses tbl; out of range indices are set to 0 in the destination. The
// 2nd table vector lookup uses tbx; out of range indices are left unchanged in
// the destination. Input [64..126] is mapped to index [65, 127] in the second
// lookup. The value at index 64 is set to 0, so that we know that we already
// got the decoded data with the 1st lookup.
  219 static const uint8_t _decodeBlock_fromBase64ForSIMD[128] = {
  220   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  221   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  222   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
  223   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  224   0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
  225   14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
  226   255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
  227   40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
  228 };
  229 
  230 static const uint8_t _decodeBlock_fromBase64URLForSIMD[128] = {
  231   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  232   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  233   255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
  234   52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
  235   0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
  236   14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
  237   63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
  238   40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
  239 };
  240 
  241 
  242 // Stub Code definitions
  243 
  244 class StubGenerator: public StubCodeGenerator {
  245  private:
  246 
  247 #ifdef PRODUCT
  248 #define inc_counter_np(counter) ((void)0)
  249 #else
  250   void inc_counter_np_(uint& counter) {
  251     __ incrementw(ExternalAddress((address)&counter));
  252   }
  253 #define inc_counter_np(counter) \
  254   BLOCK_COMMENT("inc_counter " #counter); \
  255   inc_counter_np_(counter);
  256 #endif
  257 
  258   // Call stubs are used to call Java from C
  259   //
  260   // Arguments:
  261   //    c_rarg0:   call wrapper address                   address
  262   //    c_rarg1:   result                                 address
  263   //    c_rarg2:   result type                            BasicType
  264   //    c_rarg3:   method                                 Method*
  265   //    c_rarg4:   (interpreter) entry point              address
  266   //    c_rarg5:   parameters                             intptr_t*
  267   //    c_rarg6:   parameter size (in words)              int
  268   //    c_rarg7:   thread                                 Thread*
  269   //
  270   // There is no return from the stub itself as any Java result
  271   // is written to result
  272   //
  273   // we save r30 (lr) as the return PC at the base of the frame and
  274   // link r29 (fp) below it as the frame pointer installing sp (r31)
  275   // into fp.
  276   //
  277   // we save r0-r7, which accounts for all the c arguments.
  278   //
  279   // TODO: strictly do we need to save them all? they are treated as
  280   // volatile by C so could we omit saving the ones we are going to
  281   // place in global registers (thread? method?) or those we only use
  282   // during setup of the Java call?
  283   //
  284   // we don't need to save r8 which C uses as an indirect result location
  285   // return register.
  286   //
  287   // we don't need to save r9-r15 which both C and Java treat as
  288   // volatile
  289   //
  290   // we don't need to save r16-18 because Java does not use them
  291   //
  292   // we save r19-r28 which Java uses as scratch registers and C
  293   // expects to be callee-save
  294   //
  295   // we save the bottom 64 bits of each value stored in v8-v15; it is
  296   // the responsibility of the caller to preserve larger values.
  297   //
  298   // so the stub frame looks like this when we enter Java code
  299   //
  300   //     [ return_from_Java     ] <--- sp
  301   //     [ argument word n      ]
  302   //      ...
  303   // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ] <--- sp_after_call
  // -27 [ (unused)             ]
  // -26 [ saved v15            ]
  306   // -25 [ saved v14            ]
  307   // -24 [ saved v13            ]
  308   // -23 [ saved v12            ]
  309   // -22 [ saved v11            ]
  310   // -21 [ saved v10            ]
  311   // -20 [ saved v9             ]
  312   // -19 [ saved v8             ]
  313   // -18 [ saved r28            ]
  314   // -17 [ saved r27            ]
  315   // -16 [ saved r26            ]
  316   // -15 [ saved r25            ]
  317   // -14 [ saved r24            ]
  318   // -13 [ saved r23            ]
  319   // -12 [ saved r22            ]
  320   // -11 [ saved r21            ]
  321   // -10 [ saved r20            ]
  322   //  -9 [ saved r19            ]
  323   //  -8 [ call wrapper    (r0) ]
  324   //  -7 [ result          (r1) ]
  325   //  -6 [ result type     (r2) ]
  326   //  -5 [ method          (r3) ]
  327   //  -4 [ entry point     (r4) ]
  328   //  -3 [ parameters      (r5) ]
  329   //  -2 [ parameter size  (r6) ]
  330   //  -1 [ thread (r7)          ]
  331   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  332   //   1 [ saved lr       (r30) ]
  333 
  // Word offsets, relative to fp, of the slots in the call stub frame.
  // Registers are saved in pairs by generate_call_stub(), so only the
  // lower-addressed slot of each pair has a name here.
  enum call_stub_layout {
    // lowest slot of the save area; sp is moved here on entry
    sp_after_call_off  = -28,
    fpcr_off           = sp_after_call_off,  // saved FPCR

    // v8-v15, two per entry
    d15_off            = -26,                // v15 (v14 at -25)
    d13_off            = -24,                // v13 (v12 at -23)
    d11_off            = -22,                // v11 (v10 at -21)
    d9_off             = -20,                // v9  (v8  at -19)

    // r19-r28, two per entry
    r28_off            = -18,                // r28 (r27 at -17)
    r26_off            = -16,                // r26 (r25 at -15)
    r24_off            = -14,                // r24 (r23 at -13)
    r22_off            = -12,                // r22 (r21 at -11)
    r20_off            = -10,                // r20 (r19 at  -9)

    // incoming C arguments r0-r7
    call_wrapper_off   =  -8,                // r0
    result_off         =  -7,                // r1
    result_type_off    =  -6,                // r2
    method_off         =  -5,                // r3
    entry_point_off    =  -4,                // r4 (parameters, r5, at -3)
    parameter_size_off =  -2,                // r6
    thread_off         =  -1,                // r7

    // frame link
    fp_f               =   0,                // saved fp
    retaddr_off        =   1,                // saved lr
  };
  359 
  360   address generate_call_stub(address& return_address) {
  361     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  362            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  363            "adjust this code");
  364 
  365     StubId stub_id = StubId::stubgen_call_stub_id;
  366     GrowableArray<address> entries;
  367     int entry_count = StubInfo::entry_count(stub_id);
  368     assert(entry_count == 2, "sanity check");
  369     address start = load_archive_data(stub_id, &entries);
  370     if (start != nullptr) {
  371       assert(entries.length() == 1, "expected 1 extra entry");
  372       return_address = entries.at(0);
  373       return start;
  374     }
  375     StubCodeMark mark(this, stub_id);
  376     start = __ pc();
  377 
  378     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  379 
  380     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  381     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  382     const Address result        (rfp, result_off         * wordSize);
  383     const Address result_type   (rfp, result_type_off    * wordSize);
  384     const Address method        (rfp, method_off         * wordSize);
  385     const Address entry_point   (rfp, entry_point_off    * wordSize);
  386     const Address parameter_size(rfp, parameter_size_off * wordSize);
  387 
  388     const Address thread        (rfp, thread_off         * wordSize);
  389 
  390     const Address d15_save      (rfp, d15_off * wordSize);
  391     const Address d13_save      (rfp, d13_off * wordSize);
  392     const Address d11_save      (rfp, d11_off * wordSize);
  393     const Address d9_save       (rfp, d9_off * wordSize);
  394 
  395     const Address r28_save      (rfp, r28_off * wordSize);
  396     const Address r26_save      (rfp, r26_off * wordSize);
  397     const Address r24_save      (rfp, r24_off * wordSize);
  398     const Address r22_save      (rfp, r22_off * wordSize);
  399     const Address r20_save      (rfp, r20_off * wordSize);
  400 
  401     // stub code
  402 
  403     address aarch64_entry = __ pc();
  404 
  405     // set up frame and move sp to end of save area
  406     __ enter();
  407     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  408 
  409     // save register parameters and Java scratch/global registers
  410     // n.b. we save thread even though it gets installed in
  411     // rthread because we want to sanity check rthread later
  412     __ str(c_rarg7,  thread);
  413     __ strw(c_rarg6, parameter_size);
  414     __ stp(c_rarg4, c_rarg5,  entry_point);
  415     __ stp(c_rarg2, c_rarg3,  result_type);
  416     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  417 
  418     __ stp(r20, r19,   r20_save);
  419     __ stp(r22, r21,   r22_save);
  420     __ stp(r24, r23,   r24_save);
  421     __ stp(r26, r25,   r26_save);
  422     __ stp(r28, r27,   r28_save);
  423 
  424     __ stpd(v9,  v8,   d9_save);
  425     __ stpd(v11, v10,  d11_save);
  426     __ stpd(v13, v12,  d13_save);
  427     __ stpd(v15, v14,  d15_save);
  428 
  429     __ get_fpcr(rscratch1);
  430     __ str(rscratch1, fpcr_save);
  431     // Set FPCR to the state we need. We do want Round to Nearest. We
  432     // don't want non-IEEE rounding modes or floating-point traps.
  433     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  434     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  435     __ set_fpcr(rscratch1);
  436 
  437     // install Java thread in global register now we have saved
  438     // whatever value it held
  439     __ mov(rthread, c_rarg7);
  440     // And method
  441     __ mov(rmethod, c_rarg3);
  442 
  443     // set up the heapbase register
  444     __ reinit_heapbase();
  445 
  446 #ifdef ASSERT
  447     // make sure we have no pending exceptions
  448     {
  449       Label L;
  450       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  451       __ cmp(rscratch1, (u1)NULL_WORD);
  452       __ br(Assembler::EQ, L);
  453       __ stop("StubRoutines::call_stub: entered with pending exception");
  454       __ BIND(L);
  455     }
  456 #endif
  457     // pass parameters if any
  458     __ mov(esp, sp);
  459     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  460     __ andr(sp, rscratch1, -2 * wordSize);
  461 
  462     BLOCK_COMMENT("pass parameters if any");
  463     Label parameters_done;
  464     // parameter count is still in c_rarg6
  465     // and parameter pointer identifying param 1 is in c_rarg5
  466     __ cbzw(c_rarg6, parameters_done);
  467 
  468     address loop = __ pc();
  469     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  470     __ subsw(c_rarg6, c_rarg6, 1);
  471     __ push(rscratch1);
  472     __ br(Assembler::GT, loop);
  473 
  474     __ BIND(parameters_done);
  475 
  476     // call Java entry -- passing methdoOop, and current sp
  477     //      rmethod: Method*
  478     //      r19_sender_sp: sender sp
  479     BLOCK_COMMENT("call Java function");
  480     __ mov(r19_sender_sp, sp);
  481     __ blr(c_rarg4);
  482 
  483     // we do this here because the notify will already have been done
  484     // if we get to the next instruction via an exception
  485     //
  486     // n.b. adding this instruction here affects the calculation of
  487     // whether or not a routine returns to the call stub (used when
  488     // doing stack walks) since the normal test is to check the return
  489     // pc against the address saved below. so we may need to allow for
  490     // this extra instruction in the check.
  491 
  492     // save current address for use by exception handling code
  493 
  494     return_address = __ pc();
  495     entries.append(return_address);
  496 
  497     // store result depending on type (everything that is not
  498     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  499     // n.b. this assumes Java returns an integral result in r0
  500     // and a floating result in j_farg0
  501     __ ldr(j_rarg2, result);
  502     Label is_long, is_float, is_double, exit;
  503     __ ldr(j_rarg1, result_type);
  504     __ cmp(j_rarg1, (u1)T_OBJECT);
  505     __ br(Assembler::EQ, is_long);
  506     __ cmp(j_rarg1, (u1)T_LONG);
  507     __ br(Assembler::EQ, is_long);
  508     __ cmp(j_rarg1, (u1)T_FLOAT);
  509     __ br(Assembler::EQ, is_float);
  510     __ cmp(j_rarg1, (u1)T_DOUBLE);
  511     __ br(Assembler::EQ, is_double);
  512 
  513     // handle T_INT case
  514     __ strw(r0, Address(j_rarg2));
  515 
  516     __ BIND(exit);
  517 
  518     // pop parameters
  519     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  520 
  521 #ifdef ASSERT
  522     // verify that threads correspond
  523     {
  524       Label L, S;
  525       __ ldr(rscratch1, thread);
  526       __ cmp(rthread, rscratch1);
  527       __ br(Assembler::NE, S);
  528       __ get_thread(rscratch1);
  529       __ cmp(rthread, rscratch1);
  530       __ br(Assembler::EQ, L);
  531       __ BIND(S);
  532       __ stop("StubRoutines::call_stub: threads must correspond");
  533       __ BIND(L);
  534     }
  535 #endif
  536 
  537     __ pop_cont_fastpath(rthread);
  538 
  539     // restore callee-save registers
  540     __ ldpd(v15, v14,  d15_save);
  541     __ ldpd(v13, v12,  d13_save);
  542     __ ldpd(v11, v10,  d11_save);
  543     __ ldpd(v9,  v8,   d9_save);
  544 
  545     __ ldp(r28, r27,   r28_save);
  546     __ ldp(r26, r25,   r26_save);
  547     __ ldp(r24, r23,   r24_save);
  548     __ ldp(r22, r21,   r22_save);
  549     __ ldp(r20, r19,   r20_save);
  550 
  551     // restore fpcr
  552     __ ldr(rscratch1,  fpcr_save);
  553     __ set_fpcr(rscratch1);
  554 
  555     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  556     __ ldrw(c_rarg2, result_type);
  557     __ ldr(c_rarg3,  method);
  558     __ ldp(c_rarg4, c_rarg5,  entry_point);
  559     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  560 
  561     // leave frame and return to caller
  562     __ leave();
  563     __ ret(lr);
  564 
  565     // handle return types different from T_INT
  566 
  567     __ BIND(is_long);
  568     __ str(r0, Address(j_rarg2, 0));
  569     __ br(Assembler::AL, exit);
  570 
  571     __ BIND(is_float);
  572     __ strs(j_farg0, Address(j_rarg2, 0));
  573     __ br(Assembler::AL, exit);
  574 
  575     __ BIND(is_double);
  576     __ strd(j_farg0, Address(j_rarg2, 0));
  577     __ br(Assembler::AL, exit);
  578 
  579     // record the stub entry and end plus the auxiliary entry
  580     store_archive_data(stub_id, start, __ pc(), &entries);
  581 
  582     return start;
  583   }
  584 
  585   // Return point for a Java call if there's an exception thrown in
  586   // Java code.  The exception is caught and transformed into a
  587   // pending exception stored in JavaThread that can be tested from
  588   // within the VM.
  589   //
  590   // Note: Usually the parameters are removed by the callee. In case
  591   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // stack pointer.
  594   //
  595   // r0: exception oop
  596 
  597   address generate_catch_exception() {
  598     StubId stub_id = StubId::stubgen_catch_exception_id;
  599     int entry_count = StubInfo::entry_count(stub_id);
  600     assert(entry_count == 1, "sanity check");
  601     address start = load_archive_data(stub_id);
  602     if (start != nullptr) {
  603       return start;
  604     }
  605     StubCodeMark mark(this, stub_id);
  606     start = __ pc();
  607 
  608     // same as in generate_call_stub():
  609     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  610     const Address thread        (rfp, thread_off         * wordSize);
  611 
  612 #ifdef ASSERT
  613     // verify that threads correspond
  614     {
  615       Label L, S;
  616       __ ldr(rscratch1, thread);
  617       __ cmp(rthread, rscratch1);
  618       __ br(Assembler::NE, S);
  619       __ get_thread(rscratch1);
  620       __ cmp(rthread, rscratch1);
  621       __ br(Assembler::EQ, L);
  622       __ bind(S);
  623       __ stop("StubRoutines::catch_exception: threads must correspond");
  624       __ bind(L);
  625     }
  626 #endif
  627 
  628     // set pending exception
  629     __ verify_oop(r0);
  630 
  631     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  632     // special case -- add file name string to AOT address table
  633     address file = (address)AOTCodeCache::add_C_string(__FILE__);
  634     __ lea(rscratch1, ExternalAddress(file));
  635     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  636     __ movw(rscratch1, (int)__LINE__);
  637     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  638 
  639     // complete return to VM
  640     assert(StubRoutines::_call_stub_return_address != nullptr,
  641            "_call_stub_return_address must have been generated before");
  642     __ b(RuntimeAddress(StubRoutines::_call_stub_return_address));
  643 
  644     // record the stub entry and end
  645     store_archive_data(stub_id, start, __ pc());
  646 
  647     return start;
  648   }
  649 
  650   // Continuation point for runtime calls returning with a pending
  651   // exception.  The pending exception check happened in the runtime
  652   // or native call stub.  The pending exception in Thread is
  653   // converted into a Java-level exception.
  654   //
  655   // Contract with Java-level exception handlers:
  656   // r0: exception
  657   // r3: throwing pc
  658   //
  659   // NOTE: At entry of this stub, exception-pc must be in LR !!
  660 
  661   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
  663 
  664   address generate_forward_exception() {
  665     StubId stub_id = StubId::stubgen_forward_exception_id;
  666     int entry_count = StubInfo::entry_count(stub_id);
  667     assert(entry_count == 1, "sanity check");
  668     address start = load_archive_data(stub_id);
  669     if (start != nullptr) {
  670       return start;
  671     }
  672     StubCodeMark mark(this, stub_id);
  673     start = __ pc();
  674 
  675     // Upon entry, LR points to the return address returning into
  676     // Java (interpreted or compiled) code; i.e., the return address
  677     // becomes the throwing pc.
  678     //
  679     // Arguments pushed before the runtime call are still on the stack
  680     // but the exception handler will reset the stack pointer ->
  681     // ignore them.  A potential result in registers can be ignored as
  682     // well.
  683 
  684 #ifdef ASSERT
  685     // make sure this code is only executed if there is a pending exception
  686     {
  687       Label L;
  688       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  689       __ cbnz(rscratch1, L);
  690       __ stop("StubRoutines::forward exception: no pending exception (1)");
  691       __ bind(L);
  692     }
  693 #endif
  694 
  695     // compute exception handler into r19
  696 
  697     // call the VM to find the handler address associated with the
  698     // caller address. pass thread in r0 and caller pc (ret address)
  699     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  700     // the stack.
  701     __ mov(c_rarg1, lr);
  702     // lr will be trashed by the VM call so we move it to R19
  703     // (callee-saved) because we also need to pass it to the handler
  704     // returned by this call.
  705     __ mov(r19, lr);
  706     BLOCK_COMMENT("call exception_handler_for_return_address");
  707     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  708                          SharedRuntime::exception_handler_for_return_address),
  709                     rthread, c_rarg1);
  710     // Reinitialize the ptrue predicate register, in case the external runtime
  711     // call clobbers ptrue reg, as we may return to SVE compiled code.
  712     __ reinitialize_ptrue();
  713 
  714     // we should not really care that lr is no longer the callee
  715     // address. we saved the value the handler needs in r19 so we can
  716     // just copy it to r3. however, the C2 handler will push its own
  717     // frame and then calls into the VM and the VM code asserts that
  718     // the PC for the frame above the handler belongs to a compiled
  719     // Java method. So, we restore lr here to satisfy that assert.
  720     __ mov(lr, r19);
  721     // setup r0 & r3 & clear pending exception
  722     __ mov(r3, r19);
  723     __ mov(r19, r0);
  724     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  725     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  726 
  727 #ifdef ASSERT
  728     // make sure exception is set
  729     {
  730       Label L;
  731       __ cbnz(r0, L);
  732       __ stop("StubRoutines::forward exception: no pending exception (2)");
  733       __ bind(L);
  734     }
  735 #endif
  736 
  737     // continue at exception handler
  738     // r0: exception
  739     // r3: throwing pc
  740     // r19: exception handler
  741     __ verify_oop(r0);
  742     __ br(r19);
  743 
  744     // record the stub entry and end
  745     store_archive_data(stub_id, start, __ pc());
  746 
  747     return start;
  748   }
  749 
  750   // Non-destructive plausibility checks for oops
  751   //
  752   // Arguments:
  753   //    r0: oop to verify
  754   //    rscratch1: error message
  755   //
  756   // Stack after saving c_rarg3:
  757   //    [tos + 0]: saved c_rarg3
  758   //    [tos + 1]: saved c_rarg2
  759   //    [tos + 2]: saved lr
  760   //    [tos + 3]: saved rscratch2
  761   //    [tos + 4]: saved r0
  762   //    [tos + 5]: saved rscratch1
        //
        // NOTE(review): the code below only pushes the c_rarg3/c_rarg2 pair
        // (16 bytes); the remaining slots listed above are presumably pushed
        // by the code that calls this stub - confirm against the caller.
        //
        // Returns the stub entry address: the one supplied by
        // load_archive_data() when non-null, otherwise freshly generated code.
  763   address generate_verify_oop() {
  764     StubId stub_id = StubId::stubgen_verify_oop_id;
  765     int entry_count = StubInfo::entry_count(stub_id);
  766     assert(entry_count == 1, "sanity check");
  767     address start = load_archive_data(stub_id);
  768     if (start != nullptr) {
  769       return start;
  770     }
  771     StubCodeMark mark(this, stub_id);
  772     start = __ pc();
  773 
  774     Label exit, error;
  775 
  776     // save c_rarg2 and c_rarg3
  777     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  778 
          // bump the verify_oop call counter with a plain load/add/store
          // sequence, using the two registers just saved as temporaries
  779     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  780     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  781     __ ldr(c_rarg3, Address(c_rarg2));
  782     __ add(c_rarg3, c_rarg3, 1);
  783     __ str(c_rarg3, Address(c_rarg2));
  784 
  785     // object is in r0
  786     // make sure object is 'reasonable'
  787     __ cbz(r0, exit); // if obj is null it is OK
  788 
          // delegate the actual check to the active barrier set's assembler;
          // it is handed the 'error' label to branch to on failure and
          // c_rarg2/c_rarg3 (presumably as scratch registers)
  789     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  790     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  791 
  792     // return if everything seems ok
  793     __ bind(exit);
  794 
  795     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  796     __ ret(lr);
  797 
  798     // handle errors
  799     __ bind(error);
          // undo our own spill first so c_rarg2/c_rarg3 hold their entry
          // values and sp is back where it was on entry to the stub
  800     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  801 
          // dump r0..r29 on the stack; sp then addresses the register array
          // handed to debug64 below
  802     __ push(RegSet::range(r0, r29), sp);
  803     // debug(char* msg, int64_t pc, int64_t regs[])
  804     __ mov(c_rarg0, rscratch1);      // pass address of error message
  805     __ mov(c_rarg1, lr);             // pass return address
  806     __ mov(c_rarg2, sp);             // pass address of regs on stack
  807 #ifndef PRODUCT
  808     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  809 #endif
  810     BLOCK_COMMENT("call MacroAssembler::debug");
  811     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  812     __ blr(rscratch1);
          // halt if the debug64 call ever returns
  813     __ hlt(0);
  814 
  815     // record the stub entry and end
  816     store_archive_data(stub_id, start, __ pc());
  817 
  818     return start;
  819   }
  820 
  821   // Generate indices for iota vector.
        //
        // Emits six consecutive 16-byte constant tables (B, H, S, D, S-FP,
        // D-FP) and publishes the start address of each one in
        // StubRoutines::aarch64::_vector_iota_indices[0 .. VECTOR_IOTA_COUNT-1].
        // Slot 0 is the stub start; the remaining slots come from 'entries',
        // which receives one address per table after the first (five appends
        // below, which is what the entry_count - 1 asserts check).
        //
        // If load_archive_data() supplies the start address (and fills
        // 'entries'), nothing is emitted and only the array is populated.
  822   void generate_iota_indices(StubId stub_id) {
  823     GrowableArray<address> entries;
  824     int entry_count = StubInfo::entry_count(stub_id);
  825     assert(entry_count == VECTOR_IOTA_COUNT, "sanity check");
  826     address start = load_archive_data(stub_id, &entries);
  827     if (start != nullptr) {
            // existing data: install the addresses without generating anything
  828       assert(entries.length() == entry_count - 1,
  829              "unexpected entries count %d", entries.length());
  830       StubRoutines::aarch64::_vector_iota_indices[0] = start;
  831       for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
  832         StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
  833       }
  834       return;
  835     }
  836     __ align(CodeEntryAlignment);
  837     StubCodeMark mark(this, stub_id);
  838     start = __ pc();
          // Each table is two 64-bit words; read least-significant element
          // first, the lanes count up from 0.
  839     // B
  840     __ emit_data64(0x0706050403020100, relocInfo::none);
  841     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
          // the current pc is the start of the next table
  842     entries.append(__ pc());
  843     // H
  844     __ emit_data64(0x0003000200010000, relocInfo::none);
  845     __ emit_data64(0x0007000600050004, relocInfo::none);
  846     entries.append(__ pc());
  847     // S
  848     __ emit_data64(0x0000000100000000, relocInfo::none);
  849     __ emit_data64(0x0000000300000002, relocInfo::none);
  850     entries.append(__ pc());
  851     // D
  852     __ emit_data64(0x0000000000000000, relocInfo::none);
  853     __ emit_data64(0x0000000000000001, relocInfo::none);
  854     entries.append(__ pc());
  855     // S - FP
  856     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  857     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  858     entries.append(__ pc());
  859     // D - FP
  860     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  861     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  862 
  863     // record the stub entry and end
  864     store_archive_data(stub_id, start, __ pc(), &entries);
  865 
  866     // install the entry addresses in the entry array
  867     assert(entries.length() == entry_count - 1,
  868            "unexpected entries count %d", entries.length());
  869     StubRoutines::aarch64::_vector_iota_indices[0] = start;
  870     for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
  871       StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
  872     }
  873   }
  874 
  875   // The inner part of zero_words().  This is the bulk operation,
  876   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  877   // caller is responsible for zeroing the last few words.
  878   //
  879   // Inputs:
  880   // r10: the HeapWord-aligned base address of an array to zero.
  881   // r11: the count in HeapWords, r11 > 0.
  882   //
  883   // Returns r10 and r11, adjusted for the caller to clear.
  884   // r10: the base address of the tail of words left to clear.
  885   // r11: the number of words in the tail.
  886   //      r11 < MacroAssembler::zero_words_block_size.
        //
        // The C++ return value is the stub entry address (the one supplied by
        // load_archive_data() when non-null, otherwise freshly generated code).
        // rscratch1 is written by the generated code when UseBlockZeroing is
        // set.
  887 
  888   address generate_zero_blocks() {
  889     StubId stub_id = StubId::stubgen_zero_blocks_id;
  890     int entry_count = StubInfo::entry_count(stub_id);
  891     assert(entry_count == 1, "sanity check");
  892     address start = load_archive_data(stub_id);
  893     if (start != nullptr) {
  894       return start;
  895     }
  896     __ align(CodeEntryAlignment);
  897     StubCodeMark mark(this, stub_id);
  898     Label done;
  899     Label base_aligned;
  900 
  901     Register base = r10, cnt = r11;
  902 
  903     start = __ pc();
  904 
  905     if (UseBlockZeroing) {
  906       int zva_length = VM_Version::zva_length();
  907 
  908       // Ensure ZVA length can be divided by 16. This is required by
  909       // the subsequent operations.
  910       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  911 
            // base is 8-byte aligned on entry, so bit 3 tells whether it is
            // also 16-byte aligned. If not, zero one word by hand (advancing
            // base by 8) and account for it in cnt.
  912       __ tbz(base, 3, base_aligned);
  913       __ str(zr, Address(__ post(base, 8)));
  914       __ sub(cnt, cnt, 1);
  915       __ bind(base_aligned);
  916 
  917       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  918       // alignment.
  919       Label small;
  920       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
            // cnt is in words, hence the >> 3 (presumably low_limit is a byte
            // count - confirm). Only the flags of this subtraction are used;
            // the difference in rscratch1 is not read again in this stub.
  921       __ subs(rscratch1, cnt, low_limit >> 3);
  922       __ br(Assembler::LT, small);
            // NOTE(review): zero_dcache_blocks is expected to leave base/cnt
            // describing whatever it did not zero - confirm in MacroAssembler
  923       __ zero_dcache_blocks(base, cnt);
  924       __ bind(small);
  925     }
  926 
  927     {
  928       // Number of stp instructions we'll unroll
  929       const int unroll =
  930         MacroAssembler::zero_words_block_size / 2;
  931       // Clear the remaining blocks.
  932       Label loop;
            // cnt is kept biased by -(unroll * 2) while looping so that the
            // sign of the subs result says whether another full block remains
  933       __ subs(cnt, cnt, unroll * 2);
  934       __ br(Assembler::LT, done);
  935       __ bind(loop);
            // each stp zeroes two words and post-increments base by 16 bytes
  936       for (int i = 0; i < unroll; i++)
  937         __ stp(zr, zr, __ post(base, 16));
  938       __ subs(cnt, cnt, unroll * 2);
  939       __ br(Assembler::GE, loop);
  940       __ bind(done);
            // remove the bias: cnt is now the number of tail words left
  941       __ add(cnt, cnt, unroll * 2);
  942     }
  943 
  944     __ ret(lr);
  945 
  946     // record the stub entry and end
  947     store_archive_data(stub_id, start, __ pc());
  948 
  949     return start;
  950   }
  951 
  952 
        // Direction of a bulk copy. The numeric values matter: the copy stubs
        // multiply by the enumerator (e.g. wordSize * direction) to obtain a
        // signed per-word step.
  953   typedef enum {
  954     copy_forwards = 1,
  955     copy_backwards = -1
  956   } copy_direction;
  957 
  958   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  959   // for arraycopy stubs.
        //
        // It captures the MacroAssembler, decorators, element type and a fixed
        // set of temporary registers once, and forwards them to
        // BarrierSetAssembler::copy_load_at / copy_store_at together with a
        // size argument of 32, 16 or 8 (presumably the access size in bytes;
        // the register pairs used match: two FloatRegisters for 32, two
        // Registers for 16, one Register for 8).
  960   class ArrayCopyBarrierSetHelper : StackObj {
  961     BarrierSetAssembler* _bs_asm;
  962     MacroAssembler* _masm;
  963     DecoratorSet _decorators;
  964     BasicType _type;
          // general-purpose temporaries handed to the barrier calls
  965     Register _gct1;
  966     Register _gct2;
  967     Register _gct3;
          // vector temporaries handed to the 32-sized barrier calls
  968     FloatRegister _gcvt1;
  969     FloatRegister _gcvt2;
  970     FloatRegister _gcvt3;
  971 
  972   public:
          // The BarrierSetAssembler is looked up from the current BarrierSet at
          // construction time; everything else is stored as given.
  973     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  974                               DecoratorSet decorators,
  975                               BasicType type,
  976                               Register gct1,
  977                               Register gct2,
  978                               Register gct3,
  979                               FloatRegister gcvt1,
  980                               FloatRegister gcvt2,
  981                               FloatRegister gcvt3)
  982       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  983         _masm(masm),
  984         _decorators(decorators),
  985         _type(type),
  986         _gct1(gct1),
  987         _gct2(gct2),
  988         _gct3(gct3),
  989         _gcvt1(gcvt1),
  990         _gcvt2(gcvt2),
  991         _gcvt3(gcvt3) {
  992     }
  993 
          // Size-32 load from src into the FloatRegister pair dst1/dst2.
          // Passes _gct1, _gct2 and _gcvt1 as temporaries.
  994     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  995       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  996                             dst1, dst2, src,
  997                             _gct1, _gct2, _gcvt1);
  998     }
  999 
          // Size-32 store of the FloatRegister pair src1/src2 to dst.
          // Passes all three general and all three vector temporaries.
 1000     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 1001       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 1002                              dst, src1, src2,
 1003                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 1004     }
 1005 
          // Size-16 load from src into the Register pair dst1/dst2.
          // Passes only _gct1 as a temporary.
 1006     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 1007       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 1008                             dst1, dst2, src,
 1009                             _gct1);
 1010     }
 1011 
          // Size-16 store of the Register pair src1/src2 to dst.
          // Passes _gct1, _gct2 and _gct3 as temporaries.
 1012     void copy_store_at_16(Address dst, Register src1, Register src2) {
 1013       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 1014                              dst, src1, src2,
 1015                              _gct1, _gct2, _gct3);
 1016     }
 1017 
          // Size-8 load from src into dst; the second destination is noreg.
 1018     void copy_load_at_8(Register dst, Address src) {
 1019       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 1020                             dst, noreg, src,
 1021                             _gct1);
 1022     }
 1023 
          // Size-8 store of src to dst; the second source is noreg.
 1024     void copy_store_at_8(Address dst, Register src) {
 1025       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 1026                              dst, src, noreg,
 1027                              _gct1, _gct2, _gct3);
 1028     }
 1029   };
 1030 
 1031   // Bulk copy of blocks of 8 words.
 1032   //
 1033   // count is a count of words.
 1034   //
 1035   // Precondition: count >= 8
 1036   //
 1037   // Postconditions:
 1038   //
 1039   // The least significant bit of count contains the remaining count
 1040   // of words to copy.  The rest of count is trash.
 1041   //
 1042   // s and d are adjusted to point to the remaining words to copy
 1043   //
        // Parameters:
        //   stub_id    - selects element type and copy direction (see the
        //                switch below); any other id hits ShouldNotReachHere()
        //   decorators - passed through to the GC barrier helper
        //   s, d       - registers holding the source / destination address
        //   count      - register holding the word count
        //
        // Returns the stub entry address (the one supplied by
        // load_archive_data() when non-null, otherwise freshly generated code).
        //
        // Temporaries used by the generated code: r3-r7 and r11-r13 (t0-t7),
        // v0-v3 on the SIMD path, r14 (stride) for backwards copies with a
        // prefetch interval > 256, plus rscratch1, rscratch2, r10, v6, v7 and
        // v16 which are handed to the GC barrier helper.
 1044   address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
 1045     int entry_count = StubInfo::entry_count(stub_id);
 1046     assert(entry_count == 1, "sanity check");
 1047     address start = load_archive_data(stub_id);
 1048     if (start != nullptr) {
 1049       return start;
 1050     }
 1051     BasicType type;
 1052     copy_direction direction;
 1053 
 1054     switch (stub_id) {
 1055     case StubId::stubgen_copy_byte_f_id:
 1056       direction = copy_forwards;
 1057       type = T_BYTE;
 1058       break;
 1059     case StubId::stubgen_copy_byte_b_id:
 1060       direction = copy_backwards;
 1061       type = T_BYTE;
 1062       break;
 1063     case StubId::stubgen_copy_oop_f_id:
 1064       direction = copy_forwards;
 1065       type = T_OBJECT;
 1066       break;
 1067     case StubId::stubgen_copy_oop_b_id:
 1068       direction = copy_backwards;
 1069       type = T_OBJECT;
 1070       break;
 1071     case StubId::stubgen_copy_oop_uninit_f_id:
 1072       direction = copy_forwards;
 1073       type = T_OBJECT;
 1074       break;
 1075     case StubId::stubgen_copy_oop_uninit_b_id:
 1076       direction = copy_backwards;
 1077       type = T_OBJECT;
 1078       break;
 1079     default:
 1080       ShouldNotReachHere();
 1081     }
 1082 
          // unit is the signed byte step of one word in the copy direction.
          // bias is how far s and d are moved back for a forwards copy so that
          // the same positive multiples of unit address both directions.
 1083     int unit = wordSize * direction;
 1084     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 1085 
 1086     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 1087       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 1088     const Register stride = r14;
 1089     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1090     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1091     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1092 
 1093     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 1094     assert_different_registers(s, d, count, rscratch1, rscratch2);
 1095 
 1096     Label again, drain;
 1097 
 1098     __ align(CodeEntryAlignment);
 1099 
 1100     StubCodeMark mark(this, stub_id);
 1101 
 1102     start = __ pc();
 1103 
          // when unaligned accesses are to be avoided, a destination that is
          // only 8-byte aligned (bit 3 set) takes the separate path emitted
          // after the main body
 1104     Label unaligned_copy_long;
 1105     if (AvoidUnalignedAccesses) {
 1106       __ tbnz(d, 3, unaligned_copy_long);
 1107     }
 1108 
 1109     if (direction == copy_forwards) {
 1110       __ sub(s, s, bias);
 1111       __ sub(d, d, bias);
 1112     }
 1113 
 1114 #ifdef ASSERT
 1115     // Make sure we are never given < 8 words
          // NOTE: the unaligned path has already branched away above, so this
          // check only covers the aligned path.
 1116     {
 1117       Label L;
 1118       __ cmp(count, (u1)8);
 1119       __ br(Assembler::GE, L);
 1120       __ stop("generate_copy_longs called with < 8 words");
 1121       __ bind(L);
 1122     }
 1123 #endif
 1124 
 1125     // Fill 8 registers
 1126     if (UseSIMDForMemoryOps) {
 1127       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 1128       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 1129     } else {
 1130       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1131       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1132       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1133       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1134     }
 1135 
          // 8 words are already loaded; the loop below needs a further 8 to
          // load per iteration, so go straight to the drain if fewer than 16
          // words were requested
 1136     __ subs(count, count, 16);
 1137     __ br(Assembler::LO, drain);
 1138 
 1139     int prefetch = PrefetchCopyIntervalInBytes;
 1140     bool use_stride = false;
 1141     if (direction == copy_backwards) {
            // backwards copies prefetch at a negative offset; offsets beyond
            // 256 are materialized in the stride register instead of being
            // used as an immediate
 1142       use_stride = prefetch > 256;
 1143       prefetch = -prefetch;
 1144       if (use_stride) __ mov(stride, prefetch);
 1145     }
 1146 
 1147     __ bind(again);
 1148 
 1149     if (PrefetchCopyIntervalInBytes > 0)
 1150       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1151 
          // software-pipelined body: store the 8 words loaded previously
          // while loading the next 8 into the same registers
 1152     if (UseSIMDForMemoryOps) {
 1153       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 1154       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 1155       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 1156       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 1157     } else {
 1158       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 1159       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1160       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 1161       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1162       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 1163       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1164       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 1165       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1166     }
 1167 
 1168     __ subs(count, count, 8);
 1169     __ br(Assembler::HS, again);
 1170 
 1171     // Drain
          // store the last 8 words that were loaded but not yet written
 1172     __ bind(drain);
 1173     if (UseSIMDForMemoryOps) {
 1174       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 1175       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 1176     } else {
 1177       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 1178       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 1179       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 1180       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 1181     }
 1182 
          // bits 2 and 1 of count say whether a 4 word and/or a 2 word
          // subblock remains to be copied
 1183     {
 1184       Label L1, L2;
 1185       __ tbz(count, exact_log2(4), L1);
 1186       if (UseSIMDForMemoryOps) {
 1187         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 1188         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 1189       } else {
 1190         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1191         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1192         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 1193         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 1194       }
 1195       __ bind(L1);
 1196 
            // undo the bias applied on entry for forwards copies
 1197       if (direction == copy_forwards) {
 1198         __ add(s, s, bias);
 1199         __ add(d, d, bias);
 1200       }
 1201 
 1202       __ tbz(count, 1, L2);
 1203       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 1204       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 1205       __ bind(L2);
 1206     }
 1207 
 1208     __ ret(lr);
 1209 
 1210     if (AvoidUnalignedAccesses) {
 1211       Label drain, again;
 1212       // Register order for storing. Order is different for backward copy.
 1213 
 1214       __ bind(unaligned_copy_long);
 1215 
 1216       // source address is even aligned, target odd aligned
 1217       //
 1218       // when forward copying word pairs we read long pairs at offsets
 1219       // {0, 2, 4, 6} (in long words). when backwards copying we read
 1220       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 1221       // address by -2 in the forwards case so we can compute the
 1222       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 1223       // or -1.
 1224       //
 1225       // when forward copying we need to store 1 word, 3 pairs and
 1226       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 1227       // zero offset We adjust the destination by -1 which means we
 1228       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 1229       //
 1230       // When backwards copying we need to store 1 word, 3 pairs and
 1231       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 1232       // offsets {1, 3, 5, 7, 8} * unit.
 1233 
 1234       if (direction == copy_forwards) {
 1235         __ sub(s, s, 16);
 1236         __ sub(d, d, 8);
 1237       }
 1238 
 1239       // Fill 8 registers
 1240       //
 1241       // for forwards copy s was offset by -16 from the original input
 1242       // value of s so the register contents are at these offsets
 1243       // relative to the 64 byte block addressed by that original input
 1244       // and so on for each successive 64 byte block when s is updated
 1245       //
 1246       // t0 at offset 0,  t1 at offset 8
 1247       // t2 at offset 16, t3 at offset 24
 1248       // t4 at offset 32, t5 at offset 40
 1249       // t6 at offset 48, t7 at offset 56
 1250 
 1251       // for backwards copy s was not offset so the register contents
 1252       // are at these offsets into the preceding 64 byte block
 1253       // relative to that original input and so on for each successive
 1254       // preceding 64 byte block when s is updated. this explains the
 1255       // slightly counter-intuitive looking pattern of register usage
 1256       // in the stp instructions for backwards copy.
 1257       //
 1258       // t0 at offset -16, t1 at offset -8
 1259       // t2 at offset -32, t3 at offset -24
 1260       // t4 at offset -48, t5 at offset -40
 1261       // t6 at offset -64, t7 at offset -56
 1262 
 1263       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1264       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1265       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1266       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1267 
 1268       __ subs(count, count, 16);
 1269       __ br(Assembler::LO, drain);
 1270 
 1271       int prefetch = PrefetchCopyIntervalInBytes;
 1272       bool use_stride = false;
 1273       if (direction == copy_backwards) {
 1274         use_stride = prefetch > 256;
 1275         prefetch = -prefetch;
 1276         if (use_stride) __ mov(stride, prefetch);
 1277       }
 1278 
 1279       __ bind(again);
 1280 
 1281       if (PrefetchCopyIntervalInBytes > 0)
 1282         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1283 
 1284       if (direction == copy_forwards) {
 1285         // allowing for the offset of -8 the store instructions place
 1286         // registers into the target 64 byte block at the following
 1287         // offsets
 1288         //
 1289         // t0 at offset 0
 1290         // t1 at offset 8,  t2 at offset 16
 1291         // t3 at offset 24, t4 at offset 32
 1292         // t5 at offset 40, t6 at offset 48
 1293         // t7 at offset 56
 1294 
 1295         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1296         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1297         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1298         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1299         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1300         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1301         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1302         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1303         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1304       } else {
 1305         // d was not offset when we started so the registers are
 1306         // written into the 64 byte block preceding d with the following
 1307         // offsets
 1308         //
 1309         // t1 at offset -8
 1310         // t3 at offset -24, t0 at offset -16
 1311         // t5 at offset -40, t2 at offset -32
 1312         // t7 at offset -56, t4 at offset -48
 1313         //                   t6 at offset -64
 1314         //
 1315         // note that this matches the offsets previously noted for the
 1316         // loads
 1317 
 1318         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1319         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1320         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1321         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1322         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1323         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1324         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1325         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1326         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1327       }
 1328 
 1329       __ subs(count, count, 8);
 1330       __ br(Assembler::HS, again);
 1331 
 1332       // Drain
 1333       //
 1334       // this uses the same pattern of offsets and register arguments
 1335       // as above
 1336       __ bind(drain);
 1337       if (direction == copy_forwards) {
 1338         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1339         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1340         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1341         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1342         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1343       } else {
 1344         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1345         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1346         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1347         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1348         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1349       }
 1350       // now we need to copy any remaining part block which may
 1351       // include a 4 word block subblock and/or a 2 word subblock.
 1352       // bits 2 and 1 in the count are the tell-tale for whether we
 1353       // have each such subblock
 1354       {
 1355         Label L1, L2;
 1356         __ tbz(count, exact_log2(4), L1);
 1357         // this is the same as above but copying only 4 longs hence
 1358         // with only one intervening stp between the str instructions
 1359         // but note that the offsets and registers still follow the
 1360         // same pattern
 1361         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1362         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1363         if (direction == copy_forwards) {
 1364           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1365           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1366           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1367         } else {
 1368           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1369           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1370           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1371         }
 1372         __ bind(L1);
 1373 
 1374         __ tbz(count, 1, L2);
 1375         // this is the same as above but copying only 2 longs hence
 1376         // there is no intervening stp between the str instructions
 1377         // but note that the offset and register patterns are still
 1378         // the same
 1379         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1380         if (direction == copy_forwards) {
 1381           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1382           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1383         } else {
 1384           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1385           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1386         }
 1387         __ bind(L2);
 1388 
 1389         // for forwards copy we need to re-adjust the offsets we
 1390         // applied so that s and d follow the last words written
 1391 
 1392         if (direction == copy_forwards) {
 1393           __ add(s, s, 16);
 1394           __ add(d, d, 8);
 1395         }
 1396 
 1397       }
 1398 
 1399       __ ret(lr);
 1400     }
 1401 
 1402     // record the stub entry and end
 1403     store_archive_data(stub_id, start, __ pc());
 1404 
 1405     return start;
 1406   }
 1407 
 1408   // Small copy: less than 16 bytes.
 1409   //
 1410   // NB: Ignores all of the bits of count which represent more than 15
 1411   // bytes, so a caller doesn't have to mask them.
        //
        // Emits the code inline (there is no stub entry and no return
        // instruction). step is the signed element size in bytes: its sign
        // gives the copy direction and its magnitude the granularity. count
        // is in units of that granularity.
        //
        // At most one 8, one 4, one 2 and one 1 byte piece are copied, each
        // guarded by a test of the count bit that represents that many
        // bytes. Pieces smaller than the granularity are not emitted at all.
        // r3 is used as the data temporary.
 1412 
 1413   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1414     bool is_backwards = step < 0;
 1415     size_t granularity = g_uabs(step);
 1416     int direction = is_backwards ? -1 : 1;
 1417 
          // each label marks the point just after the piece it is named for
 1418     Label Lword, Lint, Lshort, Lbyte;
 1419 
 1420     assert(granularity
 1421            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1422 
 1423     const Register t0 = r3;
 1424     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1425     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1426 
 1427     // ??? I don't know if this bit-test-and-branch is the right thing
 1428     // to do.  It does a lot of jumping, resulting in several
 1429     // mispredicted branches.  It might make more sense to do this
 1430     // with something like Duff's device with a single computed branch.
 1431 
          // 8 bytes: the count bit worth 8 bytes is bit (3 - log2(granularity)).
          // Only this piece goes through the GC barrier helper; the smaller
          // pieces below use plain loads and stores.
          // NOTE(review): adjust() presumably pre-decrements the address for
          // backwards copies and post-increments it for forwards copies -
          // confirm in MacroAssembler.
 1432     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1433     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1434     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1435     __ bind(Lword);
 1436 
          // 4 bytes
 1437     if (granularity <= sizeof (jint)) {
 1438       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1439       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1440       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1441       __ bind(Lint);
 1442     }
 1443 
          // 2 bytes
 1444     if (granularity <= sizeof (jshort)) {
 1445       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1446       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1447       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1448       __ bind(Lshort);
 1449     }
 1450 
          // 1 byte
 1451     if (granularity <= sizeof (jbyte)) {
 1452       __ tbz(count, 0, Lbyte);
 1453       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1454       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1455       __ bind(Lbyte);
 1456     }
 1457   }
 1458 
  // All-singing all-dancing memory copy.
  //
  // Emits code that copies count units of memory from s to d.  The size
  // of a unit is |step| bytes (the "granularity"); the sign of step
  // selects the direction of copy (negative means backwards).
  //
  // Copies of at most 80 bytes (96 when UseSIMDForMemoryOps is set) are
  // done inline: all the data is loaded before any of it is stored, so
  // the direction does not matter for them.  Larger copies first bring
  // s to a 2-word boundary -- by copying at most one word when
  // is_aligned is true, otherwise by copying the unaligned head with
  // copy_memory_small -- then call one of the StubRoutines::aarch64
  // bulk copy stubs with the word count in r15, and finally copy the
  // remaining tail with copy_memory_small.
  //
  // s, d and count are modified by the generated code.  The registers
  // used as temporaries are the ones named in the declarations below.
  //

  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = g_uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    // send/dend hold the one-past-the-end addresses of source and destination.
    const Register send = r17, dend = r16;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    // All size thresholds below are byte counts divided by the granularity,
    // because count is expressed in units rather than bytes.
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    // send = s + count * granularity, dend = d + count * granularity
    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    // Copy the leading and the trailing half; the two may overlap in the middle.
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        // 65..80 bytes: the last 16 bytes go through general purpose registers.
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    // The sub-8-byte cases only exist for units smaller than 8 bytes.  For
    // larger units the copy8/copy4 labels are bound at the very end instead.
    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          // Exactly one jshort is left (count == 1).
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      // A backwards copy starts from the end: point s and d just past
      // the last unit.
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // can not be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
      bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
      // One word has been copied; account for it in units.
      __ sub(count, count, wordSize/granularity);
    } else {
      // Forwards we need the distance up to the next 2-word boundary
      // ((-s) mod 16); backwards the distance down to the previous one
      // (s mod 16).
      if (is_backwards) {
        __ andr(r15, s, 2 * wordSize - 1);
      } else {
        __ neg(r15, s);
        __ andr(r15, r15, 2 * wordSize - 1);
      }
      // r15 is the byte adjustment needed to align s.
      __ cbz(r15, aligned);
      // Convert the byte adjustment into a number of units.
      int shift = exact_log2(granularity);
      if (shift > 0) {
        __ lsr(r15, r15, shift);
      }
      __ sub(count, count, r15);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, r15);
        __ sub(d, d, r15);
      } else {
        __ add(s, s, r15);
        __ add(d, d, r15);
      }
#else
      // Copy the r15 units that precede the 2-word boundary.
      copy_memory_small(decorators, type, s, d, r15, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes. Adjust the
    // count and do a bulk copy of words. If the shift is zero
    // perform a move instead to benefit from zero latency moves.
    int shift = exact_log2(wordSize/granularity);
    if (shift > 0) {
      __ lsr(r15, count, shift);
    } else {
      __ mov(r15, count);
    }
    // Call the bulk copy stub matching the direction, the element type and,
    // for oops, whether the destination is uninitialized.
    if (direction == copy_forwards) {
      if (type != T_OBJECT) {
        __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_f()));
        __ blr(rscratch1);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_f()));
        __ blr(rscratch1);
      } else {
        __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_f()));
        __ blr(rscratch1);
      }
    } else {
      if (type != T_OBJECT) {
        __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_b()));
        __ blr(rscratch1);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_b()));
        __ blr(rscratch1);
      } else {
        __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_b()));
        __ blr(rscratch1);
      }
    }

    // And the tail.
    copy_memory_small(decorators, type, s, d, count, step);

    // For units of 8 (resp. 4) bytes or more the only way to reach copy8
    // (resp. copy4) from the inline path is with count == 0, so there is
    // nothing left to copy and the labels simply alias finish.
    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }
 1725 
 1726 
 1727   void clobber_registers() {
 1728 #ifdef ASSERT
 1729     RegSet clobbered
 1730       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1731     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1732     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1733     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1734       __ mov(*it, rscratch1);
 1735     }
 1736 #endif
 1737 
 1738   }
 1739 
 1740   // Scan over array at a for count oops, verifying each one.
 1741   // Preserves a and count, clobbers rscratch1 and rscratch2.
 1742   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1743     Label loop, end;
 1744     __ mov(rscratch1, a);
 1745     __ mov(rscratch2, zr);
 1746     __ bind(loop);
 1747     __ cmp(rscratch2, count);
 1748     __ br(Assembler::HS, end);
 1749     if (size == wordSize) {
 1750       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1751       __ verify_oop(temp);
 1752     } else {
 1753       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1754       __ decode_heap_oop(temp); // calls verify_oop
 1755     }
 1756     __ add(rscratch2, rscratch2, 1);
 1757     __ b(loop);
 1758     __ bind(end);
 1759   }
 1760 
  // Generate (or load from the archive) a disjoint array copy stub, i.e.
  // one that copies forwards with copy_memory.
  //
  // Arguments:
  //   stub_id - is used to name the stub and identify all details of
  //             how to perform the copy (element size, alignment,
  //             whether the elements are oops and whether the
  //             destination is uninitialized).
  //
  //   nopush_entry - is assigned to the stub's post push entry point.
  //                  Must not be null: every disjoint copy stub
  //                  exports this entry.
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // Output:
  //   r0        - always 0
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects: nopush_entry is set to the (post push) entry point
  //               so it can be used by the corresponding conjoint
  //               copy method
  //
  // Returns the start address of the stub.
  //
  address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
    int size;                 // element size in bytes
    bool aligned;             // passed to copy_memory as is_aligned
    bool is_oop;              // elements are oops, GC barriers are needed
    bool dest_uninitialized;  // adds IS_DEST_UNINITIALIZED to the decorators
    switch (stub_id) {
    case StubId::stubgen_jbyte_disjoint_arraycopy_id:
      size = sizeof(jbyte);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
      size = sizeof(jbyte);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_jshort_disjoint_arraycopy_id:
      size = sizeof(jshort);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
      size = sizeof(jshort);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_jint_disjoint_arraycopy_id:
      size = sizeof(jint);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
      size = sizeof(jint);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_jlong_disjoint_arraycopy_id:
      // since this is always aligned we can (should!) use the same
      // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
      ShouldNotReachHere();
      break;
    case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
      size = sizeof(jlong);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    // All four oop variants share the same element size and alignment,
    // both of which depend on UseCompressedOops; they differ only in
    // dest_uninitialized.
    case StubId::stubgen_oop_disjoint_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
      break;
    }
    // all stubs provide a 2nd entry which omits the frame push for
    // use when bailing out from a conjoint copy. However we may also
    // need some extra addresses for memory access protection.
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 2, "sanity check");
    assert(nopush_entry != nullptr, "all disjoint copy stubs export a nopush entry");

    // Unsafe access handlers are recorded only for non-oop copies that
    // are either not aligned or copy jlong-sized elements.
    bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
    // NOTE(review): presumably each handler occupies COLUMN_COUNT
    // addresses in the extras list -- confirm against UnsafeMemoryAccess.
    int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
    GrowableArray<address> entries;
    GrowableArray<address> extras;
    GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
    // If the stub can be loaded from the archive there is nothing to
    // generate: just publish the saved entry and handler addresses.
    address start = load_archive_data(stub_id, &entries, extras_ptr);
    if (start != nullptr) {
      assert(entries.length() == entry_count - 1,
             "unexpected entries count %d", entries.length());
      *nopush_entry = entries.at(0);
      assert(extras.length() == extra_count,
             "unexpected extra count %d", extras.length());
      if (add_extras) {
        // register one handler at offset 0
        register_unsafe_access_handlers(extras, 0, 1);
      }
      return start;
    }

    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();
    __ enter();

    // The post push entry is the address right after the frame push.
    *nopush_entry = __ pc();
    entries.append(*nopush_entry);

    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Post-Push Entry:");

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      // (d and count are needed again below for verification and the
      // barrier epilogue)
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      UnsafeMemoryAccessMark umam(this, add_extras, true);
      // A positive step requests a forwards copy.
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);

    address end = __ pc();

    if (add_extras) {
      // retrieve the registered handler addresses
      retrieve_unsafe_access_handlers(start, end, extras);
      assert(extras.length() == extra_count
             , "incorrect handlers count %d", extras.length());
    }

    // record the stub entry and end plus the no_push entry and any
    // extra handler addresses
    store_archive_data(stub_id, start, end, &entries, extras_ptr);

    return start;
  }
 1950 
  // Generate (or load from the archive) a conjoint array copy stub, i.e.
  // one that tolerates overlapping source and destination ranges.  The
  // stub jumps to the disjoint stub when a forwards copy is possible and
  // otherwise copies backwards with copy_memory.
  //
  // Arguments:
  //   stub_id - is used to name the stub and identify all details of
  //             how to perform the copy.
  //
  //   nooverlap_target - identifies the (post push) entry for the
  //             corresponding disjoint copy routine which can be
  //             jumped to if the ranges do not actually overlap
  //
  //   nopush_entry - is assigned to the stub's post push entry point
  //                  unless it is null
  //
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // Output:
  //   r0        - always 0 (when the copy is done by this stub)
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   nopush_entry, when not null, is set to this stub's post push
  //   entry point so it can be used by some other conjoint copy method
  //
  // Returns the start address of the stub.
  //
  address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
    int size;                 // element size in bytes
    bool aligned;             // passed to copy_memory as is_aligned
    bool is_oop;              // elements are oops, GC barriers are needed
    bool dest_uninitialized;  // adds IS_DEST_UNINITIALIZED to the decorators
    switch (stub_id) {
    case StubId::stubgen_jbyte_arraycopy_id:
      size = sizeof(jbyte);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_jbyte_arraycopy_id:
      size = sizeof(jbyte);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_jshort_arraycopy_id:
      size = sizeof(jshort);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_jshort_arraycopy_id:
      size = sizeof(jshort);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_jint_arraycopy_id:
      size = sizeof(jint);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_jint_arraycopy_id:
      size = sizeof(jint);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_jlong_arraycopy_id:
      // since this is always aligned we can (should!) use the same
      // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
      ShouldNotReachHere();
      break;
    case StubId::stubgen_arrayof_jlong_arraycopy_id:
      size = sizeof(jlong);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    // All four oop variants share the same element size and alignment,
    // both of which depend on UseCompressedOops; they differ only in
    // dest_uninitialized.
    case StubId::stubgen_oop_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_oop_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_oop_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
    }
    // only some conjoint stubs generate a 2nd entry
    int entry_count = StubInfo::entry_count(stub_id);
    int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
    assert(entry_count == expected_entry_count,
           "expected entry count %d does not match declared entry count %d for stub %s",
           expected_entry_count, entry_count, StubInfo::name(stub_id));

    // We need to protect memory accesses in certain cases
    // (non-oop copies that are either not aligned or jlong-sized)
    bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
    // NOTE(review): presumably each handler occupies COLUMN_COUNT
    // addresses in the extras list -- confirm against UnsafeMemoryAccess.
    int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
    GrowableArray<address> entries;
    GrowableArray<address> extras;
    GrowableArray<address> *entries_ptr = (nopush_entry != nullptr ? &entries : nullptr);
    GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
    // If the stub can be loaded from the archive there is nothing to
    // generate: just publish the saved entry and handler addresses.
    address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
    if (start != nullptr) {
      assert(entries.length() == expected_entry_count - 1,
             "unexpected entries count %d", entries.length());
      assert(extras.length() == extra_count,
             "unexpected extra count %d", extras.length());
      if (nopush_entry != nullptr) {
        *nopush_entry = entries.at(0);
      }
      if (add_extras) {
        // register one handler at offset 0
        register_unsafe_access_handlers(extras, 0, 1);
      }
      return start;
    }

    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    // NOTE(review): unlike generate_disjoint_copy, no
    // __ align(CodeEntryAlignment) is emitted before the stub start
    // here -- confirm that this is intentional.
    StubCodeMark mark(this, stub_id);
    start = __ pc();
    __ enter();

    if (nopush_entry != nullptr) {
      // The post push entry is the address right after the frame push.
      *nopush_entry = __ pc();
      entries.append(*nopush_entry);
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Post-Push Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    // The comparison is unsigned: only when d - s is below the byte length
    // (count << log2(size)) do we fall into the backwards copy; otherwise
    // control transfers to the disjoint stub's post push entry.
    Label L_overlapping;
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::LO, L_overlapping);
    __ b(RuntimeAddress(nooverlap_target));
    __ bind(L_overlapping);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      // (d and count are needed again below for verification and the
      // barrier epilogue)
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      UnsafeMemoryAccessMark umam(this, add_extras, true);
      // A negative step requests a backwards copy.
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);

    assert(entries.length() == expected_entry_count - 1,
           "unexpected entries count %d", entries.length());

    address end = __ pc();

    if (add_extras) {
      // retrieve the registered handler addresses
      retrieve_unsafe_access_handlers(start, end, extras);
      assert(extras.length() == extra_count,
             "incorrect handlers count %d", extras.length());
    }

    // record the stub entry and end plus any no_push entry and/or
    // extra handler addresses
    store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);

    return start;
  }
 2155 
 2156   // Helper for generating a dynamic type check.
 2157   // Smashes rscratch1, rscratch2.
 2158   void generate_type_check(Register sub_klass,
 2159                            Register super_check_offset,
 2160                            Register super_klass,
 2161                            Register temp1,
 2162                            Register temp2,
 2163                            Register result,
 2164                            Label& L_success) {
 2165     assert_different_registers(sub_klass, super_check_offset, super_klass);
 2166 
 2167     BLOCK_COMMENT("type_check:");
 2168 
 2169     Label L_miss;
 2170 
 2171     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 2172                                      super_check_offset);
 2173     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 2174 
 2175     // Fall through on failure!
 2176     __ BIND(L_miss);
 2177   }
 2178 
 2179   //
 2180   //  Generate checkcasting array copy stub
 2181   //
 2182   //  Input:
 2183   //    c_rarg0   - source array address
 2184   //    c_rarg1   - destination array address
 2185   //    c_rarg2   - element count, treated as ssize_t, can be zero
 2186   //    c_rarg3   - size_t ckoff (super_check_offset)
 2187   //    c_rarg4   - oop ckval (super_klass)
 2188   //
 2189   //  Output:
 2190   //    r0 ==  0  -  success
 2191   //    r0 == -1^K - failure, where K is partial transfer count
 2192   //
  address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
    // Generates the oop arraycopy stub that type-checks each non-null
    // element against ckval before storing it into the destination.
    //
    //   stub_id      - selects the normal variant or the variant for an
    //                  uninitialized destination (which adds
    //                  IS_DEST_UNINITIALIZED to the barrier decorators).
    //   nopush_entry - when non-null, receives the address of a second
    //                  entry point that lies after the enter() frame push.
    //
    // Returns the primary entry address of the stub.
    bool dest_uninitialized;
    switch (stub_id) {
    case StubId::stubgen_checkcast_arraycopy_id:
      dest_uninitialized = false;
      break;
    case StubId::stubgen_checkcast_arraycopy_uninit_id:
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
    }

    // The normal stub provides a 2nd entry which omits the frame push
    // for use when bailing out from a disjoint copy.
    // A 2nd entry is expected exactly when the caller passes a non-null
    // nopush_entry; that must agree with the entry count declared for
    // this stub id.
    int entry_count = StubInfo::entry_count(stub_id);
    int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
    GrowableArray<address> entries;
    GrowableArray<address> *entries_ptr = (expected_entry_count == 1 ? nullptr : &entries);
    assert(entry_count == expected_entry_count,
           "expected entry count %d does not match declared entry count %d for stub %s",
           expected_entry_count, entry_count, StubInfo::name(stub_id));
    // If load_archive_data() hands back an entry address, return it (and
    // publish the extra entry it filled in, when one was requested)
    // without generating any code here.
    address start = load_archive_data(stub_id, entries_ptr);
    if (start != nullptr) {
      assert(entries.length() + 1 == expected_entry_count,
             "expected entry count %d does not match return entry count %d for stub %s",
             expected_entry_count, entries.length() + 1, StubInfo::name(stub_id));
      if (nopush_entry != nullptr) {
        *nopush_entry = entries.at(0);
      }
      return start;
    }

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    // Register set handed to the barrier set's arraycopy_prologue below.
    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);

    // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
    const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register r19_klass   = r19;       // oop._klass

    // Registers used as gc temps (r5, r6, r7 are save-on-call)
    const Register gct1 = r5, gct2 = r6, gct3 = r7;

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      __ b(L);                  // conjoint check not yet implemented
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    // The 2nd entry is recorded here, i.e. after the enter() above.
    if (nopush_entry != nullptr) {
      *nopush_entry = __ pc();
      entries.append(*nopush_entry);
      BLOCK_COMMENT("Entry:");
    }

     // Empty array:  Nothing to do.
     // (Branches past the pop below; the temps have not been pushed yet.)
    __ cbz(count, L_done);
    __ push(RegSet::of(r19, r20, r21, r22), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    // start_to is only borrowed as a temp here; it is set for real below.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
    bool is_oop = true;
    int element_size = UseCompressedOops ? 4 : 8;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
                      __ post(to, element_size), copied_oop, noreg,
                      gct1, gct2, gct3);
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);     // all elements copied: count == 0

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
                     copied_oop, noreg, __ post(from, element_size),
                     gct1);
    // A null element needs no type check; store it directly.
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop);// query the object klass

    BLOCK_COMMENT("type_check:");
    generate_type_check(/*sub_klass*/r19_klass,
                        /*super_check_offset*/ckoff,
                        /*super_klass*/ckval,
                        /*r_array_base*/gct1,
                        /*temp2*/gct2,
                        /*result*/r10, L_store_element);

    // Fall through on failure!

    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller
    // The flags still come from the subs above: EQ means K == 0, i.e.
    // nothing was stored, so the epilogue is skipped.
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r19, r20, r21, r22), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    // Result: count is 0 on the empty and fully-copied paths, and
    // -1^K on the failure path.
    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    // record the stub entry and end plus any no_push entry
    store_archive_data(stub_id, start, __ pc() , entries_ptr);
    return start;
  }
 2378 
 2379   // Perform range checks on the proposed arraycopy.
  // Kills temp, rscratch1 and the condition flags, but nothing else.
 2381   // Also, clean the sign bits of src_pos and dst_pos.
 2382   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2383                               Register src_pos, // source position (c_rarg1)
 2384                               Register dst,     // destination array oo (c_rarg2)
 2385                               Register dst_pos, // destination position (c_rarg3)
 2386                               Register length,
 2387                               Register temp,
 2388                               Label& L_failed) {
 2389     BLOCK_COMMENT("arraycopy_range_checks:");
 2390 
 2391     assert_different_registers(rscratch1, temp);
 2392 
 2393     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2394     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2395     __ addw(temp, length, src_pos);
 2396     __ cmpw(temp, rscratch1);
 2397     __ br(Assembler::HI, L_failed);
 2398 
 2399     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2400     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2401     __ addw(temp, length, dst_pos);
 2402     __ cmpw(temp, rscratch1);
 2403     __ br(Assembler::HI, L_failed);
 2404 
 2405     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2406     __ movw(src_pos, src_pos);
 2407     __ movw(dst_pos, dst_pos);
 2408 
 2409     BLOCK_COMMENT("arraycopy_range_checks done");
 2410   }
 2411 
 2412   // These stubs get called from some dumb test routine.
 2413   // I'll write them properly when they're called from
 2414   // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    // Placeholder that copies nothing: src and dst are ignored, and the
    // only check is an assertion that zero elements were requested.
    assert(count == 0, "huh?");
  }
 2418 
 2419 
 2420   //
 2421   //  Generate 'unsafe' array copy stub
 2422   //  Though just as safe as the other stubs, it takes an unscaled
 2423   //  size_t argument instead of an element count.
 2424   //
 2425   //  Input:
 2426   //    c_rarg0   - source array address
 2427   //    c_rarg1   - destination array address
 2428   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2429   //
 2430   // Examines the alignment of the operands and dispatches
 2431   // to a long, int, short, or byte copy loop.
 2432   //
 2433   address generate_unsafe_copy(address byte_copy_entry,
 2434                                address short_copy_entry,
 2435                                address int_copy_entry,
 2436                                address long_copy_entry) {
 2437     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2438     int entry_count = StubInfo::entry_count(stub_id);
 2439     assert(entry_count == 1, "sanity check");
 2440     address start = load_archive_data(stub_id);
 2441     if (start != nullptr) {
 2442       return start;
 2443     }
 2444     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2445     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2446 
 2447     __ align(CodeEntryAlignment);
 2448     StubCodeMark mark(this, stub_id);
 2449     start = __ pc();
 2450     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2451 
 2452     // bump this on entry, not on exit:
 2453     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2454 
 2455     __ orr(rscratch1, s, d);
 2456     __ orr(rscratch1, rscratch1, count);
 2457 
 2458     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2459     __ cbz(rscratch1, L_long_aligned);
 2460     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2461     __ cbz(rscratch1, L_int_aligned);
 2462     __ tbz(rscratch1, 0, L_short_aligned);
 2463     __ b(RuntimeAddress(byte_copy_entry));
 2464 
 2465     __ BIND(L_short_aligned);
 2466     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2467     __ b(RuntimeAddress(short_copy_entry));
 2468     __ BIND(L_int_aligned);
 2469     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2470     __ b(RuntimeAddress(int_copy_entry));
 2471     __ BIND(L_long_aligned);
 2472     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2473     __ b(RuntimeAddress(long_copy_entry));
 2474 
 2475     // record the stub entry and end
 2476     store_archive_data(stub_id, start, __ pc());
 2477 
 2478     return start;
 2479   }
 2480 
 2481   //
 2482   //  Generate generic array copy stubs
 2483   //
 2484   //  Input:
 2485   //    c_rarg0    -  src oop
 2486   //    c_rarg1    -  src_pos (32-bits)
 2487   //    c_rarg2    -  dst oop
 2488   //    c_rarg3    -  dst_pos (32-bits)
 2489   //    c_rarg4    -  element count (32-bits)
 2490   //
 2491   //  Output:
 2492   //    r0 ==  0  -  success
 2493   //    r0 == -1^K - failure, where K is partial transfer count
 2494   //
  address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {
    // Emits the generic arraycopy stub: it validates the arguments,
    // classifies the arrays via the klass layout helper, converts
    // (array oop, position) pairs into raw element addresses and then
    // tail-branches to one of the supplied copy entries. Any failed check
    // makes the stub return -1 in r0. Returns the stub's entry address.
    StubId stub_id = StubId::stubgen_generic_arraycopy_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // A non-null address from load_archive_data() is returned as-is and
    // nothing is emitted.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    Label L_failed, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src        = c_rarg0;  // source array oop
    const Register src_pos    = c_rarg1;  // source position
    const Register dst        = c_rarg2;  // destination array oop
    const Register dst_pos    = c_rarg3;  // destination position
    const Register length     = c_rarg4;


    // Registers used as temps
    const Register dst_klass  = c_rarg5;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not null.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    //  if (src == nullptr) return -1;
    __ cbz(src, L_failed);

    //  if (src_pos < 0) return -1;
    __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set

    //  if (dst == nullptr) return -1;
    __ cbz(dst, L_failed);

    //  if (dst_pos < 0) return -1;
    __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set

    // registers used as temp
    const Register scratch_length    = r16; // elements count to copy
    const Register scratch_src_klass = r17; // array klass
    const Register lh                = r15; // layout helper

    //  if (length < 0) return -1;
    __ movw(scratch_length, length);        // length (elements count, 32-bits value)
    __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set

    __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
    //  assert(src->klass() != nullptr);
    //  The dst klass is checked as well; L1 is the shared failure point.
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(rscratch1, dst);
      __ cbz(rscratch1, L1);     // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
    // (an exact match against the T_OBJECT array layout helper)
    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ ldrw(lh, Address(scratch_src_klass, lh_offset));
    __ movw(rscratch1, objArray_lh);
    __ eorw(rscratch2, lh, rscratch1);
    __ cbzw(rscratch2, L_objArray);

    //  if (src->klass() != dst->klass()) return -1;
    __ load_klass(rscratch2, dst);
    __ eor(rscratch2, rscratch2, scratch_src_klass);
    __ cbnz(rscratch2, L_failed);

    //  if (!src->is_Array()) return -1;
    __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert primitive array {");
      Label L;
      __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
      __ cmpw(lh, rscratch2);
      __ br(Assembler::GE, L);
      __ stop("must be a primitive array");
      __ bind(L);
      BLOCK_COMMENT("} assert primitive array done");
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register rscratch1_offset = rscratch1;    // array offset
    const Register r15_elsize = lh; // element size

    // Extract the header size field of the layout helper and add it to
    // both array oops.
    __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
           exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
    __ add(src, src, rscratch1_offset);           // src array offset
    __ add(dst, dst, rscratch1_offset);           // dst array offset
    BLOCK_COMMENT("choose copy loop based on element size");

    // next registers should be set before the jump to corresponding stub
    const Register from     = c_rarg0;  // source array address
    const Register to       = c_rarg1;  // destination array address
    const Register count    = c_rarg2;  // elements count

    // 'from', 'to', 'count' registers should be set in such order
    // since they are the same as 'src', 'src_pos', 'dst'.

    assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");

    // The possible values of elsize are 0-3, i.e. exact_log2(element
    // size in bytes).  We do a simple bitwise binary search.
    // Only bits 0 and 1 of the layout helper are tested below.
  __ BIND(L_copy_bytes);
    __ tbnz(r15_elsize, 1, L_copy_ints);
    __ tbnz(r15_elsize, 0, L_copy_shorts);
    __ lea(from, Address(src, src_pos));// src_addr
    __ lea(to,   Address(dst, dst_pos));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
    __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
    __ tbnz(r15_elsize, 0, L_copy_longs);
    __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert long copy {");
      Label L;
      __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
      __ cmpw(r15_elsize, LogBytesPerLong);
      __ br(Assembler::EQ, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
      BLOCK_COMMENT("} assert long copy done");
    }
#endif
    __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(long_copy_entry));

    // ObjArrayKlass
  __ BIND(L_objArray);
    // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    //  test array classes for subtyping
    __ load_klass(r15, dst);
    __ cmp(scratch_src_klass, r15); // usual case is exact equality
    __ br(Assembler::NE, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ movw(count, scratch_length); // length
    // L_plain_copy is also the success target of the array-klass type
    // check in the checkcast path below, which arrives here with from,
    // to and count already set up.
  __ BIND(L_plain_copy);
    __ b(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
    // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ ldrw(rscratch1, Address(r15, lh_offset));
      __ movw(rscratch2, objArray_lh);
      __ eorw(rscratch1, rscratch1, rscratch2);
      __ cbnzw(rscratch1, L_failed);

      // It is safe to examine both src.length and dst.length.
      // r15 is handed over as the temp here, so the dst klass it held is
      // overwritten and has to be reloaded afterwards.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                             r15, L_failed);

      __ load_klass(dst_klass, dst); // reload

      // Marshal the base address arguments now, freeing registers.
      __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ movw(count, length);           // length (reloaded)
      Register sco_temp = c_rarg3;      // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 dst_klass, scratch_src_klass);
      // assert_clean_int(count, sco_temp);

      // Generate the type check.
      // If the source array klass passes the check against the destination
      // array klass, the plain oop copy is used (L_plain_copy).
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(sco_temp, Address(dst_klass, sco_offset));

      // Smashes rscratch1, rscratch2
      generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
                          L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      __ ldr(dst_klass, Address(dst_klass, ek_offset));
      __ ldrw(sco_temp, Address(dst_klass, sco_offset));

      // the checkcast_copy loop needs two extra arguments:
      assert(c_rarg3 == sco_temp, "#3 already in place");
      // Set up arguments for checkcast_copy_entry.
      __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
      __ b(RuntimeAddress(checkcast_copy_entry));
    }

  __ BIND(L_failed);
    __ mov(r0, -1);
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 2767 
 2768   //
 2769   // Generate stub for array fill. If "aligned" is true, the
 2770   // "to" address is assumed to be heapword aligned.
 2771   //
 2772   // Arguments for generated stub:
 2773   //   to:    c_rarg0
 2774   //   value: c_rarg1
 2775   //   count: c_rarg2 treated as signed
 2776   //
 2777   address generate_fill(StubId stub_id) {
 2778     BasicType t;
 2779     bool aligned;
 2780 
 2781     switch (stub_id) {
 2782     case StubId::stubgen_jbyte_fill_id:
 2783       t = T_BYTE;
 2784       aligned = false;
 2785       break;
 2786     case StubId::stubgen_jshort_fill_id:
 2787       t = T_SHORT;
 2788       aligned = false;
 2789       break;
 2790     case StubId::stubgen_jint_fill_id:
 2791       t = T_INT;
 2792       aligned = false;
 2793       break;
 2794     case StubId::stubgen_arrayof_jbyte_fill_id:
 2795       t = T_BYTE;
 2796       aligned = true;
 2797       break;
 2798     case StubId::stubgen_arrayof_jshort_fill_id:
 2799       t = T_SHORT;
 2800       aligned = true;
 2801       break;
 2802     case StubId::stubgen_arrayof_jint_fill_id:
 2803       t = T_INT;
 2804       aligned = true;
 2805       break;
 2806     default:
 2807       ShouldNotReachHere();
 2808     };
 2809     int entry_count = StubInfo::entry_count(stub_id);
 2810     assert(entry_count == 1, "sanity check");
 2811     address start = load_archive_data(stub_id);
 2812     if (start != nullptr) {
 2813       return start;
 2814     }
 2815     __ align(CodeEntryAlignment);
 2816     StubCodeMark mark(this, stub_id);
 2817     start = __ pc();
 2818 
 2819     BLOCK_COMMENT("Entry:");
 2820 
 2821     const Register to        = c_rarg0;  // source array address
 2822     const Register value     = c_rarg1;  // value
 2823     const Register count     = c_rarg2;  // elements count
 2824 
 2825     const Register bz_base = r10;        // base for block_zero routine
 2826     const Register cnt_words = r11;      // temp register
 2827 
 2828     __ enter();
 2829 
 2830     Label L_fill_elements, L_exit1;
 2831 
 2832     int shift = -1;
 2833     switch (t) {
 2834       case T_BYTE:
 2835         shift = 0;
 2836         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2837         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2838         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2839         __ br(Assembler::LO, L_fill_elements);
 2840         break;
 2841       case T_SHORT:
 2842         shift = 1;
 2843         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2844         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2845         __ br(Assembler::LO, L_fill_elements);
 2846         break;
 2847       case T_INT:
 2848         shift = 2;
 2849         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2850         __ br(Assembler::LO, L_fill_elements);
 2851         break;
 2852       default: ShouldNotReachHere();
 2853     }
 2854 
 2855     // Align source address at 8 bytes address boundary.
 2856     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2857     if (!aligned) {
 2858       switch (t) {
 2859         case T_BYTE:
 2860           // One byte misalignment happens only for byte arrays.
 2861           __ tbz(to, 0, L_skip_align1);
 2862           __ strb(value, Address(__ post(to, 1)));
 2863           __ subw(count, count, 1);
 2864           __ bind(L_skip_align1);
 2865           // Fallthrough
 2866         case T_SHORT:
 2867           // Two bytes misalignment happens only for byte and short (char) arrays.
 2868           __ tbz(to, 1, L_skip_align2);
 2869           __ strh(value, Address(__ post(to, 2)));
 2870           __ subw(count, count, 2 >> shift);
 2871           __ bind(L_skip_align2);
 2872           // Fallthrough
 2873         case T_INT:
 2874           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2875           __ tbz(to, 2, L_skip_align4);
 2876           __ strw(value, Address(__ post(to, 4)));
 2877           __ subw(count, count, 4 >> shift);
 2878           __ bind(L_skip_align4);
 2879           break;
 2880         default: ShouldNotReachHere();
 2881       }
 2882     }
 2883 
 2884     //
 2885     //  Fill large chunks
 2886     //
 2887     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2888     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2889     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2890     if (UseBlockZeroing) {
 2891       Label non_block_zeroing, rest;
 2892       // If the fill value is zero we can use the fast zero_words().
 2893       __ cbnz(value, non_block_zeroing);
 2894       __ mov(bz_base, to);
 2895       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2896       address tpc = __ zero_words(bz_base, cnt_words);
 2897       if (tpc == nullptr) {
 2898         fatal("CodeCache is full at generate_fill");
 2899       }
 2900       __ b(rest);
 2901       __ bind(non_block_zeroing);
 2902       __ fill_words(to, cnt_words, value);
 2903       __ bind(rest);
 2904     } else {
 2905       __ fill_words(to, cnt_words, value);
 2906     }
 2907 
 2908     // Remaining count is less than 8 bytes. Fill it by a single store.
 2909     // Note that the total length is no less than 8 bytes.
 2910     if (t == T_BYTE || t == T_SHORT) {
 2911       Label L_exit1;
 2912       __ cbzw(count, L_exit1);
 2913       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2914       __ str(value, Address(to, -8));    // overwrite some elements
 2915       __ bind(L_exit1);
 2916       __ leave();
 2917       __ ret(lr);
 2918     }
 2919 
 2920     // Handle copies less than 8 bytes.
 2921     Label L_fill_2, L_fill_4, L_exit2;
 2922     __ bind(L_fill_elements);
 2923     switch (t) {
 2924       case T_BYTE:
 2925         __ tbz(count, 0, L_fill_2);
 2926         __ strb(value, Address(__ post(to, 1)));
 2927         __ bind(L_fill_2);
 2928         __ tbz(count, 1, L_fill_4);
 2929         __ strh(value, Address(__ post(to, 2)));
 2930         __ bind(L_fill_4);
 2931         __ tbz(count, 2, L_exit2);
 2932         __ strw(value, Address(to));
 2933         break;
 2934       case T_SHORT:
 2935         __ tbz(count, 0, L_fill_4);
 2936         __ strh(value, Address(__ post(to, 2)));
 2937         __ bind(L_fill_4);
 2938         __ tbz(count, 1, L_exit2);
 2939         __ strw(value, Address(to));
 2940         break;
 2941       case T_INT:
 2942         __ cbzw(count, L_exit2);
 2943         __ strw(value, Address(to));
 2944         break;
 2945       default: ShouldNotReachHere();
 2946     }
 2947     __ bind(L_exit2);
 2948     __ leave();
 2949     __ ret(lr);
 2950 
 2951     // record the stub entry and end
 2952     store_archive_data(stub_id, start, __ pc());
 2953 
 2954     return start;
 2955   }
 2956 
 2957   address generate_unsafecopy_common_error_exit() {
 2958     StubId stub_id = StubId::stubgen_unsafecopy_common_id;
 2959     int entry_count = StubInfo::entry_count(stub_id);
 2960     assert(entry_count == 1, "sanity check");
 2961     address start = load_archive_data(stub_id);
 2962     if (start != nullptr) {
 2963       return start;
 2964     }
 2965     __ align(CodeEntryAlignment);
 2966     StubCodeMark mark(this, stub_id);
 2967     start = __ pc();
 2968       __ leave();
 2969       __ mov(r0, 0);
 2970       __ ret(lr);
 2971 
 2972     // record the stub entry and end
 2973     store_archive_data(stub_id, start, __ pc());
 2974 
 2975     return start;
 2976   }
 2977 
 2978   //
 2979   //  Generate 'unsafe' set memory stub
 2980   //  Though just as safe as the other stubs, it takes an unscaled
 2981   //  size_t (# bytes) argument instead of an element count.
 2982   //
 2983   //  This fill operation is atomicity preserving: as long as the
 2984   //  address supplied is sufficiently aligned, all writes of up to 64
 2985   //  bits in size are single-copy atomic.
 2986   //
 2987   //  Input:
 2988   //    c_rarg0   - destination array address
 2989   //    c_rarg1   - byte count (size_t)
 2990   //    c_rarg2   - byte value
 2991   //
 2992   address generate_unsafe_setmemory() {
          // Emits the byte-fill stub described in the header comment above and
          // returns its entry address. The generated code is bracketed by an
          // UnsafeMemoryAccessMark; the handler addresses that mark produces are
          // collected in 'extras' and archived together with the stub.
 2993     StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
 2994     int entry_count = StubInfo::entry_count(stub_id);
 2995     assert(entry_count == 1, "sanity check");
 2996     // we expect one set of extra unsafememory access handler entries
 2997     GrowableArray<address> extras;
 2998     int extra_count =  1 * UnsafeMemoryAccess::COLUMN_COUNT;
 2999     address start = load_archive_data(stub_id, nullptr, &extras);
 3000     if (start != nullptr) {
            // archived stub found: re-register its saved handler entries and
            // skip code generation entirely
 3001       assert(extras.length() == extra_count,
 3002              "unexpected extra entry count %d", extras.length());
 3003       register_unsafe_access_handlers(extras, 0, 1);
 3004       return start;
 3005     }
 3006 
 3007     __ align(CodeEntryAlignment);
 3008     StubCodeMark mark(this, stub_id);
 3009     start = __ pc();
 3010 
 3011     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 3012     Label tail;
 3013 
 3014     {
 3015     UnsafeMemoryAccessMark umam(this, true, false);
 3016 
 3017     __ enter(); // required for proper stackwalking of RuntimeStub frame
 3018 
          // replicate the fill byte into all 16 byte lanes of v0
 3019     __ dup(v0, __ T16B, value);
 3020 
 3021     if (AvoidUnalignedAccesses) {
            // Fewer than 16 bytes: go straight to the tail stores.
 3022       __ cmp(count, (u1)16);
 3023       __ br(__ LO, tail);
 3024 
            // Otherwise write one 16-byte vector at the (possibly unaligned)
            // start, then advance dest to the next 16-byte boundary. When dest
            // is already aligned (dest & 15 == 0) the advance is a full 16.
 3025       __ mov(rscratch1, 16);
 3026       __ andr(rscratch2, dest, 15);
 3027       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 3028       __ strq(v0, Address(dest));
 3029       __ sub(count, count, rscratch1);
 3030       __ add(dest, dest, rscratch1);
 3031     }
 3032 
          // Bias count by -64 so the flags from subs directly say whether a
          // full 64-byte chunk remains.
 3033     __ subs(count, count, (u1)64);
 3034     __ br(__ LO, tail);
 3035     {
            // main loop: two stpq stores = 64 bytes per iteration
 3036       Label again;
 3037       __ bind(again);
 3038       __ stpq(v0, v0, Address(dest));
 3039       __ stpq(v0, v0, Address(dest, 32));
 3040 
 3041       __ subs(count, count, 64);
 3042       __ add(dest, dest, 64);
 3043       __ br(__ HS, again);
 3044     }
 3045 
 3046     __ bind(tail);
 3047     // The count of bytes is off by 64, but we don't need to correct
 3048     // it because we're only going to use the least-significant few
 3049     // count bits from here on.
 3050     // __ add(count, count, 64);
 3051 
          // Each block below tests one bit of count (32, 16, 8, ...) and, when
          // set, stores that many bytes and post-increments dest.
 3052     {
 3053       Label dont;
 3054       __ tbz(count, exact_log2(32), dont);
 3055       __ stpq(v0, v0, __ post(dest, 32));
 3056       __ bind(dont);
 3057     }
 3058     {
 3059       Label dont;
 3060       __ tbz(count, exact_log2(16), dont);
 3061       __ strq(v0, __ post(dest, 16));
 3062       __ bind(dont);
 3063     }
 3064     {
 3065       Label dont;
 3066       __ tbz(count, exact_log2(8), dont);
 3067       __ strd(v0, __ post(dest, 8));
 3068       __ bind(dont);
 3069     }
 3070 
          // early out when none of the low three count bits is set
 3071     Label finished;
 3072     __ tst(count, 7);
 3073     __ br(__ EQ, finished);
 3074 
 3075     {
 3076       Label dont;
 3077       __ tbz(count, exact_log2(4), dont);
 3078       __ strs(v0, __ post(dest, 4));
 3079       __ bind(dont);
 3080     }
 3081     {
 3082       Label dont;
 3083       __ tbz(count, exact_log2(2), dont);
            // copy bits 0..7 of value into bits 8..15 so the halfword store
            // writes the fill byte twice
 3084       __ bfi(value, value, 8, 8);
 3085       __ strh(value, __ post(dest, 2));
 3086       __ bind(dont);
 3087     }
 3088     {
 3089       Label dont;
 3090       __ tbz(count, exact_log2(1), dont);
 3091       __ strb(value, Address(dest));  // last byte; dest is not advanced
 3092       __ bind(dont);
 3093     }
 3094 
 3095     __ bind(finished);
 3096     __ leave();
 3097     __ ret(lr);
 3098     // have to exit the block and destroy the UnsafeMemoryAccessMark
 3099     // in order to retrieve the handler end address
 3100     }
 3101 
 3102     // install saved handler addresses in extras
 3103     address end = __ pc();
 3104     retrieve_unsafe_access_handlers(start, end, extras);
 3105     assert(extras.length() == extra_count,
 3106            "incorrect handlers count %d", extras.length());
 3107     // record the stub entry and end plus the extras
 3108     store_archive_data(stub_id, start, end, nullptr, &extras);
 3109 
 3110     return start;
 3111   }
 3112 
 3113   address generate_data_cache_writeback() {
          // Emits a stub that performs cache_wb() on the address passed in
          // c_rarg0 and returns; reuses an archived copy when one is available.
          // Returns the stub's entry address.
 3114     const Register line        = c_rarg0;  // address of line to write back
 3115 
 3116     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 3117     int entry_count = StubInfo::entry_count(stub_id);
 3118     assert(entry_count == 1, "sanity check");
 3119     address start = load_archive_data(stub_id);
 3120     if (start != nullptr) {
 3121       return start;
 3122     }
 3123     __ align(CodeEntryAlignment);
 3124     StubCodeMark mark(this, stub_id);
 3125 
 3126     start = __ pc();
 3127     __ enter();
 3128     __ cache_wb(Address(line, 0));
 3129     __ leave();
 3130     __ ret(lr);
 3131 
 3132     // record the stub entry and end
 3133     store_archive_data(stub_id, start, __ pc());
 3134 
 3135     return start;
 3136   }
 3137 
 3138   address generate_data_cache_writeback_sync() {
          // Emits a stub taking one argument (c_rarg0, "is_pre"): when it is
          // non-zero the stub returns immediately, otherwise it executes
          // cache_wbsync(false) first. Returns the stub's entry address.
 3139     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 3140     int entry_count = StubInfo::entry_count(stub_id);
 3141     assert(entry_count == 1, "sanity check");
 3142     address start = load_archive_data(stub_id);
 3143     if (start != nullptr) {
 3144       return start;
 3145     }
 3146     const Register is_pre     = c_rarg0;  // pre or post sync
 3147     __ align(CodeEntryAlignment);
 3148     StubCodeMark mark(this, stub_id);
 3149 
 3150     // pre wbsync is a no-op
 3151     // post wbsync translates to an sfence
          // NOTE(review): "sfence" is x86 terminology; confirm which barrier
          // cache_wbsync() actually emits on AArch64.
 3152 
 3153     Label skip;
 3154     start = __ pc();
 3155     __ enter();
 3156     __ cbnz(is_pre, skip);      // pre-sync request: nothing to do
 3157     __ cache_wbsync(false);
 3158     __ bind(skip);
 3159     __ leave();
 3160     __ ret(lr);
 3161 
 3162     // record the stub entry and end
 3163     store_archive_data(stub_id, start, __ pc());
 3164 
 3165     return start;
 3166   }
 3167 
 3168   void generate_arraycopy_stubs() {
          // Generates every array copy and fill stub and publishes each entry
          // address in the matching StubRoutines field. The order matters:
          // a stub that needs another stub's entry (common exit, bulk copy
          // helpers, nopush fallbacks) is generated after the stub it uses.
          //
 3169     // Some copy stubs publish a normal entry and then a 2nd 'fallback'
 3170     // entry immediately following their stack push. This can be used
 3171     // as a post-push branch target for compatible stubs when they
 3172     // identify a special case that can be handled by the fallback
 3173     // stub e.g. a disjoint copy stub may be used as a special case
 3174     // fallback for its compatible conjoint copy stub.
 3175     //
 3176     // A no push entry is always returned in the following local and
 3177     // then published by assigning to the appropriate entry field in
 3178     // class StubRoutines. The entry value is then passed to the
 3179     // generator for the compatible stub. That means the entry must be
 3180     // listed when saving to/restoring from the AOT cache, ensuring
 3181     // that the inter-stub jumps are noted at AOT-cache save and
 3182     // relocated at AOT cache load.
 3183     address nopush_entry;
 3184 
 3185     // generate the common exit first so later stubs can rely on it if
 3186     // they want an UnsafeMemoryAccess exit non-local to the stub
 3187     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 3188     // register the stub as the default exit with class UnsafeMemoryAccess
 3189     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 3190 
 3191     // generate and publish aarch64-specific bulk copy routines first
 3192     // so we can call them from other copy stubs
 3193     StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3194     StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3195 
 3196     StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3197     StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 3198 
 3199     StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 3200     StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 3201 
 3202     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 3203 
 3204     //*** jbyte
 3205     // Always need aligned and unaligned versions
 3206     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
 3207     // disjoint nopush entry is needed by conjoint copy
 3208     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 3209     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
 3210     // conjoint nopush entry is needed by generic/unsafe copy
 3211     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
 3212     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
 3213     // disjoint arrayof nopush entry is needed by conjoint copy
 3214     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
          // nullptr: the arrayof conjoint nopush entry is not requested here
 3215     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
 3216 
 3217     //*** jshort
 3218     // Always need aligned and unaligned versions
 3219     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
 3220     // disjoint nopush entry is needed by conjoint copy
 3221     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
 3222     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
 3223     // conjoint nopush entry is used by generic/unsafe copy
 3224     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
 3225     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
 3226     // disjoint arrayof nopush entry is needed by conjoint copy
 3227     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
 3228     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
 3229 
 3230     //*** jint
 3231     // Aligned versions
 3232     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
 3233     // disjoint arrayof nopush entry is needed by conjoint copy
 3234     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
 3235     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
 3236     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 3237     // jint_arraycopy_nopush always points to the unaligned version
 3238     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
 3239     // disjoint nopush entry is needed by conjoint copy
 3240     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
 3241     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
 3242     // conjoint nopush entry is needed by generic/unsafe copy
 3243     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
 3244 
 3245     //*** jlong
 3246     // It is always aligned
 3247     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
 3248     // disjoint arrayof nopush entry is needed by conjoint copy
 3249     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
 3250     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
 3251     // conjoint nopush entry is needed by generic/unsafe copy
 3252     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
 3253     // disjoint normal/nopush and conjoint normal entries are not
 3254     // generated since the arrayof versions are the same
 3255     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 3256     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
 3257     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 3258 
 3259     //*** oops
 3260     {
 3261       StubRoutines::_arrayof_oop_disjoint_arraycopy
 3262         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
 3263       // disjoint arrayof nopush entry is needed by conjoint copy
 3264       StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
 3265       StubRoutines::_arrayof_oop_arraycopy
 3266         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
 3267       // conjoint arrayof nopush entry is needed by generic/unsafe copy
 3268       StubRoutines::_oop_arraycopy_nopush = nopush_entry;
 3269       // Aligned versions without pre-barriers
 3270       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 3271         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
 3272       // disjoint arrayof+uninit nopush entry is needed by conjoint copy
 3273       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
 3274       // note that we don't need a returned nopush entry because the
 3275       // generic/unsafe copy does not cater for uninit arrays.
 3276       StubRoutines::_arrayof_oop_arraycopy_uninit
 3277         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
 3278     }
 3279 
 3280     // for oop copies reuse arrayof entries for non-arrayof cases
 3281     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 3282     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
 3283     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 3284     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 3285     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
 3286     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 3287 
 3288     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
 3289     // checkcast nopush entry is needed by generic copy
 3290     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
 3291     // note that we don't need a returned nopush entry because the
 3292     // generic copy does not cater for uninit arrays.
 3293     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 3294 
 3295     // unsafe arraycopy may fallback on conjoint stubs
 3296     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
 3297                                                               StubRoutines::_jshort_arraycopy_nopush,
 3298                                                               StubRoutines::_jint_arraycopy_nopush,
 3299                                                               StubRoutines::_jlong_arraycopy_nopush);
 3300 
 3301     // generic arraycopy may fallback on conjoint stubs
 3302     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
 3303                                                                StubRoutines::_jshort_arraycopy_nopush,
 3304                                                                StubRoutines::_jint_arraycopy_nopush,
 3305                                                                StubRoutines::_oop_arraycopy_nopush,
 3306                                                                StubRoutines::_jlong_arraycopy_nopush,
 3307                                                                StubRoutines::_checkcast_arraycopy_nopush);
 3308 
          // fill stubs: one per element size, in plain and arrayof flavours
 3309     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 3310     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 3311     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 3312     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 3313     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 3314     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 3315   }
 3316 
 3317   void generate_math_stubs() { Unimplemented(); }  // placeholder: body only calls Unimplemented(); no math stubs are emitted here
 3318 
 3319   // Arguments:
 3320   //
 3321   // Inputs:
 3322   //   c_rarg0   - source byte array address
 3323   //   c_rarg1   - destination byte array address
 3324   //   c_rarg2   - sessionKe (key) in little endian int array
 3325   //
 3326   address generate_aescrypt_encryptBlock() {
          // Emits the single-block AES encrypt stub (arguments as listed in the
          // header comment above) and returns its entry address. The actual
          // key loading and rounds are delegated to the aesenc_loadkeys() and
          // aesecb_encrypt() assembler helpers. The stub returns 0 in r0.
 3327     assert(UseAES, "need AES cryptographic extension support");
 3328     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 3329     int entry_count = StubInfo::entry_count(stub_id);
 3330     assert(entry_count == 1, "sanity check");
 3331     address start = load_archive_data(stub_id);
 3332     if (start != nullptr) {
 3333       return start;
 3334     }
 3335     __ align(CodeEntryAlignment);
 3336     StubCodeMark mark(this, stub_id);
 3337 
 3338     const Register from        = c_rarg0;  // source array address
 3339     const Register to          = c_rarg1;  // destination array address
 3340     const Register key         = c_rarg2;  // key array address
 3341     const Register keylen      = rscratch1;
 3342 
 3343     start = __ pc();
 3344     __ enter();
 3345 
          // key addresses the first int element, so (length_offset - base_offset)
          // is the displacement back to the array's length field
 3346     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3347 
 3348     __ aesenc_loadkeys(key, keylen);
 3349     __ aesecb_encrypt(from, to, keylen);
 3350 
 3351     __ mov(r0, 0);
 3352 
 3353     __ leave();
 3354     __ ret(lr);
 3355 
 3356     // record the stub entry and end
 3357     store_archive_data(stub_id, start, __ pc());
 3358 
 3359     return start;
 3360   }
 3361 
 3362   // Arguments:
 3363   //
 3364   // Inputs:
 3365   //   c_rarg0   - source byte array address
 3366   //   c_rarg1   - destination byte array address
 3367   //   c_rarg2   - sessionKd (key) in little endian int array
 3368   //
 3369   address generate_aescrypt_decryptBlock() {
          // Emits the single-block AES decrypt stub (arguments as listed in the
          // header comment above) and returns its entry address. The rounds are
          // delegated to the aesecb_decrypt() assembler helper. The stub
          // returns 0 in r0.
 3370     assert(UseAES, "need AES cryptographic extension support");
 3371     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 3372     int entry_count = StubInfo::entry_count(stub_id);
 3373     assert(entry_count == 1, "sanity check");
 3374     address start = load_archive_data(stub_id);
 3375     if (start != nullptr) {
 3376       return start;
 3377     }
 3378     __ align(CodeEntryAlignment);
 3379     StubCodeMark mark(this, stub_id);
 3380     Label L_doLast;  // NOTE(review): never bound or branched to in this function
 3381 
 3382     const Register from        = c_rarg0;  // source array address
 3383     const Register to          = c_rarg1;  // destination array address
 3384     const Register key         = c_rarg2;  // key array address
 3385     const Register keylen      = rscratch1;
 3386 
 3387     start = __ pc();
 3388     __ enter(); // required for proper stackwalking of RuntimeStub frame
 3389 
          // key addresses the first int element, so (length_offset - base_offset)
          // is the displacement back to the array's length field
 3390     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3391 
 3392     __ aesecb_decrypt(from, to, key, keylen);
 3393 
 3394     __ mov(r0, 0);
 3395 
 3396     __ leave();
 3397     __ ret(lr);
 3398 
 3399     // record the stub entry and end
 3400     store_archive_data(stub_id, start, __ pc());
 3401 
 3402     return start;
 3403   }
 3404 
 3405   // Arguments:
 3406   //
 3407   // Inputs:
 3408   //   c_rarg0   - source byte array address
 3409   //   c_rarg1   - destination byte array address
 3410   //   c_rarg2   - sessionKe (key) in little endian int array
 3411   //   c_rarg3   - r vector byte array address
 3412   //   c_rarg4   - input length
 3413   //
 3414   // Output:
 3415   //   r0        - input length
 3416   //
 3417   address generate_cipherBlockChaining_encryptAESCrypt() {
          // Emits the AES-CBC encrypt stub (arguments as listed in the header
          // comment above) and returns its entry address.
          //
          // Register use in the generated code:
          //   v0        - running chaining value: loaded from rvec, XORed with
          //               each input block, encrypted, stored to 'to', and
          //               finally written back to rvec
          //   v1        - current input block
          //   v17..v31  - round keys (v17..v20 only loaded for the longer keys)
          //   rscratch2 - copy of the original length, returned in r0
 3418     assert(UseAES, "need AES cryptographic extension support");
 3419     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 3420     int entry_count = StubInfo::entry_count(stub_id);
 3421     assert(entry_count == 1, "sanity check");
 3422     address start = load_archive_data(stub_id);
 3423     if (start != nullptr) {
 3424       return start;
 3425     }
 3426     __ align(CodeEntryAlignment);
 3427     StubCodeMark mark(this, stub_id);
 3428 
 3429     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3430 
 3431     const Register from        = c_rarg0;  // source array address
 3432     const Register to          = c_rarg1;  // destination array address
 3433     const Register key         = c_rarg2;  // key array address
 3434     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 3435                                            // and left with the results of the last encryption block
 3436     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3437     const Register keylen      = rscratch1;
 3438 
 3439     start = __ pc();
 3440 
 3441       __ enter();
 3442 
            // remember the incoming length; it is the stub's return value
 3443       __ movw(rscratch2, len_reg);
 3444 
 3445       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3446 
 3447       __ ld1(v0, __ T16B, rvec);
 3448 
            // Dispatch on key length (in ints): < 52 skips to the 44 case,
            // == 52 skips to the 52 case, anything larger falls through and
            // loads all round keys. The flags set here are tested again at
            // the top of L_aes_loop.
 3449       __ cmpw(keylen, 52);
 3450       __ br(Assembler::CC, L_loadkeys_44);
 3451       __ br(Assembler::EQ, L_loadkeys_52);
 3452 
            // each key load is followed by rev32 to byte-swap every 32-bit
            // word of the (little endian int) key material
 3453       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3454       __ rev32(v17, __ T16B, v17);
 3455       __ rev32(v18, __ T16B, v18);
 3456     __ BIND(L_loadkeys_52);
 3457       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3458       __ rev32(v19, __ T16B, v19);
 3459       __ rev32(v20, __ T16B, v20);
 3460     __ BIND(L_loadkeys_44);
 3461       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3462       __ rev32(v21, __ T16B, v21);
 3463       __ rev32(v22, __ T16B, v22);
 3464       __ rev32(v23, __ T16B, v23);
 3465       __ rev32(v24, __ T16B, v24);
 3466       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3467       __ rev32(v25, __ T16B, v25);
 3468       __ rev32(v26, __ T16B, v26);
 3469       __ rev32(v27, __ T16B, v27);
 3470       __ rev32(v28, __ T16B, v28);
 3471       __ ld1(v29, v30, v31, __ T16B, key);
 3472       __ rev32(v29, __ T16B, v29);
 3473       __ rev32(v30, __ T16B, v30);
 3474       __ rev32(v31, __ T16B, v31);
 3475 
          // One 16-byte block per iteration. The loop body runs before the
          // length is first tested, so a zero len_reg on entry is not
          // guarded against here (NOTE(review): presumably callers never
          // pass 0 -- confirm).
 3476     __ BIND(L_aes_loop);
 3477       __ ld1(v1, __ T16B, __ post(from, 16));
 3478       __ eor(v0, __ T16B, v0, v1);     // chain: input block ^ previous result
 3479 
            // Re-use the condition flags from the cmpw(keylen, 52) above;
            // nothing emitted since then is a flag-setting instruction form.
 3480       __ br(Assembler::CC, L_rounds_44);
 3481       __ br(Assembler::EQ, L_rounds_52);
 3482 
 3483       __ aese(v0, v17); __ aesmc(v0, v0);
 3484       __ aese(v0, v18); __ aesmc(v0, v0);
 3485     __ BIND(L_rounds_52);
 3486       __ aese(v0, v19); __ aesmc(v0, v0);
 3487       __ aese(v0, v20); __ aesmc(v0, v0);
 3488     __ BIND(L_rounds_44);
 3489       __ aese(v0, v21); __ aesmc(v0, v0);
 3490       __ aese(v0, v22); __ aesmc(v0, v0);
 3491       __ aese(v0, v23); __ aesmc(v0, v0);
 3492       __ aese(v0, v24); __ aesmc(v0, v0);
 3493       __ aese(v0, v25); __ aesmc(v0, v0);
 3494       __ aese(v0, v26); __ aesmc(v0, v0);
 3495       __ aese(v0, v27); __ aesmc(v0, v0);
 3496       __ aese(v0, v28); __ aesmc(v0, v0);
 3497       __ aese(v0, v29); __ aesmc(v0, v0);
 3498       __ aese(v0, v30);                // last round: no aesmc
 3499       __ eor(v0, __ T16B, v0, v31);    // final round-key XOR
 3500 
 3501       __ st1(v0, __ T16B, __ post(to, 16));
 3502 
 3503       __ subw(len_reg, len_reg, 16);
 3504       __ cbnzw(len_reg, L_aes_loop);
 3505 
            // write the last output block back to rvec
 3506       __ st1(v0, __ T16B, rvec);
 3507 
 3508       __ mov(r0, rscratch2);
 3509 
 3510       __ leave();
 3511       __ ret(lr);
 3512 
 3513       // record the stub entry and end
 3514       store_archive_data(stub_id, start, __ pc());
 3515 
 3516       return start;
 3517   }
 3518 
 3519   // Arguments:
 3520   //
 3521   // Inputs:
 3522   //   c_rarg0   - source byte array address
 3523   //   c_rarg1   - destination byte array address
 3524   //   c_rarg2   - sessionKd (key) in little endian int array
 3525   //   c_rarg3   - r vector byte array address
 3526   //   c_rarg4   - input length
 3527   //
 3528   // Output:
 3529   //   r0        - input length
 3530   //
 3531   address generate_cipherBlockChaining_decryptAESCrypt() {
          // Emits the AES-CBC decrypt stub (arguments as listed in the header
          // comment above) and returns its entry address.
          //
          // Register use in the generated code:
          //   v0        - block being decrypted
          //   v1        - saved copy of the current input block
          //   v2        - previous input block (initially loaded from rvec);
          //               XORed into each decrypted block and written back to
          //               rvec at the end
          //   v31       - first 16 bytes of the key array, applied last
          //   v17..v30  - remaining round keys (v17..v20 only for longer keys)
          //   rscratch2 - copy of the original length, returned in r0
 3532     assert(UseAES, "need AES cryptographic extension support");
 3533     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3534     int entry_count = StubInfo::entry_count(stub_id);
 3535     assert(entry_count == 1, "sanity check");
 3536     address start = load_archive_data(stub_id);
 3537     if (start != nullptr) {
 3538       return start;
 3539     }
 3540     __ align(CodeEntryAlignment);
 3541     StubCodeMark mark(this, stub_id);
 3542 
 3543     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3544 
 3545     const Register from        = c_rarg0;  // source array address
 3546     const Register to          = c_rarg1;  // destination array address
 3547     const Register key         = c_rarg2;  // key array address
 3548     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 3549                                            // and left with the last input (ciphertext) block
 3550     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3551     const Register keylen      = rscratch1;
 3552 
 3553     start = __ pc();
 3554 
 3555       __ enter();
 3556 
            // remember the incoming length; it is the stub's return value
 3557       __ movw(rscratch2, len_reg);
 3558 
 3559       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3560 
 3561       __ ld1(v2, __ T16B, rvec);
 3562 
 3563       __ ld1(v31, __ T16B, __ post(key, 16));
 3564       __ rev32(v31, __ T16B, v31);
 3565 
            // Dispatch on key length (in ints): < 52 skips to the 44 case,
            // == 52 skips to the 52 case, anything larger falls through and
            // loads all round keys. The flags set here are tested again at
            // the top of L_aes_loop.
 3566       __ cmpw(keylen, 52);
 3567       __ br(Assembler::CC, L_loadkeys_44);
 3568       __ br(Assembler::EQ, L_loadkeys_52);
 3569 
 3570       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3571       __ rev32(v17, __ T16B, v17);
 3572       __ rev32(v18, __ T16B, v18);
 3573     __ BIND(L_loadkeys_52);
 3574       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3575       __ rev32(v19, __ T16B, v19);
 3576       __ rev32(v20, __ T16B, v20);
 3577     __ BIND(L_loadkeys_44);
 3578       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3579       __ rev32(v21, __ T16B, v21);
 3580       __ rev32(v22, __ T16B, v22);
 3581       __ rev32(v23, __ T16B, v23);
 3582       __ rev32(v24, __ T16B, v24);
 3583       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3584       __ rev32(v25, __ T16B, v25);
 3585       __ rev32(v26, __ T16B, v26);
 3586       __ rev32(v27, __ T16B, v27);
 3587       __ rev32(v28, __ T16B, v28);
 3588       __ ld1(v29, v30, __ T16B, key);
 3589       __ rev32(v29, __ T16B, v29);
 3590       __ rev32(v30, __ T16B, v30);
 3591 
          // One 16-byte block per iteration; the body runs before the length
          // is first tested.
 3592     __ BIND(L_aes_loop);
 3593       __ ld1(v0, __ T16B, __ post(from, 16));
 3594       __ orr(v1, __ T16B, v0, v0);     // v1 = copy of the input block (orr x,x is a move)
 3595 
            // Re-use the condition flags from the cmpw(keylen, 52) above;
            // nothing emitted since then is a flag-setting instruction form.
 3596       __ br(Assembler::CC, L_rounds_44);
 3597       __ br(Assembler::EQ, L_rounds_52);
 3598 
 3599       __ aesd(v0, v17); __ aesimc(v0, v0);
 3600       __ aesd(v0, v18); __ aesimc(v0, v0);
 3601     __ BIND(L_rounds_52);
 3602       __ aesd(v0, v19); __ aesimc(v0, v0);
 3603       __ aesd(v0, v20); __ aesimc(v0, v0);
 3604     __ BIND(L_rounds_44);
 3605       __ aesd(v0, v21); __ aesimc(v0, v0);
 3606       __ aesd(v0, v22); __ aesimc(v0, v0);
 3607       __ aesd(v0, v23); __ aesimc(v0, v0);
 3608       __ aesd(v0, v24); __ aesimc(v0, v0);
 3609       __ aesd(v0, v25); __ aesimc(v0, v0);
 3610       __ aesd(v0, v26); __ aesimc(v0, v0);
 3611       __ aesd(v0, v27); __ aesimc(v0, v0);
 3612       __ aesd(v0, v28); __ aesimc(v0, v0);
 3613       __ aesd(v0, v29); __ aesimc(v0, v0);
 3614       __ aesd(v0, v30);                // last round: no aesimc
 3615       __ eor(v0, __ T16B, v0, v31);    // final round-key XOR
 3616       __ eor(v0, __ T16B, v0, v2);     // un-chain: XOR with previous input block
 3617 
 3618       __ st1(v0, __ T16B, __ post(to, 16));
 3619       __ orr(v2, __ T16B, v1, v1);     // this input block chains into the next one
 3620 
 3621       __ subw(len_reg, len_reg, 16);
 3622       __ cbnzw(len_reg, L_aes_loop);
 3623 
            // write the last input block back to rvec
 3624       __ st1(v2, __ T16B, rvec);
 3625 
 3626       __ mov(r0, rscratch2);
 3627 
 3628       __ leave();
 3629       __ ret(lr);
 3630 
 3631     // record the stub entry and end
 3632     store_archive_data(stub_id, start, __ pc());
 3633 
 3634     return start;
 3635   }
 3636 
 3637   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3638   // Inputs: 128-bits. in is preserved.
 3639   // The least-significant 64-bit word is in the upper dword of each vector.
 3640   // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
 3641   // Output: result
 3642   void be_add_128_64(FloatRegister result, FloatRegister in,
 3643                      FloatRegister inc, FloatRegister tmp) {
          // Emits four vector instructions computing result = in + inc with the
          // carry out of the least-significant dword propagated into the
          // most-significant dword. tmp is clobbered. Note that the assert
          // below does not constrain 'in' against the other registers.
 3644     assert_different_registers(result, tmp, inc);
 3645 
 3646     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3647                                            // input
 3648     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing: a lane becomes
                                                 // all-ones (-1) where inc > result (unsigned)
 3649     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3650                                            // MSD == 0 (must be!) to LSD
 3651     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3652   }
 3653 
  // CTR AES crypt.
  //
  // Generates the stub that XORs the input with the AES-encrypted counter
  // stream. Groups of bulk_width whole blocks go through the wide
  // CTR_large_block path, remaining whole blocks are processed one at a
  // time, and leftover bytes are processed bytewise from the saved encrypted
  // counter.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - sessionKe (key) in little endian int array
  //   c_rarg3   - counter vector byte array address
  //   c_rarg4   - input length
  //   c_rarg5   - saved encryptedCounter start
  //   c_rarg6   - saved used length (address of the int: it is loaded on
  //               entry and written back on exit)
  //
  // Output:
  //   r0       - input length
  //
  address generate_counterMode_AESCrypt() {
    StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // If load_archive_data() already supplies an entry for this stub, return
    // it and generate nothing.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    const Register in = c_rarg0;
    const Register out = c_rarg1;
    const Register key = c_rarg2;
    const Register counter = c_rarg3;
    // saved_len keeps the original length for the return value; len is the
    // working copy that counts down.
    const Register saved_len = c_rarg4, len = r10;
    const Register saved_encrypted_ctr = c_rarg5;
    // used_ptr is the address of the 'used' int; used is its working copy.
    const Register used_ptr = c_rarg6, used = r12;

    // Running byte offset applied to both in and out.
    const Register offset = r7;
    const Register keylen = r11;

    const unsigned char block_size = 16;
    const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8. 8 gives slightly faster
    // performance with larger data sizes, but it also means that the
    // fast path isn't used until you have at least 8 blocks, and up
    // to 127 bytes of data will be executed on the slow path. For
    // that reason, and also so as not to blow away too much icache, 4
    // blocks seems like a sensible compromise.

    // Algorithm:
    //
    //    if (len == 0) {
    //        goto DONE;
    //    }
    //    int result = len;
    //    do {
    //        if (used >= blockSize) {
    //            if (len >= bulk_width * blockSize) {
    //                CTR_large_block();
    //                if (len == 0)
    //                    goto DONE;
    //            }
    //            for (;;) {
    //                16ByteVector v0 = counter;
    //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
    //                used = 0;
    //                if (len < blockSize)
    //                    break;    /* goto NEXT */
    //                16ByteVector v1 = load16Bytes(in, offset);
    //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(out, offset);
    //                used = blockSize;
    //                offset += blockSize;
    //                len -= blockSize;
    //                if (len == 0)
    //                    goto DONE;
    //            }
    //        }
    //      NEXT:
    //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
    //        len--;
    //    } while (len != 0);
    //  DONE:
    //    return result;
    //
    // CTR_large_block()
    //    Wide bulk encryption of whole blocks.

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();
    __ enter();

    Label DONE, CTR_large_block, large_block_return;
    // used is loaded before the zero-length check because DONE stores it back
    // unconditionally.
    __ ldrw(used, Address(used_ptr));
    __ cbzw(saved_len, DONE);

    __ mov(len, saved_len);
    __ mov(offset, 0);

    // Compute #rounds for AES based on the length of the key array.
    // The length word is addressed relative to key using the difference
    // between the array length offset and the int-array base offset.
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesenc_loadkeys(key, keylen);

    {
      Label L_CTR_loop, NEXT;

      __ bind(L_CTR_loop);

      // While unused bytes of the saved encrypted counter remain
      // (used < block_size), consume them bytewise at NEXT.
      __ cmp(used, block_size);
      __ br(__ LO, NEXT);

      // Maybe we have a lot of data: take the bulk path when
      // len >= bulk_width * block_size. The bulk path jumps back to
      // large_block_return.
      __ subsw(rscratch1, len, bulk_width * block_size);
      __ br(__ HS, CTR_large_block);
      __ BIND(large_block_return);
      __ cbzw(len, DONE);

      // Setup the counter
      __ movi(v4, __ T4S, 0);
      __ movi(v5, __ T4S, 1);
      __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }

      // 128-bit big-endian increment: the counter is byte-reversed with rev64,
      // incremented, reversed back and stored; v0 keeps the value that was
      // loaded.
      __ ld1(v0, __ T16B, counter);
      __ rev64(v16, __ T16B, v0);
      be_add_128_64(v16, v16, v4, /*tmp*/v5);
      __ rev64(v16, __ T16B, v16);
      __ st1(v16, __ T16B, counter);
      // Previous counter value is in v0
      // v4 contains { 0, 1 }

      {
        // We have fewer than bulk_width blocks of data left. Encrypt
        // them one by one until there is less than a full block
        // remaining, being careful to save both the encrypted counter
        // and the counter.

        Label inner_loop;
        __ bind(inner_loop);
        // Counter to encrypt is in v0. The encrypted counter is then stored
        // from v0 to saved_encrypted_ctr so the bytewise path can read it.
        __ aesecb_encrypt(noreg, noreg, keylen);
        __ st1(v0, __ T16B, saved_encrypted_ctr);

        // Do we have a remaining full block?

        // A freshly encrypted counter has no bytes consumed yet.
        __ mov(used, 0);
        __ cmp(len, block_size);
        __ br(__ LO, NEXT);

        // Yes, we have a full block
        __ ldrq(v1, Address(in, offset));
        __ eor(v1, __ T16B, v1, v0);
        __ strq(v1, Address(out, offset));
        // The whole encrypted counter has now been consumed.
        __ mov(used, block_size);
        __ add(offset, offset, block_size);

        __ subw(len, len, block_size);
        __ cbzw(len, DONE);

        // Increment the counter, store it back.
        // v16 holds the counter value that was stored last; it becomes the
        // next value to encrypt (copied to v0) before being incremented.
        __ orr(v0, __ T16B, v16, v16);
        __ rev64(v16, __ T16B, v16);
        be_add_128_64(v16, v16, v4, /*tmp*/v5);
        __ rev64(v16, __ T16B, v16);
        __ st1(v16, __ T16B, counter); // Save the incremented counter back

        __ b(inner_loop);
      }

      __ BIND(NEXT);

      // Encrypt a single byte, and loop.
      // We expect this to be a rare event.
      // out[offset] = in[offset] ^ saved_encrypted_ctr[used]
      __ ldrb(rscratch1, Address(in, offset));
      __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
      __ eor(rscratch1, rscratch1, rscratch2);
      __ strb(rscratch1, Address(out, offset));
      __ add(offset, offset, 1);
      __ add(used, used, 1);
      __ subw(len, len,1);
      __ cbnzw(len, L_CTR_loop);
    }

    __ bind(DONE);
    // Write the working copy of 'used' back and return the original length.
    __ strw(used, Address(used_ptr));
    __ mov(r0, saved_len);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    // Bulk encryption
    // Out-of-line code, reached only by the branch in the main loop; it
    // returns by branching to large_block_return.

    __ BIND (CTR_large_block);
    assert(bulk_width == 4 || bulk_width == 8, "must be");

    // v8..v11 (and v12..v15 when bulk_width == 8) receive the input data
    // below, so their contents are saved on the stack first.
    if (bulk_width == 8) {
      __ sub(sp, sp, 4 * 16);
      __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    }
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
    // in, out and len are modified by the bulk loop; they are restored by the
    // pop below, together with the other registers of the set.
    RegSet saved_regs = (RegSet::of(in, out, offset)
                         + RegSet::of(saved_encrypted_ctr, used_ptr, len));
    __ push(saved_regs, sp);
    __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
    // The bulk loop uses post-incremented pointers instead of base + offset.
    __ add(in, in, offset);
    __ add(out, out, offset);

    // Keys should already be loaded into the correct registers

    __ ld1(v0, __ T16B, counter); // v0 contains the first counter
    __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter

    // AES/CTR loop
    {
      Label L_CTR_loop;
      __ BIND(L_CTR_loop);

      // Setup the counters
      // v8/v9 are rebuilt on every iteration because they are overwritten by
      // the input load further down.
      __ movi(v8, __ T4S, 0);
      __ movi(v9, __ T4S, 1);
      __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }

      // Materialize bulk_width consecutive counter values in v0, v1, ...
      // (reversed back from v16), advancing v16 by one after each.
      for (int i = 0; i < bulk_width; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        __ rev64(v0_ofs, __ T16B, v16);
        be_add_128_64(v16, v16, v8, /*tmp*/v9);
      }

      // Load the first four input blocks.
      __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

      // Encrypt the counters
      __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);

      if (bulk_width == 8) {
        __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
      }

      // XOR the encrypted counters with the inputs
      for (int i = 0; i < bulk_width; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
        __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
      }

      // Write the encrypted data
      __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
      if (bulk_width == 8) {
        __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
      }

      // len was rounded down to a multiple of 16 * bulk_width above, so it
      // reaches exactly zero.
      __ subw(len, len, 16 * bulk_width);
      __ cbnzw(len, L_CTR_loop);
    }

    // Save the counter back where it goes
    __ rev64(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);

    // Restores in, out, offset, saved_encrypted_ctr, used_ptr and len to
    // their values from before the bulk loop.
    __ pop(saved_regs, sp);

    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    if (bulk_width == 8) {
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
    }

    // rscratch1 = number of bytes the bulk loop processed; account for them
    // in len and offset.
    __ andr(rscratch1, len, -16 * bulk_width);
    __ sub(len, len, rscratch1);
    __ add(offset, offset, rscratch1);
    // Mark the saved encrypted counter as fully consumed (used == 16) and
    // publish that through used_ptr.
    __ mov(used, 16);
    __ strw(used, Address(used_ptr));
    __ b(large_block_return);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 3928 
  // Vector AES Galois Counter Mode implementation.
  //
  // The generated stub first runs AES/CTR over len rounded down to a multiple
  // of 128 bytes (8 blocks per iteration) and then runs GHASH over the same
  // number of blocks read from ct. Parameters:
  //
  // in = c_rarg0
  // len = c_rarg1
  // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
  // out = c_rarg3
  // key = c_rarg4
  // state = c_rarg5 - GHASH.state
  // subkeyHtbl = c_rarg6 - powers of H
  // counter = c_rarg7 - 16 bytes of CTR
  // return - number of processed bytes
  address generate_galoisCounterMode_AESCrypt() {
    Label ghash_polynomial; // local data generated after code
    StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // If load_archive_data() already supplies an entry for this stub, return
    // it and generate nothing.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();
    __ enter();

    const Register in = c_rarg0;
    const Register len = c_rarg1;
    const Register ct = c_rarg2;
    const Register out = c_rarg3;

    const Register key = c_rarg4;
    const Register state = c_rarg5;

    const Register subkeyHtbl = c_rarg6;

    // counter is read at the start and updated with the incremented counter
    // in the end
    const Register counter = c_rarg7;

    const Register keylen = r10;
    // Save state before entering routine: v8..v15 are all used as data
    // registers by the loop below.
    __ sub(sp, sp, 4 * 16);
    __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));

    __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
    // Keep the rounded length on the stack: it is the return value and is
    // also reloaded as the GHASH block count.
    __ str(len, __ pre(sp, -2 * wordSize));

    Label DONE;
    // Less than 8 blocks of input: nothing is processed and 0 is returned.
    __ cbz(len, DONE);

    // Compute #rounds for AES based on the length of the key array
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesenc_loadkeys(key, keylen);
    __ ld1(v0, __ T16B, counter); // v0 contains the first counter
    __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter

    // AES/CTR loop
    {
      Label L_CTR_loop;
      __ BIND(L_CTR_loop);

      // Setup the counters
      // v8/v9 are rebuilt on every iteration because they are overwritten by
      // the input load further down.
      __ movi(v8, __ T4S, 0);
      __ movi(v9, __ T4S, 1);
      __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }

      // Materialize eight consecutive counter values in v0..v7. Only the
      // last 32-bit lane is incremented (T4S add of { 0, 0, 0, 1 }), so no
      // carry reaches the other lanes.
      assert(v0->encoding() < v8->encoding(), "");
      for (int i = v0->encoding(); i < v8->encoding(); i++) {
        FloatRegister f = as_FloatRegister(i);
        __ rev32(f, __ T16B, v16);
        __ addv(v16, __ T4S, v16, v8);
      }

      // Load the first four input blocks.
      __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

      // Encrypt the counters
      __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);

      // Load the remaining four input blocks.
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));

      // XOR the encrypted counters with the inputs
      for (int i = 0; i < 8; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
        __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
      }
      __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
      __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));

      // len is a multiple of 16 * 8, so it reaches exactly zero.
      __ subw(len, len, 16 * 8);
      __ cbnzw(len, L_CTR_loop);
    }

    // Store the incremented counter back in its original byte order.
    __ rev32(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);

    // Reload the rounded byte count saved on the stack.
    __ ldr(len, Address(sp));
    __ lsr(len, len, exact_log2(16));  // We want the count of blocks

    // GHASH/CTR loop
    __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
                                len, /*unrolls*/4);

    // Debug builds stop if len is not zero after the GHASH call.
#ifdef ASSERT
    { Label L;
      __ cmp(len, (unsigned char)0);
      __ br(Assembler::EQ, L);
      __ stop("stubGenerator: abort");
      __ bind(L);
  }
#endif

  __ bind(DONE);
    // Return the number of bytes processed
    __ ldr(r0, __ post(sp, 2 * wordSize));

    // Restore v8..v15 in the reverse order of the saves above.
    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    // bind label and generate polynomial data
    __ align(wordSize * 2);
    __ bind(ghash_polynomial);
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 4068 
 4069   class Cached64Bytes {
 4070   private:
 4071     MacroAssembler *_masm;
 4072     Register _regs[8];
 4073 
 4074   public:
 4075     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 4076       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size());
 4077       auto it = rs.begin();
 4078       for (auto &r: _regs) {
 4079         r = *it;
 4080         ++it;
 4081       }
 4082     }
 4083 
 4084     void gen_loads(Register base) {
 4085       for (int i = 0; i < 8; i += 2) {
 4086         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 4087       }
 4088     }
 4089 
 4090     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 4091     void extract_u32(Register dest, int i) {
 4092       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 4093     }
 4094   };
 4095 
 4096   // Utility routines for md5.
 4097   // Clobbers r10 and r11.
 4098   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4099               int k, int s, int t) {
 4100     Register rscratch3 = r10;
 4101     Register rscratch4 = r11;
 4102 
 4103     __ eorw(rscratch3, r3, r4);
 4104     __ movw(rscratch2, t);
 4105     __ andw(rscratch3, rscratch3, r2);
 4106     __ addw(rscratch4, r1, rscratch2);
 4107     reg_cache.extract_u32(rscratch1, k);
 4108     __ eorw(rscratch3, rscratch3, r4);
 4109     __ addw(rscratch4, rscratch4, rscratch1);
 4110     __ addw(rscratch3, rscratch3, rscratch4);
 4111     __ rorw(rscratch2, rscratch3, 32 - s);
 4112     __ addw(r1, rscratch2, r2);
 4113   }
 4114 
 4115   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4116               int k, int s, int t) {
 4117     Register rscratch3 = r10;
 4118     Register rscratch4 = r11;
 4119 
 4120     reg_cache.extract_u32(rscratch1, k);
 4121     __ movw(rscratch2, t);
 4122     __ addw(rscratch4, r1, rscratch2);
 4123     __ addw(rscratch4, rscratch4, rscratch1);
 4124     __ bicw(rscratch2, r3, r4);
 4125     __ andw(rscratch3, r2, r4);
 4126     __ addw(rscratch2, rscratch2, rscratch4);
 4127     __ addw(rscratch2, rscratch2, rscratch3);
 4128     __ rorw(rscratch2, rscratch2, 32 - s);
 4129     __ addw(r1, rscratch2, r2);
 4130   }
 4131 
 4132   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4133               int k, int s, int t) {
 4134     Register rscratch3 = r10;
 4135     Register rscratch4 = r11;
 4136 
 4137     __ eorw(rscratch3, r3, r4);
 4138     __ movw(rscratch2, t);
 4139     __ addw(rscratch4, r1, rscratch2);
 4140     reg_cache.extract_u32(rscratch1, k);
 4141     __ eorw(rscratch3, rscratch3, r2);
 4142     __ addw(rscratch4, rscratch4, rscratch1);
 4143     __ addw(rscratch3, rscratch3, rscratch4);
 4144     __ rorw(rscratch2, rscratch3, 32 - s);
 4145     __ addw(r1, rscratch2, r2);
 4146   }
 4147 
 4148   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 4149               int k, int s, int t) {
 4150     Register rscratch3 = r10;
 4151     Register rscratch4 = r11;
 4152 
 4153     __ movw(rscratch3, t);
 4154     __ ornw(rscratch2, r2, r4);
 4155     __ addw(rscratch4, r1, rscratch3);
 4156     reg_cache.extract_u32(rscratch1, k);
 4157     __ eorw(rscratch3, rscratch2, r3);
 4158     __ addw(rscratch4, rscratch4, rscratch1);
 4159     __ addw(rscratch3, rscratch3, rscratch4);
 4160     __ rorw(rscratch2, rscratch3, 32 - s);
 4161     __ addw(r1, rscratch2, r2);
 4162   }
 4163 
  // Generates the MD5 compression stub, either for a single 64-byte block
  // (stubgen_md5_implCompress_id) or for a loop over several blocks
  // (stubgen_md5_implCompressMB_id).
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   MD5 state (four 32-bit words, read and written as
  //                       two 64-bit pairs)
  //   c_rarg2   - int     offset (only used by the multi-block variant)
  //   c_rarg3   - int     limit  (only used by the multi-block variant)
  //
  // Output (multi-block variant only):
  //   c_rarg0   - the updated offset
  //
  address generate_md5_implCompress(StubId stub_id) {
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_md5_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_md5_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // If load_archive_data() already supplies an entry for this stub, return
    // it and generate nothing.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Register buf       = c_rarg0;
    Register state     = c_rarg1;
    Register ofs       = c_rarg2;
    Register limit     = c_rarg3;
    // Working variables of the round function.
    Register a         = r4;
    Register b         = r5;
    Register c         = r6;
    Register d         = r7;
    // r10 and r11 are also the registers the md5_FF/GG/HH/II helpers clobber.
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    // state_regs hold the hash state packed two words per register:
    // state_regs[0] = { a (low), b (high) }, state_regs[1] = { c (low), d (high) }.
    Register state_regs[2] = { r12, r13 };
    // r14 and r15 plus saved_regs form the eight registers of the input
    // cache; only saved_regs is pushed and popped around the stub body.
    RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
    Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers

    __ push(saved_regs, sp);

    // Load the state and unpack it into a, b, c, d.
    __ ldp(state_regs[0], state_regs[1], Address(state));
    __ ubfx(a, state_regs[0],  0, 32);
    __ ubfx(b, state_regs[0], 32, 32);
    __ ubfx(c, state_regs[1],  0, 32);
    __ ubfx(d, state_regs[1], 32, 32);

    Label md5_loop;
    __ BIND(md5_loop);

    // Load the 64-byte block at buf into the register cache.
    reg_cache.gen_loads(buf);

    // Each call below is (cache, four rotating working registers,
    // message word index k, rotate amount s, additive constant t).

    // Round 1
    md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
    md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
    md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
    md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
    md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
    md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
    md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
    md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
    md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
    md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
    md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
    md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
    md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
    md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
    md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
    md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);

    // Round 2
    md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
    md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
    md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
    md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
    md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
    md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
    md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
    md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
    md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
    md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
    md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
    md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
    md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
    md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
    md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
    md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);

    // Round 3
    md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
    md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
    md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
    md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
    md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
    md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
    md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
    md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
    md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
    md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
    md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
    md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
    md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
    md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
    md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
    md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);

    // Round 4
    md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
    md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
    md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
    md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
    md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
    md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
    md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
    md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
    md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
    md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
    md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
    md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
    md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
    md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
    md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
    md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);

    // Add the state from before this block to the working variables. The
    // 32-bit adds use the low words of state_regs directly; the high words
    // are extracted first.
    __ addw(a, state_regs[0], a);
    __ ubfx(rscratch2, state_regs[0], 32, 32);
    __ addw(b, rscratch2, b);
    __ addw(c, state_regs[1], c);
    __ ubfx(rscratch4, state_regs[1], 32, 32);
    __ addw(d, rscratch4, d);

    // Repack the new state: low word | (high word << 32).
    __ orr(state_regs[0], a, b, Assembler::LSL, 32);
    __ orr(state_regs[1], c, d, Assembler::LSL, 32);

    if (multi_block) {
      // Advance to the next 64-byte block and continue while ofs <= limit.
      // The state stays in registers between blocks.
      __ add(buf, buf, 64);
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, md5_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // write hash values back in the correct order
    __ stp(state_regs[0], state_regs[1], Address(state));

    __ pop(saved_regs, sp);

    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 4325 
  // Generates the SHA-1 compression stub, either for a single 64-byte block
  // (stubgen_sha1_implCompress_id) or for a loop over several blocks
  // (stubgen_sha1_implCompressMB_id).
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset (only used by the multi-block variant)
  //   c_rarg3   - int     limit  (only used by the multi-block variant)
  //
  // Output (multi-block variant only):
  //   c_rarg0   - the updated offset
  //
  address generate_sha1_implCompress(StubId stub_id) {
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha1_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha1_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // If load_archive_data() already supplies an entry for this stub, return
    // it and generate nothing.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    // keys labels the four 32-bit constants emitted after the code.
    Label keys;
    Label sha1_loop;

    // load the keys into v0..v3 (ld4r: one constant per register,
    // replicated across its lanes)
    __ adr(rscratch1, keys);
    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load 5 words state into v6, v7 (four words in v6, the fifth in v7)
    __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));


    __ BIND(sha1_loop);
    // load 64 bytes of data into v16..v19; only the multi-block variant
    // advances buf
    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
    // byte-swap every 32-bit word of the input
    __ rev32(v16, __ T16B, v16);
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ rev32(v19, __ T16B, v19);

    // do the sha1
    // v4 = first data vector + first key, consumed by iteration 0 as tmp4;
    // v20 is the working copy of the first four state words.
    __ addv(v4, __ T4S, v16, v0);
    __ orr(v20, __ T16B, v6, v6);

    // d0..d3 rotate through v16..v19 at the end of every iteration.
    FloatRegister d0 = v16;
    FloatRegister d1 = v17;
    FloatRegister d2 = v18;
    FloatRegister d3 = v19;

    for (int round = 0; round < 20; round++) {
      // The temporaries alternate between two registers by iteration parity:
      //   tmp1 - receives data + key for the NEXT iteration (read there as tmp4)
      //   tmp2 - receives the sha1h result (read by the next iteration as tmp3)
      //   tmp3 - value from the previous iteration's sha1h; v7 (the fifth
      //          state word) on the first iteration
      //   tmp4 - data + key computed by the previous iteration
      FloatRegister tmp1 = (round & 1) ? v4 : v5;
      FloatRegister tmp2 = (round & 1) ? v21 : v22;
      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
      FloatRegister tmp4 = (round & 1) ? v5 : v4;
      // Key for the next iteration's sum, hence the boundaries 4/9/14
      // instead of 5/10/15.
      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));

      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
      if (round < 19) __ addv(tmp1, __ T4S, d1, key);
      __ sha1h(tmp2, __ T4S, v20);
      // sha1c for iterations 0..4, sha1p for 5..9 and 15..19, sha1m for 10..14
      if (round < 5)
        __ sha1c(v20, __ T4S, tmp3, tmp4);
      else if (round < 10 || round >= 15)
        __ sha1p(v20, __ T4S, tmp3, tmp4);
      else
        __ sha1m(v20, __ T4S, tmp3, tmp4);
      if (round < 16) __ sha1su1(d0, __ T4S, d3);

      // Rotate the data registers (generation-time only, no code emitted).
      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    // Add the working values to the state: v21 is the sha1h result of the
    // last iteration, v20 the other four words.
    __ addv(v7, __ T2S, v7, v21);
    __ addv(v6, __ T4S, v6, v20);

    if (multi_block) {
      // Continue with the next block while ofs <= limit; the state stays in
      // v6/v7 between blocks.
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // Write the state back.
    __ strq(v6, Address(state, 0));
    __ strs(v7, Address(state, 16));

    __ ret(lr);

    // The four constants loaded into v0..v3 at the top of the stub.
    __ bind(keys);
    __ emit_int32(0x5a827999);
    __ emit_int32(0x6ed9eba1);
    __ emit_int32(0x8f1bbcdc);
    __ emit_int32(0xca62c1d6);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 4437 
 4438 
  // Generates the SHA-256 compression stub, either for a single 64-byte block
  // (stubgen_sha256_implCompress_id) or for a loop over several blocks
  // (stubgen_sha256_implCompressMB_id).
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset (only used by the multi-block variant)
  //   c_rarg3   - int     limit  (only used by the multi-block variant)
  //
  // Output (multi-block variant only):
  //   c_rarg0   - the updated offset
  //
  address generate_sha256_implCompress(StubId stub_id) {
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha256_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha256_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // If load_archive_data() already supplies an entry for this stub, return
    // it and generate nothing.
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    // NOTE(review): despite its name this label is the SHA-256 block loop.
    Label sha1_loop;

    // v8..v11 receive the input data below; their low 64 bits (d registers)
    // are saved here and restored before returning.
    __ stpd(v8, v9, __ pre(sp, -32));
    __ stpd(v10, v11, Address(sp, 16));

// dga == v0
// dgb == v1
// dg0 == v2
// dg1 == v3
// dg2 == v4
// t0 == v6
// t1 == v7

    // load 16 keys to v16..v31 (16 vectors of four 32-bit constants each)
    __ lea(rscratch1, ExternalAddress((address)_sha256_round_consts));
    __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
    __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
    __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
    __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);

    // load 8 words (256 bits) state
    __ ldpq(v0, v1, state);

    __ BIND(sha1_loop);
    // load 64 bytes of data into v8..v11; only the multi-block variant
    // advances buf
    __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
    // byte-swap every 32-bit word of the input
    __ rev32(v8, __ T16B, v8);
    __ rev32(v9, __ T16B, v9);
    __ rev32(v10, __ T16B, v10);
    __ rev32(v11, __ T16B, v11);

    // v6 = first data vector + first key vector, consumed by iteration 0 as
    // tmp2; v2/v3 are the working copies of the state in v0/v1.
    __ addv(v6, __ T4S, v8, v16);
    __ orr(v2, __ T16B, v0, v0);
    __ orr(v3, __ T16B, v1, v1);

    // d0..d3 rotate through v8..v11 at the end of every iteration.
    FloatRegister d0 = v8;
    FloatRegister d1 = v9;
    FloatRegister d2 = v10;
    FloatRegister d3 = v11;


    for (int round = 0; round < 16; round++) {
      // tmp1 receives data + key for the NEXT iteration, which reads it as
      // tmp2; the two alternate between v6 and v7 by iteration parity.
      // tmp3 and tmp4 are not referenced by the code below.
      FloatRegister tmp1 = (round & 1) ? v6 : v7;
      FloatRegister tmp2 = (round & 1) ? v7 : v6;
      FloatRegister tmp3 = (round & 1) ? v2 : v4;
      FloatRegister tmp4 = (round & 1) ? v4 : v2;

      if (round < 12) __ sha256su0(d0, __ T4S, d1);
       // keep a copy of v2, which sha256h overwrites and sha256h2 still needs
       __ orr(v4, __ T16B, v2, v2);
      if (round < 15)
        // v(round + 17) is the key vector for the next iteration
        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
      __ sha256h(v2, __ T4S, v3, tmp2);
      __ sha256h2(v3, __ T4S, v4, tmp2);
      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);

      // Rotate the data registers (generation-time only, no code emitted).
      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    // Add the working values to the state.
    __ addv(v0, __ T4S, v0, v2);
    __ addv(v1, __ T4S, v1, v3);

    if (multi_block) {
      // Continue with the next block while ofs <= limit; the state stays in
      // v0/v1 between blocks.
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // Restore the saved halves of v8..v11 and release the stack space.
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 32));

    // Write the state back.
    __ stpq(v0, v1, state);

    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 4554 
 4555   // Double rounds for sha512.
        //
        // Emits the code for one SHA-512 double round (two consecutive rounds).
        // The caller in this file invokes it with dr = 0..39.
        //
        //   dr         - index of the double round; selects which optional steps
        //                are emitted (see the dr < 36 and dr < 32 tests below)
        //   vi0..vi4   - working-state registers for this double round. vi0, vi1
        //                and vi2 are only read; vi3 is updated in place and vi4
        //                is overwritten with the sum of vi1 and the intermediate
        //                vi3.
        //   vrc0       - round constants added to the schedule words in vin0
        //   vrc1       - receives the next 16 bytes of round constants, loaded
        //                from [rscratch2] with post-increment (only when dr < 36)
        //   vin0       - schedule words consumed by this double round; for
        //                dr < 32 it is then updated in place by
        //                sha512su0/sha512su1
        //   vin1..vin4 - further schedule registers, read only when dr < 32
        //
        // v5, v6 and v7 are hard-coded scratch registers and are clobbered.
        // rscratch2 must point at the next round-constant pair to be loaded.
 4556   void sha512_dround(int dr,
 4557                      FloatRegister vi0, FloatRegister vi1,
 4558                      FloatRegister vi2, FloatRegister vi3,
 4559                      FloatRegister vi4, FloatRegister vrc0,
 4560                      FloatRegister vrc1, FloatRegister vin0,
 4561                      FloatRegister vin1, FloatRegister vin2,
 4562                      FloatRegister vin3, FloatRegister vin4) {
            // Fetch round constants for a later double round into vrc1. No load
            // is emitted for the last four double rounds (dr >= 36).
 4563       if (dr < 36) {
 4564         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4565       }
            // v5 = vrc0 + vin0 (per 64-bit lane); its halves are swapped below
 4566       __ addv(v5, __ T2D, vrc0, vin0);
            // v6 = upper 8 bytes of vi2 followed by lower 8 bytes of vi3
 4567       __ ext(v6, __ T16B, vi2, vi3, 8);
            // swap the two 64-bit halves of v5
 4568       __ ext(v5, __ T16B, v5, v5, 8);
            // v7 = upper 8 bytes of vi1 followed by lower 8 bytes of vi2
 4569       __ ext(v7, __ T16B, vi1, vi2, 8);
 4570       __ addv(vi3, __ T2D, vi3, v5);
            // The message schedule is only extended during the first 32 double
            // rounds; v5 is free again here and is reused as an input to
            // sha512su1 below.
 4571       if (dr < 32) {
 4572         __ ext(v5, __ T16B, vin3, vin4, 8);
 4573         __ sha512su0(vin0, __ T2D, vin1);
 4574       }
 4575       __ sha512h(vi3, __ T2D, v6, v7);
 4576       if (dr < 32) {
 4577         __ sha512su1(vin0, __ T2D, vin2, v5);
 4578       }
 4579       __ addv(vi4, __ T2D, vi1, vi3);
 4580       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4581   }
 4582 
 4583   // Arguments:
 4584   //
 4585   // Inputs:
 4586   //   c_rarg0   - byte[]  source+offset
 4587   //   c_rarg1   - int[]   SHA.state
 4588   //   c_rarg2   - int     offset
 4589   //   c_rarg3   - int     limit
 4590   //
        // Generates the SHA-512 compression stub selected by stub_id (single
        // block or multi-block) and returns its entry address. If
        // load_archive_data() yields a non-null address, that address is
        // returned and no code is emitted.
        //
        // The generated code consumes 128 bytes of input per block. In the
        // multi-block variant it adds 128 to the offset after each block, loops
        // while offset <= limit, and returns the updated offset in c_rarg0.
        //
        // NOTE(review): the state is loaded and stored below as four vectors of
        // 2 x 64-bit lanes (64 bytes), so "int[]" above looks like it should be
        // "long[]" - confirm against the Java caller.
 4591   address generate_sha512_implCompress(StubId stub_id) {
 4592     bool multi_block;
 4593     switch (stub_id) {
 4594     case StubId::stubgen_sha512_implCompress_id:
 4595       multi_block = false;
 4596       break;
 4597     case StubId::stubgen_sha512_implCompressMB_id:
 4598       multi_block = true;
 4599       break;
 4600     default:
 4601       ShouldNotReachHere();
 4602     }
 4603     int entry_count = StubInfo::entry_count(stub_id);
 4604     assert(entry_count == 1, "sanity check");
 4605     address start = load_archive_data(stub_id);
 4606     if (start != nullptr) {
 4607       return start;
 4608     }
 4609     __ align(CodeEntryAlignment);
 4610     StubCodeMark mark(this, stub_id);
 4611     start = __ pc();
 4612 
 4613     Register buf   = c_rarg0;
 4614     Register state = c_rarg1;
 4615     Register ofs   = c_rarg2;
 4616     Register limit = c_rarg3;
 4617 
          // v8..v15 are used below; save them here (restored before the ret)
 4618     __ stpd(v8, v9, __ pre(sp, -64));
 4619     __ stpd(v10, v11, Address(sp, 16));
 4620     __ stpd(v12, v13, Address(sp, 32));
 4621     __ stpd(v14, v15, Address(sp, 48));
 4622 
 4623     Label sha512_loop;
 4624 
 4625     // load state
 4626     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4627 
 4628     // load first 4 round constants
          // rscratch1 is left pointing just past them (64 bytes into the table)
 4629     __ lea(rscratch1, ExternalAddress((address)_sha512_round_consts));
 4630     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4631 
 4632     __ BIND(sha512_loop);
 4633     // load 128B of data into v12..v19
 4634     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4635     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
          // reverse the bytes within each 64-bit element of the input
 4636     __ rev64(v12, __ T16B, v12);
 4637     __ rev64(v13, __ T16B, v13);
 4638     __ rev64(v14, __ T16B, v14);
 4639     __ rev64(v15, __ T16B, v15);
 4640     __ rev64(v16, __ T16B, v16);
 4641     __ rev64(v17, __ T16B, v17);
 4642     __ rev64(v18, __ T16B, v18);
 4643     __ rev64(v19, __ T16B, v19);
 4644 
          // rscratch2 is the running round-constant pointer that sha512_dround
          // post-increments; restart it for every block from the position just
          // past the four preloaded constants.
 4645     __ mov(rscratch2, rscratch1);
 4646 
          // working copy of the state; v8..v11 keep the values from block entry
 4647     __ mov(v0, __ T16B, v8);
 4648     __ mov(v1, __ T16B, v9);
 4649     __ mov(v2, __ T16B, v10);
 4650     __ mov(v3, __ T16B, v11);
 4651 
          // 40 double rounds. Arguments are (dr, vi0..vi4, vrc0, vrc1,
          // vin0..vin4). From dr == 32 on the vin1..vin4 arguments are unused by
          // sha512_dround and v0 is passed as a placeholder; likewise vrc1 from
          // dr == 36 on.
 4652     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4653     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4654     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4655     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4656     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4657     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4658     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4659     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4660     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4661     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4662     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4663     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4664     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4665     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4666     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4667     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4668     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4669     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4670     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4671     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4672     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4673     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4674     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4675     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4676     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4677     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4678     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4679     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4680     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4681     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4682     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4683     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4684     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4685     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4686     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4687     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4688     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4689     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4690     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4691     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4692 
          // add the working values back into the state held in v8..v11
 4693     __ addv(v8, __ T2D, v8, v0);
 4694     __ addv(v9, __ T2D, v9, v1);
 4695     __ addv(v10, __ T2D, v10, v2);
 4696     __ addv(v11, __ T2D, v11, v3);
 4697 
 4698     if (multi_block) {
            // advance by one 128-byte block and loop while ofs <= limit
 4699       __ add(ofs, ofs, 128);
 4700       __ cmp(ofs, limit);
 4701       __ br(Assembler::LE, sha512_loop);
 4702       __ mov(c_rarg0, ofs); // return ofs
 4703     }
 4704 
          // write the updated state back
 4705     __ st1(v8, v9, v10, v11, __ T2D, state);
 4706 
          // restore the registers saved on entry and pop the 64-byte save area
 4707     __ ldpd(v14, v15, Address(sp, 48));
 4708     __ ldpd(v12, v13, Address(sp, 32));
 4709     __ ldpd(v10, v11, Address(sp, 16));
 4710     __ ldpd(v8, v9, __ post(sp, 64));
 4711 
 4712     __ ret(lr);
 4713 
 4714     // record the stub entry and end
 4715     store_archive_data(stub_id, start, __ pc());
 4716 
 4717     return start;
 4718   }
 4719 
 4720   // Execute one round of keccak of two computations in parallel.
 4721   // One of the states should be loaded into the lower halves of
 4722   // the vector registers v0-v24, the other should be loaded into
 4723   // the upper halves of those registers. The ld1r instruction loads
 4724   // the round constant into both halves of register v31.
 4725   // Intermediate results c0...c4 and d0...d4 are computed
 4726   // in registers v25...v30.
 4727   // All vector instructions that are used operate on both register
 4728   // halves in parallel.
 4729   // If only a single computation is needed, it suffices to load the lower halves.
        //
        // The Register parameter (named rscratch1, like the scratch register used
        // elsewhere in this file) must point at the round constant for this
        // round; it is post-incremented by 8 bytes.
        // v25...v31 are clobbered. In the comments below aN is lane N of the
        // state and aN' is a new value of lane N that is still held in a
        // temporary register rather than in vN.
 4730   void keccak_round(Register rscratch1) {
 4731   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
 4732   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
 4733   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4734   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4735   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4736   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4737   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4738   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4739   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4740   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4741 
        // each d value overwrites a c register only after that c has been
        // consumed as the first operand (d0 goes to the free register v30)
 4742   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4743   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4744   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4745   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4746   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4747 
        // xar rotates right, so a left rotation by k is encoded as (64 - k)
 4748   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4749   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
 4750   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4751   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4752   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4753   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4754   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4755   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4756   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4757   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4758   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4759   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4760   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4761   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4762   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4763   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4764   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4765   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4766   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4767   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4768   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4769   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4770   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4771   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4772   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4773 
 4774   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21' & a22)
 4775   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4776   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4777   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4778   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4779 
        // v31 (a20') is no longer needed from here on, so it can take the
        // round constant
 4780   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4781 
 4782   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4783   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15)
 4784   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4785   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4786   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4787 
 4788   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4789   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4790   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4791   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4792   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4793 
 4794   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4795   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4796   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4797   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7')
 4798   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7' & a8')
 4799 
 4800   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4801   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4802   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4803   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3')
 4804   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3' & a4')
 4805 
 4806   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4807   }
 4808 
 4809   // Arguments:
 4810   //
 4811   // Inputs:
 4812   //   c_rarg0   - byte[]  source+offset
 4813   //   c_rarg1   - byte[]  SHA.state
 4814   //   c_rarg2   - int     block_size
 4815   //   c_rarg3   - int     offset
 4816   //   c_rarg4   - int     limit
 4817   //
        // Generates the SHA3 compression stub selected by stub_id (single block
        // or multi-block) and returns its entry address. If load_archive_data()
        // yields a non-null address, that address is returned and no code is
        // emitted.
        //
        // Per block, the generated code XORs block_size bytes of input into the
        // leading 8-byte lanes of the 25-lane state and then runs 24 keccak
        // rounds. The number of lanes absorbed is decided by testing bits of
        // block_size:
        //    72 ->  9 lanes,  104 -> 13 lanes,  136 -> 17 lanes,
        //   144 -> 18 lanes,  168 -> 21 lanes
        // In the multi-block variant it adds block_size to the offset after
        // each block, loops while offset <= limit, and returns the updated
        // offset in c_rarg0.
        // c_rarg5 is used as a scratch register.
 4818   address generate_sha3_implCompress(StubId stub_id) {
 4819     bool multi_block;
 4820     switch (stub_id) {
 4821     case StubId::stubgen_sha3_implCompress_id:
 4822       multi_block = false;
 4823       break;
 4824     case StubId::stubgen_sha3_implCompressMB_id:
 4825       multi_block = true;
 4826       break;
 4827     default:
 4828       ShouldNotReachHere();
 4829     }
 4830     int entry_count = StubInfo::entry_count(stub_id);
 4831     assert(entry_count == 1, "sanity check");
 4832     address start = load_archive_data(stub_id);
 4833     if (start != nullptr) {
 4834       return start;
 4835     }
 4836     __ align(CodeEntryAlignment);
 4837     StubCodeMark mark(this, stub_id);
 4838     start = __ pc();
 4839 
 4840     Register buf           = c_rarg0;
 4841     Register state         = c_rarg1;
 4842     Register block_size    = c_rarg2;
 4843     Register ofs           = c_rarg3;
 4844     Register limit         = c_rarg4;
 4845 
 4846     Label sha3_loop, rounds24_loop;
 4847     Label sha3_512_or_sha3_384, shake128;
 4848 
          // save callee-saved registers (restored before the ret)
 4849     __ stpd(v8, v9, __ pre(sp, -64));
 4850     __ stpd(v10, v11, Address(sp, 16));
 4851     __ stpd(v12, v13, Address(sp, 32));
 4852     __ stpd(v14, v15, Address(sp, 48));
 4853 
 4854     // load state
          // 25 lanes of 8 bytes each go into v0..v24; rscratch1 walks the array
          // so that the state register itself stays unmodified for the stores
          // at the end
 4855     __ add(rscratch1, state, 32);
 4856     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4857     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4858     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4859     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4860     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4861     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4862     __ ld1(v24, __ T1D, rscratch1);
 4863 
 4864     __ BIND(sha3_loop);
 4865 
 4866     // 24 keccak rounds
 4867     __ movw(rscratch2, 24);
 4868 
 4869     // load round_constants base
          // (re-loaded for every block because keccak_round post-increments it)
 4870     __ lea(rscratch1, ExternalAddress((address) _sha3_round_consts));
 4871 
 4872     // load input
          // the first 7 lanes (56 bytes) are absorbed for every block size;
          // v25..v31 serve as temporaries for the input data
 4873     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4874     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4875     __ eor(v0, __ T8B, v0, v25);
 4876     __ eor(v1, __ T8B, v1, v26);
 4877     __ eor(v2, __ T8B, v2, v27);
 4878     __ eor(v3, __ T8B, v3, v28);
 4879     __ eor(v4, __ T8B, v4, v29);
 4880     __ eor(v5, __ T8B, v5, v30);
 4881     __ eor(v6, __ T8B, v6, v31);
 4882 
 4883     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
          // (bit 7 is clear only for these two sizes)
 4884     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4885 
          // block_size >= 136: absorb lanes 7..16 (17 lanes, 136 bytes so far)
 4886     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4887     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4888     __ eor(v7, __ T8B, v7, v25);
 4889     __ eor(v8, __ T8B, v8, v26);
 4890     __ eor(v9, __ T8B, v9, v27);
 4891     __ eor(v10, __ T8B, v10, v28);
 4892     __ eor(v11, __ T8B, v11, v29);
 4893     __ eor(v12, __ T8B, v12, v30);
 4894     __ eor(v13, __ T8B, v13, v31);
 4895 
 4896     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4897     __ eor(v14, __ T8B, v14, v25);
 4898     __ eor(v15, __ T8B, v15, v26);
 4899     __ eor(v16, __ T8B, v16, v27);
 4900 
 4901     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
          // (48 == bit4 | bit5)
 4902     __ andw(c_rarg5, block_size, 48);
 4903     __ cbzw(c_rarg5, rounds24_loop);
 4904 
 4905     __ tbnz(block_size, 5, shake128);
 4906     // block_size == 144, bit5 == 0, SHA3-224
          // one more lane (lane 17)
 4907     __ ldrd(v28, __ post(buf, 8));
 4908     __ eor(v17, __ T8B, v17, v28);
 4909     __ b(rounds24_loop);
 4910 
 4911     __ BIND(shake128);
          // four more lanes (lanes 17..20)
 4912     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4913     __ eor(v17, __ T8B, v17, v28);
 4914     __ eor(v18, __ T8B, v18, v29);
 4915     __ eor(v19, __ T8B, v19, v30);
 4916     __ eor(v20, __ T8B, v20, v31);
 4917     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4918 
 4919     __ BIND(sha3_512_or_sha3_384);
          // lanes 7 and 8 (9 lanes, 72 bytes so far)
 4920     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4921     __ eor(v7, __ T8B, v7, v25);
 4922     __ eor(v8, __ T8B, v8, v26);
 4923     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4924 
 4925     // SHA3-384
          // lanes 9..12 (13 lanes, 104 bytes)
 4926     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4927     __ eor(v9,  __ T8B, v9,  v27);
 4928     __ eor(v10, __ T8B, v10, v28);
 4929     __ eor(v11, __ T8B, v11, v29);
 4930     __ eor(v12, __ T8B, v12, v30);
 4931 
          // rscratch2 counts the remaining rounds down from 24 to 0
 4932     __ BIND(rounds24_loop);
 4933     __ subw(rscratch2, rscratch2, 1);
 4934 
 4935     keccak_round(rscratch1);
 4936 
 4937     __ cbnzw(rscratch2, rounds24_loop);
 4938 
 4939     if (multi_block) {
            // advance by one block and loop while ofs <= limit
 4940       __ add(ofs, ofs, block_size);
 4941       __ cmp(ofs, limit);
 4942       __ br(Assembler::LE, sha3_loop);
 4943       __ mov(c_rarg0, ofs); // return ofs
 4944     }
 4945 
          // write the 25 lanes back; the state register is advanced by the
          // post-increments and does not point at the start afterwards
 4946     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4947     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4948     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4949     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4950     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4951     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4952     __ st1(v24, __ T1D, state);
 4953 
 4954     // restore callee-saved registers
 4955     __ ldpd(v14, v15, Address(sp, 48));
 4956     __ ldpd(v12, v13, Address(sp, 32));
 4957     __ ldpd(v10, v11, Address(sp, 16));
 4958     __ ldpd(v8, v9, __ post(sp, 64));
 4959 
 4960     __ ret(lr);
 4961 
 4962     // record the stub entry and end
 4963     store_archive_data(stub_id, start, __ pc());
 4964 
 4965     return start;
 4966   }
 4967 
 4968   // Inputs:
 4969   //   c_rarg0   - long[]  state0
 4970   //   c_rarg1   - long[]  state1
        //
        // Generates the double_keccak stub and returns its entry address. If
        // load_archive_data() yields a non-null address, that address is
        // returned and no code is emitted.
        //
        // The generated code loads the 25 lanes of state0 into lane 0 (lower
        // halves) and the 25 lanes of state1 into lane 1 (upper halves) of
        // v0..v24, runs 24 keccak rounds on both at once, writes both states
        // back in place and returns 0 in r0.
 4971   address generate_double_keccak() {
 4972     StubId stub_id = StubId::stubgen_double_keccak_id;
 4973     int entry_count = StubInfo::entry_count(stub_id);
 4974     assert(entry_count == 1, "sanity check");
 4975     address start = load_archive_data(stub_id);
 4976     if (start != nullptr) {
 4977       return start;
 4978     }
 4979     // Implements the double_keccak() method of the
 4980     // sun.security.provider.SHA3Parallel class
 4981     __ align(CodeEntryAlignment);
 4982     StubCodeMark mark(this, stub_id);
 4983     start = __ pc();
 4984     __ enter();
 4985 
 4986     Register state0        = c_rarg0;
 4987     Register state1        = c_rarg1;
 4988 
 4989     Label rounds24_loop;
 4990 
 4991     // save callee-saved registers
 4992     __ stpd(v8, v9, __ pre(sp, -64));
 4993     __ stpd(v10, v11, Address(sp, 16));
 4994     __ stpd(v12, v13, Address(sp, 32));
 4995     __ stpd(v14, v15, Address(sp, 48));
 4996 
 4997     // load states
          // rscratch1 walks each array so that state0/state1 stay unmodified for
          // the stores below; the lane index argument (0 or 1) selects which
          // 64-bit half of each vector register is written
 4998     __ add(rscratch1, state0, 32);
 4999     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 5000     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 5001     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 5002     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 5003     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 5004     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 5005     __ ld1(v24, __ D, 0, rscratch1);
 5006     __ add(rscratch1, state1, 32);
 5007     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 5008     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 5009     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 5010     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 5011     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 5012     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 5013     __ ld1(v24, __ D, 1, rscratch1);
 5014 
 5015     // 24 keccak rounds
 5016     __ movw(rscratch2, 24);
 5017 
 5018     // load round_constants base
 5019     __ lea(rscratch1, ExternalAddress((address) _double_keccak_round_consts));
 5020 
          // rscratch2 counts the remaining rounds down to 0; keccak_round
          // advances rscratch1 to the next round constant
 5021     __ BIND(rounds24_loop);
 5022     __ subw(rscratch2, rscratch2, 1);
 5023     keccak_round(rscratch1);
 5024     __ cbnzw(rscratch2, rounds24_loop);
 5025 
          // write lane 0 of v0..v24 back to state0 and lane 1 back to state1;
          // both pointer registers are advanced by the post-increments
 5026     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 5027     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 5028     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 5029     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 5030     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 5031     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 5032     __ st1(v24, __ D, 0, state0);
 5033     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 5034     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 5035     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 5036     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 5037     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 5038     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 5039     __ st1(v24, __ D, 1, state1);
 5040 
 5041     // restore callee-saved vector registers
 5042     __ ldpd(v14, v15, Address(sp, 48));
 5043     __ ldpd(v12, v13, Address(sp, 32));
 5044     __ ldpd(v10, v11, Address(sp, 16));
 5045     __ ldpd(v8, v9, __ post(sp, 64));
 5046 
 5047     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5048     __ mov(r0, zr); // return 0
 5049     __ ret(lr);
 5050 
 5051     // record the stub entry and end
 5052     store_archive_data(stub_id, start, __ pc());
 5053 
 5054     return start;
 5055   }
 5056 
 5057   // ChaCha20 block function.  This version parallelizes the 32-bit
 5058   // state elements on each of 16 vectors, producing 4 blocks of
 5059   // keystream at a time.
 5060   //
 5061   // state (int[16]) = c_rarg0
 5062   // keystream (byte[256]) = c_rarg1
 5063   // return - number of bytes of produced keystream (always 256)
 5064   //
 5065   // This implementation takes each 32-bit integer from the state
 5066   // array and broadcasts it across all 4 32-bit lanes of a vector register
 5067   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 5068   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 5069   // the quarter round schedule is implemented as outlined in RFC 7539 section
 5070   // 2.3.  However, instead of sequentially processing the 3 quarter round
 5071   // operations represented by one QUARTERROUND function, we instead stack all
 5072   // the adds, xors and left-rotations from the first 4 quarter rounds together
 5073   // and then do the same for the second set of 4 quarter rounds.  This removes
 5074   // some latency that would otherwise be incurred by waiting for an add to
 5075   // complete before performing an xor (which depends on the result of the
 5076   // add), etc. An adjustment happens between the first and second groups of 4
 5077   // quarter rounds, but this is done only in the inputs to the macro functions
 5078   // that generate the assembly instructions - these adjustments themselves are
 5079   // not part of the resulting assembly.
 5080   // The 4 registers v0-v3 are used during the quarter round operations as
 5081   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 5082   // registers become the vectors involved in adding the start state back onto
 5083   // the post-QR working state.  After the adds are complete, each of the 16
 5084   // vectors writes its first lane back to the keystream buffer, followed
 5085   // by the second lane from all vectors and so on.
 5086   address generate_chacha20Block_blockpar() {
 5087     StubId stub_id = StubId::stubgen_chacha20Block_id;
 5088     int entry_count = StubInfo::entry_count(stub_id);
 5089     assert(entry_count == 1, "sanity check");
 5090     address start = load_archive_data(stub_id);
 5091     if (start != nullptr) {
 5092       return start;
 5093     }
 5094     Label L_twoRounds, L_cc20_const;
 5095     __ align(CodeEntryAlignment);
 5096     StubCodeMark mark(this, stub_id);
 5097     start = __ pc();
 5098     __ enter();
 5099 
 5100     int i, j;
 5101     const Register state = c_rarg0;
 5102     const Register keystream = c_rarg1;
 5103     const Register loopCtr = r10;
 5104     const Register tmpAddr = r11;
 5105     const FloatRegister ctrAddOverlay = v28;
 5106     const FloatRegister lrot8Tbl = v29;
 5107 
 5108     // Organize SIMD registers in an array that facilitates
 5109     // putting repetitive opcodes into loop structures.  It is
 5110     // important that each grouping of 4 registers is monotonically
 5111     // increasing to support the requirements of multi-register
 5112     // instructions (e.g. ld4r, st4, etc.)
 5113     const FloatRegister workSt[16] = {
 5114          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 5115         v20, v21, v22, v23, v24, v25, v26, v27
 5116     };
 5117 
 5118     // Pull in constant data.  The first 16 bytes are the add overlay
 5119     // which is applied to the vector holding the counter (state[12]).
 5120     // The second 16 bytes is the index register for the 8-bit left
 5121     // rotation tbl instruction.
 5122     __ adr(tmpAddr, L_cc20_const);
 5123     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 5124 
 5125     // Load from memory and interlace across 16 SIMD registers,
 5126     // With each word from memory being broadcast to all lanes of
 5127     // each successive SIMD register.
 5128     //      Addr(0) -> All lanes in workSt[i]
 5129     //      Addr(4) -> All lanes workSt[i + 1], etc.
 5130     __ mov(tmpAddr, state);
 5131     for (i = 0; i < 16; i += 4) {
 5132       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 5133           __ post(tmpAddr, 16));
 5134     }
 5135     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 5136 
 5137     // Before entering the loop, create 5 4-register arrays.  These
 5138     // will hold the 4 registers that represent the a/b/c/d fields
 5139     // in the quarter round operation.  For instance the "b" field
 5140     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 5141     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 5142     // since it is part of a diagonal organization.  The aSet and scratch
 5143     // register sets are defined at declaration time because they do not change
 5144     // organization at any point during the 20-round processing.
 5145     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 5146     FloatRegister bSet[4];
 5147     FloatRegister cSet[4];
 5148     FloatRegister dSet[4];
 5149     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 5150 
 5151     // Set up the 10 iteration loop and perform all 8 quarter round ops
 5152     __ mov(loopCtr, 10);
 5153     __ BIND(L_twoRounds);
 5154 
 5155     // Set to columnar organization and do the following 4 quarter-rounds:
 5156     // QUARTERROUND(0, 4, 8, 12)
 5157     // QUARTERROUND(1, 5, 9, 13)
 5158     // QUARTERROUND(2, 6, 10, 14)
 5159     // QUARTERROUND(3, 7, 11, 15)
 5160     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 5161     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 5162     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 5163 
 5164     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5165     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5166     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 5167 
 5168     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5169     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5170     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 5171 
 5172     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5173     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5174     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 5175 
 5176     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5177     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5178     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 12
 5179 
 5180     // Set to diagonal organization and do the next 4 quarter-rounds:
 5181     // QUARTERROUND(0, 5, 10, 15)
 5182     // QUARTERROUND(1, 6, 11, 12)
 5183     // QUARTERROUND(2, 7, 8, 13)
 5184     // QUARTERROUND(3, 4, 9, 14)
 5185     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 5186     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 5187     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 5188 
 5189     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5190     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5191     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 5192 
 5193     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5194     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5195     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 5196 
 5197     __ cc20_qr_add4(aSet, bSet);                    // a += b
 5198     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 5199     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 5200 
 5201     __ cc20_qr_add4(cSet, dSet);                    // c += d
 5202     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 5203     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 12
 5204 
 5205     // Decrement and iterate
 5206     __ sub(loopCtr, loopCtr, 1);
 5207     __ cbnz(loopCtr, L_twoRounds);
 5208 
 5209     __ mov(tmpAddr, state);
 5210 
 5211     // Add the starting state back to the post-loop keystream
 5212     // state.  We read/interlace the state array from memory into
 5213     // 4 registers similar to what we did in the beginning.  Then
 5214     // add the counter overlay onto workSt[12] at the end.
 5215     for (i = 0; i < 16; i += 4) {
 5216       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 5217       __ addv(workSt[i], __ T4S, workSt[i], v0);
 5218       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 5219       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 5220       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 5221     }
 5222     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 5223 
 5224     // Write working state into the keystream buffer.  This is accomplished
 5225     // by taking the lane "i" from each of the four vectors and writing
 5226     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 5227     // repeating with the next 4 vectors until all 16 vectors have been used.
 5228     // Then move to the next lane and repeat the process until all lanes have
 5229     // been written.
 5230     for (i = 0; i < 4; i++) {
 5231       for (j = 0; j < 16; j += 4) {
 5232         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 5233             __ post(keystream, 16));
 5234       }
 5235     }
 5236 
 5237     __ mov(r0, 256);             // Return length of output keystream
 5238     __ leave();
 5239     __ ret(lr);
 5240 
 5241     // bind label and generate local constant data used by this stub
 5242     // The constant data is broken into two 128-bit segments to be loaded
 5243     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 5244     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 5245     // The second 128-bits is a table constant used for 8-bit left rotations.
 5246     __ BIND(L_cc20_const);
 5247     __ emit_int64(0x0000000100000000UL);
 5248     __ emit_int64(0x0000000300000002UL);
 5249     __ emit_int64(0x0605040702010003UL);
 5250     __ emit_int64(0x0E0D0C0F0A09080BUL);
 5251 
 5252     // record the stub entry and end
 5253     store_archive_data(stub_id, start, __ pc());
 5254 
 5255     return start;
 5256   }
 5257 
 5258   // Helpers to schedule parallel operation bundles across vector
 5259   // register sequences of size 2, 4 or 8.
 5260 
 5261   // Implement various primitive computations across vector sequences
 5262 
 5263   template<int N>
 5264   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5265                const VSeq<N>& v1, const VSeq<N>& v2) {
 5266     // output must not be constant
 5267     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5268     // output cannot overwrite pending inputs
 5269     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5270     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5271     for (int i = 0; i < N; i++) {
 5272       __ addv(v[i], T, v1[i], v2[i]);
 5273     }
 5274   }
 5275 
 5276   template<int N>
 5277   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5278                const VSeq<N>& v1, const VSeq<N>& v2) {
 5279     // output must not be constant
 5280     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5281     // output cannot overwrite pending inputs
 5282     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5283     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5284     for (int i = 0; i < N; i++) {
 5285       __ subv(v[i], T, v1[i], v2[i]);
 5286     }
 5287   }
 5288 
 5289   template<int N>
 5290   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5291                const VSeq<N>& v1, const VSeq<N>& v2) {
 5292     // output must not be constant
 5293     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5294     // output cannot overwrite pending inputs
 5295     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5296     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5297     for (int i = 0; i < N; i++) {
 5298       __ mulv(v[i], T, v1[i], v2[i]);
 5299     }
 5300   }
 5301 
 5302   template<int N>
 5303   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 5304     // output must not be constant
 5305     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5306     // output cannot overwrite pending inputs
 5307     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5308     for (int i = 0; i < N; i++) {
 5309       __ negr(v[i], T, v1[i]);
 5310     }
 5311   }
 5312 
 5313   template<int N>
 5314   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 5315                const VSeq<N>& v1, int shift) {
 5316     // output must not be constant
 5317     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5318     // output cannot overwrite pending inputs
 5319     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5320     for (int i = 0; i < N; i++) {
 5321       __ sshr(v[i], T, v1[i], shift);
 5322     }
 5323   }
 5324 
 5325   template<int N>
 5326   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 5327     // output must not be constant
 5328     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5329     // output cannot overwrite pending inputs
 5330     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5331     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5332     for (int i = 0; i < N; i++) {
 5333       __ andr(v[i], __ T16B, v1[i], v2[i]);
 5334     }
 5335   }
 5336 
 5337   template<int N>
 5338   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 5339     // output must not be constant
 5340     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5341     // output cannot overwrite pending inputs
 5342     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5343     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5344     for (int i = 0; i < N; i++) {
 5345       __ orr(v[i], __ T16B, v1[i], v2[i]);
 5346     }
 5347   }
 5348 
 5349   template<int N>
 5350   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 5351     // output must not be constant
 5352     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5353     // output cannot overwrite pending inputs
 5354     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5355     for (int i = 0; i < N; i++) {
 5356       __ notr(v[i], __ T16B, v1[i]);
 5357     }
 5358   }
 5359 
 5360   template<int N>
 5361   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 5362     // output must not be constant
 5363     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5364     // output cannot overwrite pending inputs
 5365     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5366     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5367     for (int i = 0; i < N; i++) {
 5368       __ sqdmulh(v[i], T, v1[i], v2[i]);
 5369     }
 5370   }
 5371 
 5372   template<int N>
 5373   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 5374     // output must not be constant
 5375     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 5376     // output cannot overwrite pending inputs
 5377     assert(!vs_write_before_read(v, v1), "output overwrites input");
 5378     assert(!vs_write_before_read(v, v2), "output overwrites input");
 5379     for (int i = 0; i < N; i++) {
 5380       __ mlsv(v[i], T, v1[i], v2[i]);
 5381     }
 5382   }
 5383 
 5384   // load N/2 successive pairs of quadword values from memory in order
 5385   // into N successive vector registers of the sequence via the
 5386   // address supplied in base.
 5387   template<int N>
 5388   void vs_ldpq(const VSeq<N>& v, Register base) {
 5389     for (int i = 0; i < N; i += 2) {
 5390       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 5391     }
 5392   }
 5393 
 5394   // load N/2 successive pairs of quadword values from memory in order
 5395   // into N vector registers of the sequence via the address supplied
 5396   // in base using post-increment addressing
 5397   template<int N>
 5398   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 5399     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 5400     for (int i = 0; i < N; i += 2) {
 5401       __ ldpq(v[i], v[i+1], __ post(base, 32));
 5402     }
 5403   }
 5404 
 5405   // store N successive vector registers of the sequence into N/2
 5406   // successive pairs of quadword memory locations via the address
 5407   // supplied in base using post-increment addressing
 5408   template<int N>
 5409   void vs_stpq_post(const VSeq<N>& v, Register base) {
 5410     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 5411     for (int i = 0; i < N; i += 2) {
 5412       __ stpq(v[i], v[i+1], __ post(base, 32));
 5413     }
 5414   }
 5415 
 5416   // load N/2 pairs of quadword values from memory de-interleaved into
 5417   // N vector registers 2 at a time via the address supplied in base
 5418   // using post-increment addressing.
 5419   template<int N>
 5420   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5421     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 5422     for (int i = 0; i < N; i += 2) {
 5423       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 5424     }
 5425   }
 5426 
 5427   // store N vector registers interleaved into N/2 pairs of quadword
 5428   // memory locations via the address supplied in base using
 5429   // post-increment addressing.
 5430   template<int N>
 5431   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5432     static_assert((N & (N - 1)) == 0, "sequence length must be even");
 5433     for (int i = 0; i < N; i += 2) {
 5434       __ st2(v[i], v[i+1], T, __ post(base, 32));
 5435     }
 5436   }
 5437 
 5438   // load N quadword values from memory de-interleaved into N vector
 5439   // registers 3 elements at a time via the address supplied in base.
 5440   template<int N>
 5441   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5442     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 5443     for (int i = 0; i < N; i += 3) {
 5444       __ ld3(v[i], v[i+1], v[i+2], T, base);
 5445     }
 5446   }
 5447 
 5448   // load N quadword values from memory de-interleaved into N vector
 5449   // registers 3 elements at a time via the address supplied in base
 5450   // using post-increment addressing.
 5451   template<int N>
 5452   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 5453     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 5454     for (int i = 0; i < N; i += 3) {
 5455       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 5456     }
 5457   }
 5458 
 5459   // load N/2 pairs of quadword values from memory into N vector
 5460   // registers via the address supplied in base with each pair indexed
 5461   // using the the start offset plus the corresponding entry in the
 5462   // offsets array
 5463   template<int N>
 5464   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 5465     for (int i = 0; i < N/2; i++) {
 5466       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5467     }
 5468   }
 5469 
 5470   // store N vector registers into N/2 pairs of quadword memory
 5471   // locations via the address supplied in base with each pair indexed
 5472   // using the the start offset plus the corresponding entry in the
 5473   // offsets array
 5474   template<int N>
 5475   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 5476     for (int i = 0; i < N/2; i++) {
 5477       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5478     }
 5479   }
 5480 
 5481   // load N single quadword values from memory into N vector registers
 5482   // via the address supplied in base with each value indexed using
 5483   // the the start offset plus the corresponding entry in the offsets
 5484   // array
 5485   template<int N>
 5486   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5487                       int start, int (&offsets)[N]) {
 5488     for (int i = 0; i < N; i++) {
 5489       __ ldr(v[i], T, Address(base, start + offsets[i]));
 5490     }
 5491   }
 5492 
 5493   // store N vector registers into N single quadword memory locations
 5494   // via the address supplied in base with each value indexed using
 5495   // the the start offset plus the corresponding entry in the offsets
 5496   // array
 5497   template<int N>
 5498   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5499                       int start, int (&offsets)[N]) {
 5500     for (int i = 0; i < N; i++) {
 5501       __ str(v[i], T, Address(base, start + offsets[i]));
 5502     }
 5503   }
 5504 
 5505   // load N/2 pairs of quadword values from memory de-interleaved into
 5506   // N vector registers 2 at a time via the address supplied in base
 5507   // with each pair indexed using the the start offset plus the
 5508   // corresponding entry in the offsets array
 5509   template<int N>
 5510   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5511                       Register tmp, int start, int (&offsets)[N/2]) {
 5512     for (int i = 0; i < N/2; i++) {
 5513       __ add(tmp, base, start + offsets[i]);
 5514       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5515     }
 5516   }
 5517 
 5518   // store N vector registers 2 at a time interleaved into N/2 pairs
 5519   // of quadword memory locations via the address supplied in base
 5520   // with each pair indexed using the the start offset plus the
 5521   // corresponding entry in the offsets array
 5522   template<int N>
 5523   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5524                       Register tmp, int start, int (&offsets)[N/2]) {
 5525     for (int i = 0; i < N/2; i++) {
 5526       __ add(tmp, base, start + offsets[i]);
 5527       __ st2(v[2*i], v[2*i+1], T, tmp);
 5528     }
 5529   }
 5530 
 5531   // Helper routines for various flavours of Montgomery multiply
 5532 
 5533   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 5534   // multiplications in parallel
 5535   //
 5536 
 5537   // See the montMul() method of the sun.security.provider.ML_DSA
 5538   // class.
 5539   //
 5540   // Computes 4x4S results or 8x8H results
 5541   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5542   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5543   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5544   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5545   // Outputs: va - 4x4S or 4x8H vector register sequences
 5546   // vb, vc, vtmp and vq must all be disjoint
 5547   // va must be disjoint from all other inputs/temps or must equal vc
 5548   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5549   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5550   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5551                    Assembler::SIMD_Arrangement T,
 5552                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5553     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5554     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5555     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5556     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5557 
 5558     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5559     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5560 
 5561     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5562 
 5563     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5564     assert(vs_disjoint(va, vb), "va and vb overlap");
 5565     assert(vs_disjoint(va, vq), "va and vq overlap");
 5566     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5567     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5568 
 5569     // schedule 4 streams of instructions across the vector sequences
 5570     for (int i = 0; i < 4; i++) {
 5571       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5572       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5573     }
 5574 
 5575     for (int i = 0; i < 4; i++) {
 5576       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5577     }
 5578 
 5579     for (int i = 0; i < 4; i++) {
 5580       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5581     }
 5582 
 5583     for (int i = 0; i < 4; i++) {
 5584       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5585     }
 5586   }
 5587 
 5588   // Perform 8 32-bit (4x4S) or 16 16-bit (2 x 8H) Montgomery
 5589   // multiplications in parallel
 5590   //
 5591 
 5592   // See the montMul() method of the sun.security.provider.ML_DSA
 5593   // class.
 5594   //
 5595   // Computes 4x4S results or 8x8H results
 5596   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5597   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5598   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5599   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5600   // Outputs: va - 4x4S or 4x8H vector register sequences
 5601   // vb, vc, vtmp and vq must all be disjoint
 5602   // va must be disjoint from all other inputs/temps or must equal vc
 5603   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5604   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5605   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5606                    Assembler::SIMD_Arrangement T,
 5607                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5608     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5609     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5610     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5611     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5612 
 5613     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5614     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5615 
 5616     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5617 
 5618     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5619     assert(vs_disjoint(va, vb), "va and vb overlap");
 5620     assert(vs_disjoint(va, vq), "va and vq overlap");
 5621     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5622     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5623 
 5624     // schedule 2 streams of instructions across the vector sequences
 5625     for (int i = 0; i < 2; i++) {
 5626       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5627       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5628     }
 5629 
 5630     for (int i = 0; i < 2; i++) {
 5631       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5632     }
 5633 
 5634     for (int i = 0; i < 2; i++) {
 5635       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5636     }
 5637 
 5638     for (int i = 0; i < 2; i++) {
 5639       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5640     }
 5641   }
 5642 
 5643   // Perform 16 16-bit Montgomery multiplications in parallel.
 5644   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5645                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5646     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5647     // It will assert that the register use is valid
 5648     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5649   }
 5650 
 5651   // Perform 32 16-bit Montgomery multiplications in parallel.
 5652   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5653                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5654     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5655     // It will assert that the register use is valid
 5656     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5657   }
 5658 
 5659   // Perform 64 16-bit Montgomery multiplications in parallel.
 5660   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5661                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5662     // Schedule two successive 4x8H multiplies via the montmul helper
 5663     // on the front and back halves of va, vb and vc. The helper will
 5664     // assert that the register use has no overlap conflicts on each
 5665     // individual call but we also need to ensure that the necessary
 5666     // disjoint/equality constraints are met across both calls.
 5667 
 5668     // vb, vc, vtmp and vq must be disjoint. va must either be
 5669     // disjoint from all other registers or equal vc
 5670 
 5671     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5672     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5673     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5674 
 5675     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5676     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5677 
 5678     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5679 
 5680     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5681     assert(vs_disjoint(va, vb), "va and vb overlap");
 5682     assert(vs_disjoint(va, vq), "va and vq overlap");
 5683     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5684 
 5685     // we multiply the front and back halves of each sequence 4 at a
 5686     // time because
 5687     //
 5688     // 1) we are currently only able to get 4-way instruction
 5689     // parallelism at best
 5690     //
 5691     // 2) we need registers for the constants in vq and temporary
 5692     // scratch registers to hold intermediate results so vtmp can only
 5693     // be a VSeq<4> which means we only have 4 scratch slots
 5694 
 5695     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5696     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5697   }
 5698 
 5699   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5700                                const VSeq<4>& vc,
 5701                                const VSeq<4>& vtmp,
 5702                                const VSeq<2>& vq) {
 5703     // compute a = montmul(a1, c)
 5704     kyber_montmul32(vc, va1, vc, vtmp, vq);
 5705     // ouptut a1 = a0 - a
 5706     vs_subv(va1, __ T8H, va0, vc);
 5707     //    and a0 = a0 + a
 5708     vs_addv(va0, __ T8H, va0, vc);
 5709   }
 5710 
 5711   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5712                                const VSeq<4>& vb,
 5713                                const VSeq<4>& vtmp1,
 5714                                const VSeq<4>& vtmp2,
 5715                                const VSeq<2>& vq) {
 5716     // compute c = a0 - a1
 5717     vs_subv(vtmp1, __ T8H, va0, va1);
 5718     // output a0 = a0 + a1
 5719     vs_addv(va0, __ T8H, va0, va1);
 5720     // output a1 = b montmul c
 5721     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5722   }
 5723 
 5724   void load64shorts(const VSeq<8>& v, Register shorts) {
 5725     vs_ldpq_post(v, shorts);
 5726   }
 5727 
 5728   void load32shorts(const VSeq<4>& v, Register shorts) {
 5729     vs_ldpq_post(v, shorts);
 5730   }
 5731 
 5732   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5733     vs_stpq_post(v, tmpAddr);
 5734   }
 5735 
 5736   // Kyber NTT function.
 5737   // Implements
 5738   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5739   //
 5740   // coeffs (short[256]) = c_rarg0
 5741   // ntt_zetas (short[256]) = c_rarg1
 5742   address generate_kyberNtt() {
 5743     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5744     int entry_count = StubInfo::entry_count(stub_id);
 5745     assert(entry_count == 1, "sanity check");
 5746     address start = load_archive_data(stub_id);
 5747     if (start != nullptr) {
 5748       return start;
 5749     }
 5750     __ align(CodeEntryAlignment);
 5751     StubCodeMark mark(this, stub_id);
 5752     start = __ pc();
 5753     __ enter();
 5754 
 5755     const Register coeffs = c_rarg0;
 5756     const Register zetas = c_rarg1;
 5757 
 5758     const Register kyberConsts = r10;
 5759     const Register tmpAddr = r11;
 5760 
 5761     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5762     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5763     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5764 
 5765     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5766     // load the montmul constants
 5767     vs_ldpq(vq, kyberConsts);
 5768 
 5769     // Each level corresponds to an iteration of the outermost loop of the
 5770     // Java method seilerNTT(int[] coeffs). There are some differences
 5771     // from what is done in the seilerNTT() method, though:
 5772     // 1. The computation is using 16-bit signed values, we do not convert them
 5773     // to ints here.
 5774     // 2. The zetas are delivered in a bigger array, 128 zetas are stored in
 5775     // this array for each level, it is easier that way to fill up the vector
 5776     // registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    // multiplications (this is because that way there should not be any
    // overflow during the inverse NTT computation), here we use R = 2^16 so
    // that we can use the 16-bit arithmetic in the vector unit.
 5781     //
 5782     // On each level, we fill up the vector registers in such a way that the
 5783     // array elements that need to be multiplied by the zetas go into one
 5784     // set of vector registers while the corresponding ones that don't need to
 5785     // be multiplied, go into another set.
 5786     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5787     // registers interleaving the steps of 4 identical computations,
 5788     // each done on 8 16-bit values per register.
 5789 
 5790     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5791     // to the zetas occur in discrete blocks whose size is some multiple
 5792     // of 32.
 5793 
 5794     // level 0
 5795     __ add(tmpAddr, coeffs, 256);
 5796     load64shorts(vs1, tmpAddr);
 5797     load64shorts(vs2, zetas);
 5798     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5799     __ add(tmpAddr, coeffs, 0);
 5800     load64shorts(vs1, tmpAddr);
 5801     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5802     vs_addv(vs1, __ T8H, vs1, vs2);
 5803     __ add(tmpAddr, coeffs, 0);
 5804     vs_stpq_post(vs1, tmpAddr);
 5805     __ add(tmpAddr, coeffs, 256);
 5806     vs_stpq_post(vs3, tmpAddr);
 5807     // restore montmul constants
 5808     vs_ldpq(vq, kyberConsts);
 5809     load64shorts(vs1, tmpAddr);
 5810     load64shorts(vs2, zetas);
 5811     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5812     __ add(tmpAddr, coeffs, 128);
 5813     load64shorts(vs1, tmpAddr);
 5814     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5815     vs_addv(vs1, __ T8H, vs1, vs2);
 5816     __ add(tmpAddr, coeffs, 128);
 5817     store64shorts(vs1, tmpAddr);
 5818     __ add(tmpAddr, coeffs, 384);
 5819     store64shorts(vs3, tmpAddr);
 5820 
 5821     // level 1
 5822     // restore montmul constants
 5823     vs_ldpq(vq, kyberConsts);
 5824     __ add(tmpAddr, coeffs, 128);
 5825     load64shorts(vs1, tmpAddr);
 5826     load64shorts(vs2, zetas);
 5827     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5828     __ add(tmpAddr, coeffs, 0);
 5829     load64shorts(vs1, tmpAddr);
 5830     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5831     vs_addv(vs1, __ T8H, vs1, vs2);
 5832     __ add(tmpAddr, coeffs, 0);
 5833     store64shorts(vs1, tmpAddr);
 5834     store64shorts(vs3, tmpAddr);
 5835     vs_ldpq(vq, kyberConsts);
 5836     __ add(tmpAddr, coeffs, 384);
 5837     load64shorts(vs1, tmpAddr);
 5838     load64shorts(vs2, zetas);
 5839     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5840     __ add(tmpAddr, coeffs, 256);
 5841     load64shorts(vs1, tmpAddr);
 5842     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5843     vs_addv(vs1, __ T8H, vs1, vs2);
 5844     __ add(tmpAddr, coeffs, 256);
 5845     store64shorts(vs1, tmpAddr);
 5846     store64shorts(vs3, tmpAddr);
 5847 
 5848     // level 2
 5849     vs_ldpq(vq, kyberConsts);
 5850     int offsets1[4] = { 0, 32, 128, 160 };
 5851     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5852     load64shorts(vs2, zetas);
 5853     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5854     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5855     // kyber_subv_addv64();
 5856     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5857     vs_addv(vs1, __ T8H, vs1, vs2);
 5858     __ add(tmpAddr, coeffs, 0);
 5859     vs_stpq_post(vs_front(vs1), tmpAddr);
 5860     vs_stpq_post(vs_front(vs3), tmpAddr);
 5861     vs_stpq_post(vs_back(vs1), tmpAddr);
 5862     vs_stpq_post(vs_back(vs3), tmpAddr);
 5863     vs_ldpq(vq, kyberConsts);
 5864     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5865     load64shorts(vs2, zetas);
 5866     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5867     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5868     // kyber_subv_addv64();
 5869     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5870     vs_addv(vs1, __ T8H, vs1, vs2);
 5871     __ add(tmpAddr, coeffs, 256);
 5872     vs_stpq_post(vs_front(vs1), tmpAddr);
 5873     vs_stpq_post(vs_front(vs3), tmpAddr);
 5874     vs_stpq_post(vs_back(vs1), tmpAddr);
 5875     vs_stpq_post(vs_back(vs3), tmpAddr);
 5876 
 5877     // level 3
 5878     vs_ldpq(vq, kyberConsts);
 5879     int offsets2[4] = { 0, 64, 128, 192 };
 5880     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5881     load64shorts(vs2, zetas);
 5882     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5883     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5884     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5885     vs_addv(vs1, __ T8H, vs1, vs2);
 5886     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5887     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5888 
 5889     vs_ldpq(vq, kyberConsts);
 5890     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5891     load64shorts(vs2, zetas);
 5892     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5893     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5894     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5895     vs_addv(vs1, __ T8H, vs1, vs2);
 5896     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5897     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5898 
 5899     // level 4
    // At level 4 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
 5902 
 5903     vs_ldpq(vq, kyberConsts);
 5904     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5905     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5906     load64shorts(vs2, zetas);
 5907     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5908     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5909     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5910     vs_addv(vs1, __ T8H, vs1, vs2);
 5911     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5912     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5913 
 5914     vs_ldpq(vq, kyberConsts);
 5915     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5916     load64shorts(vs2, zetas);
 5917     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5918     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5919     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5920     vs_addv(vs1, __ T8H, vs1, vs2);
 5921     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5922     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5923 
 5924     // level 5
 5925     // At level 5 related coefficients occur in discrete blocks of size 8 so
 5926     // need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5927 
 5928     vs_ldpq(vq, kyberConsts);
 5929     int offsets4[4] = { 0, 32, 64, 96 };
 5930     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5931     load32shorts(vs_front(vs2), zetas);
 5932     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5933     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5934     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5935     load32shorts(vs_front(vs2), zetas);
 5936     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5937     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5938     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5939     load32shorts(vs_front(vs2), zetas);
 5940     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5941     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5942 
 5943     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5944     load32shorts(vs_front(vs2), zetas);
 5945     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5946     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5947 
 5948     // level 6
 5949     // At level 6 related coefficients occur in discrete blocks of size 4 so
 5950     // need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5951 
 5952     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5953     load32shorts(vs_front(vs2), zetas);
 5954     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5955     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5956     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5957     // __ ldpq(v18, v19, __ post(zetas, 32));
 5958     load32shorts(vs_front(vs2), zetas);
 5959     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5960     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5961 
 5962     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5963     load32shorts(vs_front(vs2), zetas);
 5964     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5965     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5966 
 5967     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5968     load32shorts(vs_front(vs2), zetas);
 5969     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5970     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5971 
 5972     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5973     __ mov(r0, zr); // return 0
 5974     __ ret(lr);
 5975 
 5976     // record the stub entry and end
 5977     store_archive_data(stub_id, start, __ pc());
 5978 
 5979     return start;
 5980   }
 5981 
 5982   // Kyber Inverse NTT function
 5983   // Implements
 5984   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5985   //
 5986   // coeffs (short[256]) = c_rarg0
 5987   // ntt_zetas (short[256]) = c_rarg1
 5988   address generate_kyberInverseNtt() {
 5989     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 5990     int entry_count = StubInfo::entry_count(stub_id);
 5991     assert(entry_count == 1, "sanity check");
 5992     address start = load_archive_data(stub_id);
 5993     if (start != nullptr) {
 5994       return start;
 5995     }
 5996     __ align(CodeEntryAlignment);
 5997     StubCodeMark mark(this, stub_id);
 5998     start = __ pc();
 5999     __ enter();
 6000 
 6001     const Register coeffs = c_rarg0;
 6002     const Register zetas = c_rarg1;
 6003 
 6004     const Register kyberConsts = r10;
 6005     const Register tmpAddr = r11;
 6006     const Register tmpAddr2 = c_rarg2;
 6007 
 6008     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 6009     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6010     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6011 
 6012     __ lea(kyberConsts,
 6013              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6014 
 6015     // level 0
 6016     // At level 0 related coefficients occur in discrete blocks of size 4 so
 6017     // need to be loaded interleaved using an ld2 operation with arrangement 4S.
 6018 
 6019     vs_ldpq(vq, kyberConsts);
 6020     int offsets4[4] = { 0, 32, 64, 96 };
 6021     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6022     load32shorts(vs_front(vs2), zetas);
 6023     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6024                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6025     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 6026     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6027     load32shorts(vs_front(vs2), zetas);
 6028     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6029                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6030     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 6031     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6032     load32shorts(vs_front(vs2), zetas);
 6033     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6034                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6035     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 6036     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6037     load32shorts(vs_front(vs2), zetas);
 6038     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6039                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6040     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 6041 
 6042     // level 1
 6043     // At level 1 related coefficients occur in discrete blocks of size 8 so
 6044     // need to be loaded interleaved using an ld2 operation with arrangement 2D.
 6045 
 6046     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6047     load32shorts(vs_front(vs2), zetas);
 6048     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6049                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6050     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 6051     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6052     load32shorts(vs_front(vs2), zetas);
 6053     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6054                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6055     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 6056 
 6057     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6058     load32shorts(vs_front(vs2), zetas);
 6059     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6060                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6061     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 6062     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6063     load32shorts(vs_front(vs2), zetas);
 6064     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 6065                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 6066     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 6067 
 6068     // level 2
 6069     // At level 2 coefficients occur in 8 discrete blocks of size 16
 6070     // so they are loaded using employing an ldr at 8 distinct offsets.
 6071 
 6072     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6073     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6074     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 6075     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6076     vs_subv(vs1, __ T8H, vs1, vs2);
 6077     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 6078     load64shorts(vs2, zetas);
 6079     vs_ldpq(vq, kyberConsts);
 6080     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6081     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 6082 
 6083     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6084     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 6085     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6086     vs_subv(vs1, __ T8H, vs1, vs2);
 6087     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 6088     load64shorts(vs2, zetas);
 6089     vs_ldpq(vq, kyberConsts);
 6090     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6091     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 6092 
 6093     // Barrett reduction at indexes where overflow may happen
 6094 
 6095     // load q and the multiplier for the Barrett reduction
 6096     __ add(tmpAddr, kyberConsts, 16);
 6097     vs_ldpq(vq, tmpAddr);
 6098 
 6099     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 6100     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 6101     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
 6102     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6103     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6104     vs_sshr(vs2, __ T8H, vs2, 11);
 6105     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6106     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 6107     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6108     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6109     vs_sshr(vs2, __ T8H, vs2, 11);
 6110     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6111     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 6112 
 6113     // level 3
 6114     // From level 3 upwards coefficients occur in discrete blocks whose size is
 6115     // some multiple of 32 so can be loaded using ldpq and suitable indexes.
 6116 
 6117     int offsets2[4] = { 0, 64, 128, 192 };
 6118     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 6119     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 6120     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6121     vs_subv(vs1, __ T8H, vs1, vs2);
 6122     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 6123     load64shorts(vs2, zetas);
 6124     vs_ldpq(vq, kyberConsts);
 6125     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6126     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 6127 
 6128     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 6129     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 6130     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6131     vs_subv(vs1, __ T8H, vs1, vs2);
 6132     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 6133     load64shorts(vs2, zetas);
 6134     vs_ldpq(vq, kyberConsts);
 6135     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6136     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 6137 
 6138     // level 4
 6139 
 6140     int offsets1[4] = { 0, 32, 128, 160 };
 6141     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 6142     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 6143     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6144     vs_subv(vs1, __ T8H, vs1, vs2);
 6145     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 6146     load64shorts(vs2, zetas);
 6147     vs_ldpq(vq, kyberConsts);
 6148     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6149     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 6150 
 6151     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 6152     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 6153     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6154     vs_subv(vs1, __ T8H, vs1, vs2);
 6155     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 6156     load64shorts(vs2, zetas);
 6157     vs_ldpq(vq, kyberConsts);
 6158     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6159     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 6160 
 6161     // level 5
 6162 
 6163     __ add(tmpAddr, coeffs, 0);
 6164     load64shorts(vs1, tmpAddr);
 6165     __ add(tmpAddr, coeffs, 128);
 6166     load64shorts(vs2, tmpAddr);
 6167     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6168     vs_subv(vs1, __ T8H, vs1, vs2);
 6169     __ add(tmpAddr, coeffs, 0);
 6170     store64shorts(vs3, tmpAddr);
 6171     load64shorts(vs2, zetas);
 6172     vs_ldpq(vq, kyberConsts);
 6173     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6174     __ add(tmpAddr, coeffs, 128);
 6175     store64shorts(vs2, tmpAddr);
 6176 
 6177     load64shorts(vs1, tmpAddr);
 6178     __ add(tmpAddr, coeffs, 384);
 6179     load64shorts(vs2, tmpAddr);
 6180     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6181     vs_subv(vs1, __ T8H, vs1, vs2);
 6182     __ add(tmpAddr, coeffs, 256);
 6183     store64shorts(vs3, tmpAddr);
 6184     load64shorts(vs2, zetas);
 6185     vs_ldpq(vq, kyberConsts);
 6186     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6187     __ add(tmpAddr, coeffs, 384);
 6188     store64shorts(vs2, tmpAddr);
 6189 
 6190     // Barrett reduction at indexes where overflow may happen
 6191 
 6192     // load q and the multiplier for the Barrett reduction
 6193     __ add(tmpAddr, kyberConsts, 16);
 6194     vs_ldpq(vq, tmpAddr);
 6195 
 6196     int offsets0[2] = { 0, 256 };
 6197     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 6198     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 6199     vs_sshr(vs2, __ T8H, vs2, 11);
 6200     vs_mlsv(vs1, __ T8H, vs2, vq1);
 6201     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 6202 
 6203     // level 6
 6204 
 6205     __ add(tmpAddr, coeffs, 0);
 6206     load64shorts(vs1, tmpAddr);
 6207     __ add(tmpAddr, coeffs, 256);
 6208     load64shorts(vs2, tmpAddr);
 6209     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6210     vs_subv(vs1, __ T8H, vs1, vs2);
 6211     __ add(tmpAddr, coeffs, 0);
 6212     store64shorts(vs3, tmpAddr);
 6213     load64shorts(vs2, zetas);
 6214     vs_ldpq(vq, kyberConsts);
 6215     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6216     __ add(tmpAddr, coeffs, 256);
 6217     store64shorts(vs2, tmpAddr);
 6218 
 6219     __ add(tmpAddr, coeffs, 128);
 6220     load64shorts(vs1, tmpAddr);
 6221     __ add(tmpAddr, coeffs, 384);
 6222     load64shorts(vs2, tmpAddr);
 6223     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 6224     vs_subv(vs1, __ T8H, vs1, vs2);
 6225     __ add(tmpAddr, coeffs, 128);
 6226     store64shorts(vs3, tmpAddr);
 6227     load64shorts(vs2, zetas);
 6228     vs_ldpq(vq, kyberConsts);
 6229     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 6230     __ add(tmpAddr, coeffs, 384);
 6231     store64shorts(vs2, tmpAddr);
 6232 
 6233     // multiply by 2^-n
 6234 
 6235     // load toMont(2^-n mod q)
 6236     __ add(tmpAddr, kyberConsts, 48);
 6237     __ ldr(v29, __ Q, tmpAddr);
 6238 
 6239     vs_ldpq(vq, kyberConsts);
 6240     __ add(tmpAddr, coeffs, 0);
 6241     load64shorts(vs1, tmpAddr);
 6242     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6243     __ add(tmpAddr, coeffs, 0);
 6244     store64shorts(vs2, tmpAddr);
 6245 
 6246     // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so
 6247     load64shorts(vs1, tmpAddr);
 6248     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6249     __ add(tmpAddr, coeffs, 128);
 6250     store64shorts(vs2, tmpAddr);
 6251 
 6252     // now tmpAddr contains coeffs + 256
 6253     load64shorts(vs1, tmpAddr);
 6254     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6255     __ add(tmpAddr, coeffs, 256);
 6256     store64shorts(vs2, tmpAddr);
 6257 
 6258     // now tmpAddr contains coeffs + 384
 6259     load64shorts(vs1, tmpAddr);
 6260     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 6261     __ add(tmpAddr, coeffs, 384);
 6262     store64shorts(vs2, tmpAddr);
 6263 
 6264     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6265     __ mov(r0, zr); // return 0
 6266     __ ret(lr);
 6267 
 6268     // record the stub entry and end
 6269     store_archive_data(stub_id, start, __ pc());
 6270 
 6271     return start;
 6272   }
 6273 
 6274   // Kyber multiply polynomials in the NTT domain.
 6275   // Implements
 6276   // static int implKyberNttMult(
 6277   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 6278   //
  // The algorithm used here differs from the one in the Java implementation:
  // it uses Montgomery multiplications instead of Barrett reduction, but the
  // end result modulo MLKEM_Q is the same. This is the Java equivalent of
  // this intrinsic implementation:
 6283   // static void implKyberNttMultJava(short[] result, short[] ntta, short[] nttb) {
 6284   //         for (int m = 0; m < ML_KEM_N / 2; m++) {
 6285   //             int a0 = ntta[2 * m];
 6286   //             int a1 = ntta[2 * m + 1];
 6287   //             int b0 = nttb[2 * m];
 6288   //             int b1 = nttb[2 * m + 1];
 6289   //             int r = montMul(a0, b0) +
 6290   //                     montMul(montMul(a1, b1), MONT_ZETAS_FOR_NTT_MULT[m]);
 6291   //             result[2 * m] = (short) montMul(r, MONT_R_SQUARE_MOD_Q);
 6292   //             result[2 * m + 1] = (short) montMul(
 6293   //                     (montMul(a0, b1) + montMul(a1, b0)), MONT_R_SQUARE_MOD_Q);
 6294   //          }
 6295   // }
 6296   //
 6297   // result (short[256]) = c_rarg0
 6298   // ntta (short[256]) = c_rarg1
 6299   // nttb (short[256]) = c_rarg2
 6300   // zetas (short[128]) = c_rarg3
  address generate_kyberNttMult() {
    // Generates the stub that multiplies two polynomials held in the NTT
    // domain. The emitted code is a single loop that consumes ntta, nttb
    // and zetas sequentially and writes result sequentially until 512
    // bytes (256 shorts) of output have been produced. It returns 0 in r0.
    StubId stub_id = StubId::stubgen_kyberNttMult_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      // an entry for this stub is already available, nothing to generate
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();
    __ enter();

    // incoming arguments
    const Register result = c_rarg0;
    const Register ntta = c_rarg1;
    const Register nttb = c_rarg2;
    const Register zetas = c_rarg3;

    // scratch registers: constants table address and loop end address
    const Register kyberConsts = r10;
    const Register limit = r11;

    // 4 sequences of 4 vector registers each (used as 8H vectors) for
    // inputs/outputs/tmps: vs1 = v0..v3, vs2 = v4..v7, vs3 = v16..v19,
    // vs4 = v20..v23. vs2 is only ever passed as scratch to the montmuls.
    VSeq<4> vs1(0), vs2(4);  // 4 sets of 4x8H inputs/outputs/tmps
    VSeq<4> vs3(16), vs4(20);
    VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
    VSeq<2> vz(28);          // pair of zetas
    VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ

    __ lea(kyberConsts,
             ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));

    Label kyberNttMult_loop;

    // the loop terminates once result has advanced by 512 bytes
    __ add(limit, result, 512);

    // load q and qinv
    vs_ldpq(vq, kyberConsts);

    // load R^2 mod q (to convert back from Montgomery representation)
    // n.b. this advances kyberConsts itself; the register is not used as
    // the table base again after this point
    __ add(kyberConsts, kyberConsts, 64);
    __ ldr(v27, __ Q, kyberConsts);

    __ BIND(kyberNttMult_loop);

    // load 16 zetas
    vs_ldpq_post(vz, zetas);

    // load 2 sets of 32 coefficients from the two input arrays
    // interleaved as shorts. i.e. pairs of shorts adjacent in memory
    // are striped across pairs of vector registers
    vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
    vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
    vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
    vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H

    // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
    // i.e. montmul the first and second halves of vs1 in order and
    // then with one sequence reversed storing the two results in vs3
    //
    // vs3[0] <- montmul(a0, b0)
    // vs3[1] <- montmul(a1, b1)
    // vs3[2] <- montmul(a0, b1)
    // vs3[3] <- montmul(a1, b0)
    kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
    kyber_montmul16(vs_back(vs3),
                    vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);

    // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
    // i.e. montmul the first and second halves of vs4 in order and
    // then with one sequence reversed storing the two results in vs1
    // (the inputs held in vs1 are no longer needed at this point)
    //
    // vs1[0] <- montmul(a2, b2)
    // vs1[1] <- montmul(a3, b3)
    // vs1[2] <- montmul(a2, b3)
    // vs1[3] <- montmul(a3, b2)
    kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
    kyber_montmul16(vs_back(vs1),
                    vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);

    // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
    // We can schedule two montmuls at a time if we use a suitable vector
    // sequence <vs3[1], vs1[1]>.
    // The two registers are not adjacent, so the sequence is built from
    // vs3[1] as base plus the (negative) distance between their encodings.
    int delta = vs1[1]->encoding() - vs3[1]->encoding();
    VSeq<2> vs5(vs3[1], delta);

    // vs3[1] <- montmul(montmul(a1, b1), z0)
    // vs1[1] <- montmul(montmul(a3, b3), z1)
    kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);

    // add results in pairs storing in vs3
    // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
    // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
    vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));

    // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
    // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
    vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));

    // vs1 <- montmul(vs3, montRSquareModQ)
    kyber_montmul32(vs1, vs3, vc, vs2, vq);

    // store back the two pairs of result vectors de-interleaved as 8H elements
    // i.e. storing each pairs of shorts striped across a register pair adjacent
    // in memory
    vs_st2_post(vs1, __ T8H, result);

    // loop until result reaches the end address computed before the loop
    __ cmp(result, limit);
    __ br(Assembler::NE, kyberNttMult_loop);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 6418 
 6419   // Kyber add 2 polynomials.
 6420   // Implements
 6421   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 6422   //
 6423   // result (short[256]) = c_rarg0
 6424   // a (short[256]) = c_rarg1
 6425   // b (short[256]) = c_rarg2
 6426   address generate_kyberAddPoly_2() {
 6427     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 6428     int entry_count = StubInfo::entry_count(stub_id);
 6429     assert(entry_count == 1, "sanity check");
 6430     address start = load_archive_data(stub_id);
 6431     if (start != nullptr) {
 6432       return start;
 6433     }
 6434     __ align(CodeEntryAlignment);
 6435     StubCodeMark mark(this, stub_id);
 6436     start = __ pc();
 6437     __ enter();
 6438 
 6439     const Register result = c_rarg0;
 6440     const Register a = c_rarg1;
 6441     const Register b = c_rarg2;
 6442 
 6443     const Register kyberConsts = r11;
 6444 
 6445     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 6446     // So, we can load, add and store the data in 3 groups of 11,
 6447     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 6448     // registers. A further constraint is that the mapping needs
 6449     // to skip callee saves. So, we allocate the register
 6450     // sequences using two 8 sequences, two 2 sequences and two
 6451     // single registers.
 6452     VSeq<8> vs1_1(0);
 6453     VSeq<2> vs1_2(16);
 6454     FloatRegister vs1_3 = v28;
 6455     VSeq<8> vs2_1(18);
 6456     VSeq<2> vs2_2(26);
 6457     FloatRegister vs2_3 = v29;
 6458 
 6459     // two constant vector sequences
 6460     VSeq<8> vc_1(31, 0);
 6461     VSeq<2> vc_2(31, 0);
 6462 
 6463     FloatRegister vc_3 = v31;
 6464     __ lea(kyberConsts,
 6465              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6466 
 6467     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6468     for (int i = 0; i < 3; i++) {
 6469       // load 80 or 88 values from a into vs1_1/2/3
 6470       vs_ldpq_post(vs1_1, a);
 6471       vs_ldpq_post(vs1_2, a);
 6472       if (i < 2) {
 6473         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6474       }
 6475       // load 80 or 88 values from b into vs2_1/2/3
 6476       vs_ldpq_post(vs2_1, b);
 6477       vs_ldpq_post(vs2_2, b);
 6478       if (i < 2) {
 6479         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6480       }
 6481       // sum 80 or 88 values across vs1 and vs2 into vs1
 6482       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6483       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6484       if (i < 2) {
 6485         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6486       }
 6487       // add constant to all 80 or 88 results
 6488       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6489       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6490       if (i < 2) {
 6491         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6492       }
 6493       // store 80 or 88 values
 6494       vs_stpq_post(vs1_1, result);
 6495       vs_stpq_post(vs1_2, result);
 6496       if (i < 2) {
 6497         __ str(vs1_3, __ Q, __ post(result, 16));
 6498       }
 6499     }
 6500 
 6501     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6502     __ mov(r0, zr); // return 0
 6503     __ ret(lr);
 6504 
 6505     // record the stub entry and end
 6506     store_archive_data(stub_id, start, __ pc());
 6507 
 6508     return start;
 6509   }
 6510 
 6511   // Kyber add 3 polynomials.
 6512   // Implements
 6513   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 6514   //
 6515   // result (short[256]) = c_rarg0
 6516   // a (short[256]) = c_rarg1
 6517   // b (short[256]) = c_rarg2
 6518   // c (short[256]) = c_rarg3
 6519   address generate_kyberAddPoly_3() {
 6520     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 6521     int entry_count = StubInfo::entry_count(stub_id);
 6522     assert(entry_count == 1, "sanity check");
 6523     address start = load_archive_data(stub_id);
 6524     if (start != nullptr) {
 6525       return start;
 6526     }
 6527     __ align(CodeEntryAlignment);
 6528     StubCodeMark mark(this, stub_id);
 6529     start = __ pc();
 6530     __ enter();
 6531 
 6532     const Register result = c_rarg0;
 6533     const Register a = c_rarg1;
 6534     const Register b = c_rarg2;
 6535     const Register c = c_rarg3;
 6536 
 6537     const Register kyberConsts = r11;
 6538 
 6539     // As above we sum 256 sets of values in total i.e. 32 x 8H
 6540     // quadwords.  So, we can load, add and store the data in 3
 6541     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6542     // of 10 or 11 registers. A further constraint is that the
 6543     // mapping needs to skip callee saves. So, we allocate the
 6544     // register sequences using two 8 sequences, two 2 sequences
 6545     // and two single registers.
 6546     VSeq<8> vs1_1(0);
 6547     VSeq<2> vs1_2(16);
 6548     FloatRegister vs1_3 = v28;
 6549     VSeq<8> vs2_1(18);
 6550     VSeq<2> vs2_2(26);
 6551     FloatRegister vs2_3 = v29;
 6552 
 6553     // two constant vector sequences
 6554     VSeq<8> vc_1(31, 0);
 6555     VSeq<2> vc_2(31, 0);
 6556 
 6557     FloatRegister vc_3 = v31;
 6558 
 6559     __ lea(kyberConsts,
 6560              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6561 
 6562     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6563     for (int i = 0; i < 3; i++) {
 6564       // load 80 or 88 values from a into vs1_1/2/3
 6565       vs_ldpq_post(vs1_1, a);
 6566       vs_ldpq_post(vs1_2, a);
 6567       if (i < 2) {
 6568         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6569       }
 6570       // load 80 or 88 values from b into vs2_1/2/3
 6571       vs_ldpq_post(vs2_1, b);
 6572       vs_ldpq_post(vs2_2, b);
 6573       if (i < 2) {
 6574         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6575       }
 6576       // sum 80 or 88 values across vs1 and vs2 into vs1
 6577       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6578       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6579       if (i < 2) {
 6580         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6581       }
 6582       // load 80 or 88 values from c into vs2_1/2/3
 6583       vs_ldpq_post(vs2_1, c);
 6584       vs_ldpq_post(vs2_2, c);
 6585       if (i < 2) {
 6586         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6587       }
 6588       // sum 80 or 88 values across vs1 and vs2 into vs1
 6589       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6590       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6591       if (i < 2) {
 6592         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6593       }
 6594       // add constant to all 80 or 88 results
 6595       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6596       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6597       if (i < 2) {
 6598         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6599       }
 6600       // store 80 or 88 values
 6601       vs_stpq_post(vs1_1, result);
 6602       vs_stpq_post(vs1_2, result);
 6603       if (i < 2) {
 6604         __ str(vs1_3, __ Q, __ post(result, 16));
 6605       }
 6606     }
 6607 
 6608     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6609     __ mov(r0, zr); // return 0
 6610     __ ret(lr);
 6611 
 6612     // record the stub entry and end
 6613     store_archive_data(stub_id, start, __ pc());
 6614 
 6615     return start;
 6616   }
 6617 
 6618   // Kyber parse XOF output to polynomial coefficient candidates
 6619   // or decodePoly(12, ...).
 6620   // Implements
 6621   // static int implKyber12To16(
 6622   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6623   //
 6624   // we assume that parsed and condensed are allocated such that for
 6625   // n = (parsedLength + 63) / 64
 6626   // n blocks of 96 bytes of input can be processed, i.e.
 6627   // index + n * 96 <= condensed.length and
 6628   // n * 64 <= parsed.length
 6629   //
 6630   // condensed (byte[]) = c_rarg0
 6631   // condensedIndex = c_rarg1
 6632   // parsed (short[]) = c_rarg2
 6633   // parsedLength = c_rarg3
  address generate_kyber12To16() {
    // Generates the stub that unpacks 12-bit values from condensed
    // into 16-bit shorts in parsed, 96 input bytes (64 outputs) per
    // loop iteration. The generated code always returns 0 in r0.
    StubId stub_id = StubId::stubgen_kyber12To16_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    // when load_archive_data supplies an entry address for this stub
    // use it as is rather than generating the code again
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    Label L_F00, L_loop;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();
    __ enter();

    const Register condensed = c_rarg0;
    const Register condensedOffs = c_rarg1;
    const Register parsed = c_rarg2;
    const Register parsedLength = c_rarg3;

    const Register tmpAddr = r11;

    // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
    // quadwords so we need a 6 vector sequence for the inputs.
    // Parsing produces 64 shorts, employing two 8 vector
    // sequences to store and combine the intermediate data.
    VSeq<6> vin(24);
    VSeq<8> va(0), vb(16);

    // the mask constant is emitted after the code of this stub (see
    // L_F00 below) and stays in v31 for the whole loop
    __ adr(tmpAddr, L_F00);
    __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
    // advance the input pointer to the first byte to be parsed
    // NOTE(review): condensedOffs and parsedLength are Java ints but
    // are consumed here with what look like full-width add/sub/cmp
    // operations -- confirm the caller passes them with clean upper
    // bits
    __ add(condensed, condensed, condensedOffs);

    __ BIND(L_loop);
    // load 96 (6 x 16B) byte values
    vs_ld3_post(vin, __ T16B, condensed);

    // The front half of sequence vin (vin[0], vin[1] and vin[2])
    // holds 48 (16x3) contiguous bytes from memory striped
    // horizontally across each of the 16 byte lanes. Equivalently,
    // that is 16 pairs of 12-bit integers. Likewise the back half
    // holds the next 48 bytes in the same arrangement.

    // Each vector in the front half can also be viewed as a vertical
    // strip across the 16 pairs of 12 bit integers. Each byte in
    // vin[0] stores the low 8 bits of the first int in a pair. Each
    // byte in vin[1] stores the high 4 bits of the first int and the
    // low 4 bits of the second int. Each byte in vin[2] stores the
    // high 8 bits of the second int. Likewise the vectors in second
    // half.

    // Converting the data to 16-bit shorts requires first of all
    // expanding each of the 6 x 16B vectors into 6 corresponding
    // pairs of 8H vectors. Mask, shift and add operations on the
    // resulting vector pairs can be used to combine 4 and 8 bit
    // parts of related 8H vector elements.
    //
    // The middle vectors (vin[1] and vin[4]) are actually expanded
    // twice, one copy manipulated to provide the high 4 bits
    // belonging to the first short in a pair (taken from the low
    // nibble of each byte) and another copy manipulated to provide
    // the low 4 bits belonging to the second short in a pair (taken
    // from the high nibble of each byte). This is why the vector
    // sequences va and vb used to hold the expanded 8H elements are
    // of length 8.

    // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
    // n.b. target elements 2 and 3 duplicate elements 4 and 5
    __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
    __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
    __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
    __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
    __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
    __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);

    // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
    // and vb[4:5]
    __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
    __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
    __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
    __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
    __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
    __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);

    // shift lo byte of copy 1 of the middle stripe into the high byte
    __ shl(va[2], __ T8H, va[2], 8);
    __ shl(va[3], __ T8H, va[3], 8);
    __ shl(vb[2], __ T8H, vb[2], 8);
    __ shl(vb[3], __ T8H, vb[3], 8);

    // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
    // time pre-shifted by 4 to ensure top bits of input 12-bit int
    // are in bit positions [4..11].
    __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
    __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
    __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
    __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);

    // isolate the hi 4 bits of the 1st 12-bit int in a pair in copy1
    // (the 0x0f00 mask keeps only bits [8..11]) and shift the lo 4
    // bits of the 2nd 12-bit int in a pair to the bottom of copy2
    __ andr(va[2], __ T16B, va[2], v31);
    __ andr(va[3], __ T16B, va[3], v31);
    __ ushr(va[4], __ T8H, va[4], 4);
    __ ushr(va[5], __ T8H, va[5], 4);
    __ andr(vb[2], __ T16B, vb[2], v31);
    __ andr(vb[3], __ T16B, vb[3], v31);
    __ ushr(vb[4], __ T8H, vb[4], 4);
    __ ushr(vb[5], __ T8H, vb[5], 4);

    // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
    // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
    // n.b. the ordering ensures: i) inputs are consumed before they
    // are overwritten ii) the order of 16-bit results across successive
    // pairs of vectors in va and then vb reflects the order of the
    // corresponding 12-bit inputs
    __ addv(va[0], __ T8H, va[0], va[2]);
    __ addv(va[2], __ T8H, va[1], va[3]);
    __ addv(va[1], __ T8H, va[4], va[6]);
    __ addv(va[3], __ T8H, va[5], va[7]);
    __ addv(vb[0], __ T8H, vb[0], vb[2]);
    __ addv(vb[2], __ T8H, vb[1], vb[3]);
    __ addv(vb[1], __ T8H, vb[4], vb[6]);
    __ addv(vb[3], __ T8H, vb[5], vb[7]);

    // store 64 results interleaved as shorts
    vs_st2_post(vs_front(va), __ T8H, parsed);
    vs_st2_post(vs_front(vb), __ T8H, parsed);

    // each pass produces 64 shorts so count parsedLength down by 64
    // and go round again while it is still positive. n.b. when
    // parsedLength is not a multiple of 64 the final pass still
    // reads a full 96 bytes and writes a full 64 shorts, hence the
    // array sizing assumption stated in the header comment
    __ sub(parsedLength, parsedLength, 64);
    __ cmp(parsedLength, (u1)0);
    __ br(Assembler::GT, L_loop);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    // bind label and generate constant data used by this stub
    // n.b. the two 64-bit words form the 16 byte mask loaded into v31
    // above; they are placed after the ret so are never executed
    __ BIND(L_F00);
    __ emit_int64(0x0f000f000f000f00);
    __ emit_int64(0x0f000f000f000f00);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
 6779 
 6780   // Kyber Barrett reduce function.
 6781   // Implements
 6782   // static int implKyberBarrettReduce(short[] coeffs) {}
 6783   //
 6784   // coeffs (short[256]) = c_rarg0
 6785   address generate_kyberBarrettReduce() {
 6786     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6787     int entry_count = StubInfo::entry_count(stub_id);
 6788     assert(entry_count == 1, "sanity check");
 6789     address start = load_archive_data(stub_id);
 6790     if (start != nullptr) {
 6791       return start;
 6792     }
 6793     __ align(CodeEntryAlignment);
 6794     StubCodeMark mark(this, stub_id);
 6795     start = __ pc();
 6796     __ enter();
 6797 
 6798     const Register coeffs = c_rarg0;
 6799 
 6800     const Register kyberConsts = r10;
 6801     const Register result = r11;
 6802 
 6803     // As above we process 256 sets of values in total i.e. 32 x
 6804     // 8H quadwords. So, we can load, add and store the data in 3
 6805     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6806     // of 10 or 11 registers. A further constraint is that the
 6807     // mapping needs to skip callee saves. So, we allocate the
 6808     // register sequences using two 8 sequences, two 2 sequences
 6809     // and two single registers.
 6810     VSeq<8> vs1_1(0);
 6811     VSeq<2> vs1_2(16);
 6812     FloatRegister vs1_3 = v28;
 6813     VSeq<8> vs2_1(18);
 6814     VSeq<2> vs2_2(26);
 6815     FloatRegister vs2_3 = v29;
 6816 
 6817     // we also need a pair of corresponding constant sequences
 6818 
 6819     VSeq<8> vc1_1(30, 0);
 6820     VSeq<2> vc1_2(30, 0);
 6821     FloatRegister vc1_3 = v30; // for kyber_q
 6822 
 6823     VSeq<8> vc2_1(31, 0);
 6824     VSeq<2> vc2_2(31, 0);
 6825     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6826 
 6827     __ add(result, coeffs, 0);
 6828     __ lea(kyberConsts,
 6829              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6830 
 6831     // load q and the multiplier for the Barrett reduction
 6832     __ add(kyberConsts, kyberConsts, 16);
 6833     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6834 
 6835     for (int i = 0; i < 3; i++) {
 6836       // load 80 or 88 coefficients
 6837       vs_ldpq_post(vs1_1, coeffs);
 6838       vs_ldpq_post(vs1_2, coeffs);
 6839       if (i < 2) {
 6840         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6841       }
 6842 
 6843       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6844       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6845       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6846       if (i < 2) {
 6847         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6848       }
 6849 
 6850       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6851       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6852       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6853       if (i < 2) {
 6854         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6855       }
 6856 
 6857       // vs1 <- vs1 - vs2 * kyber_q
 6858       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6859       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6860       if (i < 2) {
 6861         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6862       }
 6863 
 6864       vs_stpq_post(vs1_1, result);
 6865       vs_stpq_post(vs1_2, result);
 6866       if (i < 2) {
 6867         __ str(vs1_3, __ Q, __ post(result, 16));
 6868       }
 6869     }
 6870 
 6871     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6872     __ mov(r0, zr); // return 0
 6873     __ ret(lr);
 6874 
 6875     // record the stub entry and end
 6876     store_archive_data(stub_id, start, __ pc());
 6877 
 6878     return start;
 6879   }
 6880 
 6881 
 6882   // Dilithium-specific montmul helper routines that generate parallel
 6883   // code for, respectively, a single 4x4s vector sequence montmul or
 6884   // two such multiplies in a row.
 6885 
 6886   // Perform 16 32-bit Montgomery multiplications in parallel
  void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
                           const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // va     - sequence that receives the 4x4S products
    // vb, vc - the two 4x4S sequences that are multiplied
    // vtmp   - scratch sequence handed on to the helper
    // vq     - constants sequence; callers in this file load it from
    //          the Dilithium constants as the pair (qInv, q)
    //
    // Use the helper routine to schedule a 4x4S Montgomery multiply.
    // It will assert that the register use is valid
    vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
  }
 6893 
 6894   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6895   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6896                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6897     // Schedule two successive 4x4S multiplies via the montmul helper
 6898     // on the front and back halves of va, vb and vc. The helper will
 6899     // assert that the register use has no overlap conflicts on each
 6900     // individual call but we also need to ensure that the necessary
 6901     // disjoint/equality constraints are met across both calls.
 6902 
 6903     // vb, vc, vtmp and vq must be disjoint. va must either be
 6904     // disjoint from all other registers or equal vc
 6905 
 6906     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6907     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6908     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6909 
 6910     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6911     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6912 
 6913     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6914 
 6915     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6916     assert(vs_disjoint(va, vb), "va and vb overlap");
 6917     assert(vs_disjoint(va, vq), "va and vq overlap");
 6918     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6919 
 6920     // We multiply the front and back halves of each sequence 4 at a
 6921     // time because
 6922     //
 6923     // 1) we are currently only able to get 4-way instruction
 6924     // parallelism at best
 6925     //
 6926     // 2) we need registers for the constants in vq and temporary
 6927     // scratch registers to hold intermediate results so vtmp can only
 6928     // be a VSeq<4> which means we only have 4 scratch slots.
 6929 
 6930     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6931     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6932   }
 6933 
 6934   // Perform combined montmul then add/sub on 4x4S vectors.
  void dilithium_montmul16_sub_add(
          const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
          const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // With a = montmul(va1, vc), the generated code leaves
    //   va0 = va0 + a
    //   va1 = va0 - a   (using the value of va0 on entry)
    // n.b. vc is overwritten with the product a; vtmp is scratch and
    // vq supplies the constants for the multiply.

    // compute a = montmul(a1, c)
    dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
    // n.b. this has to precede the add, which overwrites a0
    vs_subv(va1, __ T4S, va0, vc);
    //    and a0 = a0 + a
    vs_addv(va0, __ T4S, va0, vc);
  }
 6945 
  // Perform combined add/sub then montmul on 4x4S vectors.
  void dilithium_sub_add_montmul16(
          const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
          const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
    // In terms of the values of va0 and va1 on entry, the generated
    // code leaves
    //   va0 = va0 + va1
    //   va1 = montmul(va0 - va1, vb)
    // n.b. vtmp1 is overwritten with the difference, vtmp2 is the
    // scratch sequence for the multiply and vq supplies its constants.

    // compute c = a0 - a1
    // n.b. the difference is parked in vtmp1 because the add below
    // overwrites a0
    vs_subv(vtmp1, __ T4S, va0, va1);
    // output a0 = a0 + a1
    vs_addv(va0, __ T4S, va0, va1);
    // output a1 = b montmul c
    dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
  }
 6957 
 6958   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6959   // in the Java implementation come in sequences of at least 8, so we
 6960   // can use ldpq to collect the corresponding data into pairs of vector
 6961   // registers.
 6962   // We collect the coefficients corresponding to the 'j+l' indexes into
 6963   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6964   // then we do the (Montgomery) multiplications by the zetas in parallel
 6965   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6966   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6967   // v0-v7 and finally save the results back to the coeffs array.
 6968   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6969     const Register coeffs, const Register zetas) {
 6970     int c1 = 0;
 6971     int c2 = 512;
 6972     int startIncr;
 6973     // don't use callee save registers v8 - v15
 6974     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6975     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6976     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6977     int offsets[4] = { 0, 32, 64, 96 };
 6978 
 6979     for (int level = 0; level < 5; level++) {
 6980       int c1Start = c1;
 6981       int c2Start = c2;
 6982       if (level == 3) {
 6983         offsets[1] = 32;
 6984         offsets[2] = 128;
 6985         offsets[3] = 160;
 6986       } else if (level == 4) {
 6987         offsets[1] = 64;
 6988         offsets[2] = 128;
 6989         offsets[3] = 192;
 6990       }
 6991 
 6992       // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
 6993       // time at 4 different offsets and multiply them in order by the
 6994       // next set of input values. So we employ indexed load and store
 6995       // pair instructions with arrangement 4S.
 6996       for (int i = 0; i < 4; i++) {
 6997         // reload q and qinv
 6998         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6999         // load 8x4S coefficients via second start pos == c2
 7000         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 7001         // load next 8x4S inputs == b
 7002         vs_ldpq_post(vs2, zetas);
 7003         // compute a == c2 * b mod MONT_Q
 7004         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7005         // load 8x4s coefficients via first start pos == c1
 7006         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 7007         // compute a1 =  c1 + a
 7008         vs_addv(vs3, __ T4S, vs1, vs2);
 7009         // compute a2 =  c1 - a
 7010         vs_subv(vs1, __ T4S, vs1, vs2);
 7011         // output a1 and a2
 7012         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 7013         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 7014 
 7015         int k = 4 * level + i;
 7016 
 7017         if (k > 7) {
 7018           startIncr = 256;
 7019         } else if (k == 5) {
 7020           startIncr = 384;
 7021         } else {
 7022           startIncr = 128;
 7023         }
 7024 
 7025         c1Start += startIncr;
 7026         c2Start += startIncr;
 7027       }
 7028 
 7029       c2 /= 2;
 7030     }
 7031   }
 7032 
 7033   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 7034   // Implements the method
  // static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
  // of the Java class sun.security.provider.ML_DSA
 7037   //
 7038   // coeffs (int[256]) = c_rarg0
 7039   // zetas (int[256]) = c_rarg1
 7040   address generate_dilithiumAlmostNtt() {
 7041     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 7042     int entry_count = StubInfo::entry_count(stub_id);
 7043     assert(entry_count == 1, "sanity check");
 7044     address start = load_archive_data(stub_id);
 7045     if (start != nullptr) {
 7046       return start;
 7047     }
 7048     __ align(CodeEntryAlignment);
 7049     StubCodeMark mark(this, stub_id);
 7050     start = __ pc();
 7051     __ enter();
 7052 
 7053     const Register coeffs = c_rarg0;
 7054     const Register zetas = c_rarg1;
 7055 
 7056     const Register tmpAddr = r9;
 7057     const Register dilithiumConsts = r10;
 7058     const Register result = r11;
 7059     // don't use callee save registers v8 - v15
 7060     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7061     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 7062     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7063     int offsets[4] = { 0, 32, 64, 96};
 7064     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 7065     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 7066     __ add(result, coeffs, 0);
 7067     __ lea(dilithiumConsts,
 7068              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7069 
 7070     // Each level represents one iteration of the outer for loop of the Java version.
 7071 
 7072     // level 0-4
 7073     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 7074 
 7075     // level 5
 7076 
 7077     // At level 5 the coefficients we need to combine with the zetas
 7078     // are grouped in memory in blocks of size 4. So, for both sets of
 7079     // coefficients we load 4 adjacent values at 8 different offsets
 7080     // using an indexed ldr with register variant Q and multiply them
 7081     // in sequence order by the next set of inputs. Likewise we store
 7082     // the resuls using an indexed str with register variant Q.
 7083     for (int i = 0; i < 1024; i += 256) {
 7084       // reload constants q, qinv each iteration as they get clobbered later
 7085       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7086       // load 32 (8x4S) coefficients via first offsets = c1
 7087       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 7088       // load next 32 (8x4S) inputs = b
 7089       vs_ldpq_post(vs2, zetas);
 7090       // a = b montul c1
 7091       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7092       // load 32 (8x4S) coefficients via second offsets = c2
 7093       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 7094       // add/sub with result of multiply
 7095       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = a - c2
 7096       vs_subv(vs1, __ T4S, vs1, vs2);     // a0 = a + c1
 7097       // write back new coefficients using same offsets
 7098       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 7099       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 7100     }
 7101 
 7102     // level 6
 7103     // At level 6 the coefficients we need to combine with the zetas
 7104     // are grouped in memory in pairs, the first two being montmul
 7105     // inputs and the second add/sub inputs. We can still implement
 7106     // the montmul+sub+add using 4-way parallelism but only if we
 7107     // combine the coefficients with the zetas 16 at a time. We load 8
 7108     // adjacent values at 4 different offsets using an ld2 load with
 7109     // arrangement 2D. That interleaves the lower and upper halves of
 7110     // each pair of quadwords into successive vector registers. We
 7111     // then need to montmul the 4 even elements of the coefficients
 7112     // register sequence by the zetas in order and then add/sub the 4
 7113     // odd elements of the coefficients register sequence. We use an
 7114     // equivalent st2 operation to store the results back into memory
 7115     // de-interleaved.
 7116     for (int i = 0; i < 1024; i += 128) {
 7117       // reload constants q, qinv each iteration as they get clobbered later
 7118       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7119       // load interleaved 16 (4x2D) coefficients via offsets
 7120       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7121       // load next 16 (4x4S) inputs
 7122       vs_ldpq_post(vs_front(vs2), zetas);
 7123       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 7124       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 7125                                   vs_front(vs2), vtmp, vq);
 7126       // store interleaved 16 (4x2D) coefficients via offsets
 7127       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7128     }
 7129 
 7130     // level 7
 7131     // At level 7 the coefficients we need to combine with the zetas
 7132     // occur singly with montmul inputs alterating with add/sub
 7133     // inputs. Once again we can use 4-way parallelism to combine 16
 7134     // zetas at a time. However, we have to load 8 adjacent values at
 7135     // 4 different offsets using an ld2 load with arrangement 4S. That
 7136     // interleaves the the odd words of each pair into one
 7137     // coefficients vector register and the even words of the pair
 7138     // into the next register. We then need to montmul the 4 even
 7139     // elements of the coefficients register sequence by the zetas in
 7140     // order and then add/sub the 4 odd elements of the coefficients
 7141     // register sequence. We use an equivalent st2 operation to store
 7142     // the results back into memory de-interleaved.
 7143 
 7144     for (int i = 0; i < 1024; i += 128) {
 7145       // reload constants q, qinv each iteration as they get clobbered later
 7146       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7147       // load interleaved 16 (4x4S) coefficients via offsets
 7148       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7149       // load next 16 (4x4S) inputs
 7150       vs_ldpq_post(vs_front(vs2), zetas);
 7151       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 7152       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 7153                                   vs_front(vs2), vtmp, vq);
 7154       // store interleaved 16 (4x4S) coefficients via offsets
 7155       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7156     }
 7157     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7158     __ mov(r0, zr); // return 0
 7159     __ ret(lr);
 7160 
 7161     // record the stub entry and end
 7162     store_archive_data(stub_id, start, __ pc());
 7163 
 7164     return start;
 7165   }
 7166 
 7167   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 7168   // in the Java implementation come in sequences of at least 8, so we
 7169   // can use ldpq to collect the corresponding data into pairs of vector
 7170   // registers
 7171   // We collect the coefficients that correspond to the 'j's into vs1
  // the coefficients that correspond to the 'j+l's into vs2 then
 7173   // do the additions into vs3 and the subtractions into vs1 then
 7174   // save the result of the additions, load the zetas into vs2
 7175   // do the (Montgomery) multiplications by zeta in parallel into vs2
 7176   // finally save the results back to the coeffs array
 7177   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 7178     const Register coeffs, const Register zetas) {
 7179     int c1 = 0;
 7180     int c2 = 32;
 7181     int startIncr;
 7182     int offsets[4];
 7183     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7184     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 7185     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7186 
 7187     offsets[0] = 0;
 7188 
 7189     for (int level = 3; level < 8; level++) {
 7190       int c1Start = c1;
 7191       int c2Start = c2;
 7192       if (level == 3) {
 7193         offsets[1] = 64;
 7194         offsets[2] = 128;
 7195         offsets[3] = 192;
 7196       } else if (level == 4) {
 7197         offsets[1] = 32;
 7198         offsets[2] = 128;
 7199         offsets[3] = 160;
 7200       } else {
 7201         offsets[1] = 32;
 7202         offsets[2] = 64;
 7203         offsets[3] = 96;
 7204       }
 7205 
 7206       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 7207       // time at 4 different offsets and multiply them in order by the
 7208       // next set of input values. So we employ indexed load and store
 7209       // pair instructions with arrangement 4S.
 7210       for (int i = 0; i < 4; i++) {
 7211         // load v1 32 (8x4S) coefficients relative to first start index
 7212         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 7213         // load v2 32 (8x4S) coefficients relative to second start index
 7214         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 7215         // a0 = v1 + v2 -- n.b. clobbers vqs
 7216         vs_addv(vs3, __ T4S, vs1, vs2);
 7217         // a1 = v1 - v2
 7218         vs_subv(vs1, __ T4S, vs1, vs2);
 7219         // save a1 relative to first start index
 7220         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 7221         // load constants q, qinv each iteration as they get clobbered above
 7222         vs_ldpq(vq, dilithiumConsts); // qInv, q
 7223         // load b next 32 (8x4S) inputs
 7224         vs_ldpq_post(vs2, zetas);
 7225         // a = a1 montmul b
 7226         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7227         // save a relative to second start index
 7228         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 7229 
 7230         int k = 4 * level + i;
 7231 
 7232         if (k < 24) {
 7233           startIncr = 256;
 7234         } else if (k == 25) {
 7235           startIncr = 384;
 7236         } else {
 7237           startIncr = 128;
 7238         }
 7239 
 7240         c1Start += startIncr;
 7241         c2Start += startIncr;
 7242       }
 7243 
 7244       c2 *= 2;
 7245     }
 7246   }
 7247 
 7248   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 7249   // Implements the method
 7250   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 7251   // the sun.security.provider.ML_DSA class.
 7252   //
 7253   // coeffs (int[256]) = c_rarg0
 7254   // zetas (int[256]) = c_rarg1
 7255   address generate_dilithiumAlmostInverseNtt() {
 7256     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 7257     int entry_count = StubInfo::entry_count(stub_id);
 7258     assert(entry_count == 1, "sanity check");
 7259     address start = load_archive_data(stub_id);
 7260     if (start != nullptr) {
 7261       return start;
 7262     }
 7263     __ align(CodeEntryAlignment);
 7264     StubCodeMark mark(this, stub_id);
 7265     start = __ pc();
 7266     __ enter();
 7267 
 7268     const Register coeffs = c_rarg0;
 7269     const Register zetas = c_rarg1;
 7270 
 7271     const Register tmpAddr = r9;
 7272     const Register dilithiumConsts = r10;
 7273     const Register result = r11;
 7274     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7275     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 7276     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7277     int offsets[4] = { 0, 32, 64, 96 };
 7278     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 7279     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 7280 
 7281     __ add(result, coeffs, 0);
 7282     __ lea(dilithiumConsts,
 7283              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7284 
 7285     // Each level represents one iteration of the outer for loop of the Java version
 7286 
 7287     // level 0
 7288     // At level 0 we need to interleave adjacent quartets of
 7289     // coefficients before we multiply and add/sub by the next 16
 7290     // zetas just as we did for level 7 in the multiply code. So we
 7291     // load and store the values using an ld2/st2 with arrangement 4S.
 7292     for (int i = 0; i < 1024; i += 128) {
 7293       // load constants q, qinv
 7294       // n.b. this can be moved out of the loop as they do not get
 7295       // clobbered by first two loops
 7296       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7297       // a0/a1 load interleaved 32 (8x4S) coefficients
 7298       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7299       // b load next 32 (8x4S) inputs
 7300       vs_ldpq_post(vs_front(vs2), zetas);
 7301       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 7302       // n.b. second half of vs2 provides temporary register storage
 7303       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 7304                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 7305       // a0/a1 store interleaved 32 (8x4S) coefficients
 7306       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 7307     }
 7308 
 7309     // level 1
 7310     // At level 1 we need to interleave pairs of adjacent pairs of
 7311     // coefficients before we multiply by the next 16 zetas just as we
 7312     // did for level 6 in the multiply code. So we load and store the
 7313     // values an ld2/st2 with arrangement 2D.
 7314     for (int i = 0; i < 1024; i += 128) {
 7315       // a0/a1 load interleaved 32 (8x2D) coefficients
 7316       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7317       // b load next 16 (4x4S) inputs
 7318       vs_ldpq_post(vs_front(vs2), zetas);
 7319       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 7320       // n.b. second half of vs2 provides temporary register storage
 7321       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 7322                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 7323       // a0/a1 store interleaved 32 (8x2D) coefficients
 7324       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 7325     }
 7326 
 7327     // level 2
 7328     // At level 2 coefficients come in blocks of 4. So, we load 4
 7329     // adjacent coefficients at 8 distinct offsets for both the first
 7330     // and second coefficient sequences, using an ldr with register
 7331     // variant Q then combine them with next set of 32 zetas. Likewise
 7332     // we store the results using an str with register variant Q.
 7333     for (int i = 0; i < 1024; i += 256) {
 7334       // c0 load 32 (8x4S) coefficients via first offsets
 7335       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 7336       // c1 load 32 (8x4S) coefficients via second offsets
 7337       vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2);
 7338       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 7339       vs_addv(vs3, __ T4S, vs1, vs2);
 7340       // c = c0 - c1
 7341       vs_subv(vs1, __ T4S, vs1, vs2);
 7342       // store a0 32 (8x4S) coefficients via first offsets
 7343       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 7344       // b load 32 (8x4S) next inputs
 7345       vs_ldpq_post(vs2, zetas);
 7346       // reload constants q, qinv -- they were clobbered earlier
 7347       vs_ldpq(vq, dilithiumConsts); // qInv, q
 7348       // compute a1 = b montmul c
 7349       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7350       // store a1 32 (8x4S) coefficients via second offsets
 7351       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 7352     }
 7353 
 7354     // level 3-7
 7355     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 7356 
 7357     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7358     __ mov(r0, zr); // return 0
 7359     __ ret(lr);
 7360 
 7361     // record the stub entry and end
 7362     store_archive_data(stub_id, start, __ pc());
 7363 
 7364     return start;
 7365   }
 7366 
 7367   // Dilithium multiply polynomials in the NTT domain.
 7368   // Straightforward implementation of the method
 7369   // static int implDilithiumNttMult(
  //              int[] result, int[] ntta, int[] nttb) {} of
 7371   // the sun.security.provider.ML_DSA class.
 7372   //
 7373   // result (int[256]) = c_rarg0
 7374   // poly1 (int[256]) = c_rarg1
 7375   // poly2 (int[256]) = c_rarg2
 7376   address generate_dilithiumNttMult() {
 7377     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 7378     int entry_count = StubInfo::entry_count(stub_id);
 7379     assert(entry_count == 1, "sanity check");
 7380     address start = load_archive_data(stub_id);
 7381     if (start != nullptr) {
 7382       return start;
 7383     }
 7384     __ align(CodeEntryAlignment);
 7385     StubCodeMark mark(this, stub_id);
 7386     start = __ pc();
 7387     __ enter();
 7388 
 7389     Label L_loop;
 7390 
 7391     const Register result = c_rarg0;
 7392     const Register poly1 = c_rarg1;
 7393     const Register poly2 = c_rarg2;
 7394 
 7395     const Register dilithiumConsts = r10;
 7396     const Register len = r11;
 7397 
 7398     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7399     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 7400     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7401     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 7402 
 7403     __ lea(dilithiumConsts,
 7404              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7405 
 7406     // load constants q, qinv
 7407     vs_ldpq(vq, dilithiumConsts); // qInv, q
 7408     // load constant rSquare into v29
 7409     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 7410 
 7411     __ mov(len, zr);
 7412     __ add(len, len, 1024);
 7413 
 7414     __ BIND(L_loop);
 7415 
 7416     // b load 32 (8x4S) next inputs from poly1
 7417     vs_ldpq_post(vs1, poly1);
 7418     // c load 32 (8x4S) next inputs from poly2
 7419     vs_ldpq_post(vs2, poly2);
 7420     // compute a = b montmul c
 7421     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 7422     // compute a = rsquare montmul a
 7423     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 7424     // save a 32 (8x4S) results
 7425     vs_stpq_post(vs2, result);
 7426 
 7427     __ sub(len, len, 128);
 7428     __ cmp(len, (u1)128);
 7429     __ br(Assembler::GE, L_loop);
 7430 
 7431     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7432     __ mov(r0, zr); // return 0
 7433     __ ret(lr);
 7434 
 7435     // record the stub entry and end
 7436     store_archive_data(stub_id, start, __ pc());
 7437 
 7438     return start;
 7439   }
 7440 
  // Dilithium Montgomery multiply an array by a constant.
 7442   // A straightforward implementation of the method
 7443   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  // of the sun.security.provider.ML_DSA class
 7445   //
 7446   // coeffs (int[256]) = c_rarg0
 7447   // constant (int) = c_rarg1
 7448   address generate_dilithiumMontMulByConstant() {
 7449     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 7450     int entry_count = StubInfo::entry_count(stub_id);
 7451     assert(entry_count == 1, "sanity check");
 7452     address start = load_archive_data(stub_id);
 7453     if (start != nullptr) {
 7454       return start;
 7455     }
 7456     __ align(CodeEntryAlignment);
 7457     StubCodeMark mark(this, stub_id);
 7458     start = __ pc();
 7459     __ enter();
 7460 
 7461     Label L_loop;
 7462 
 7463     const Register coeffs = c_rarg0;
 7464     const Register constant = c_rarg1;
 7465 
 7466     const Register dilithiumConsts = r10;
 7467     const Register result = r11;
 7468     const Register len = r12;
 7469 
 7470     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 7471     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 7472     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 7473     VSeq<8> vconst(29, 0);             // for montmul by constant
 7474 
 7475     // results track inputs
 7476     __ add(result, coeffs, 0);
 7477     __ lea(dilithiumConsts,
 7478              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7479 
 7480     // load constants q, qinv -- they do not get clobbered by first two loops
 7481     vs_ldpq(vq, dilithiumConsts); // qInv, q
 7482     // copy caller supplied constant across vconst
 7483     __ dup(vconst[0], __ T4S, constant);
 7484     __ mov(len, zr);
 7485     __ add(len, len, 1024);
 7486 
 7487     __ BIND(L_loop);
 7488 
 7489     // load next 32 inputs
 7490     vs_ldpq_post(vs2, coeffs);
 7491     // mont mul by constant
 7492     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 7493     // write next 32 results
 7494     vs_stpq_post(vs2, result);
 7495 
 7496     __ sub(len, len, 128);
 7497     __ cmp(len, (u1)128);
 7498     __ br(Assembler::GE, L_loop);
 7499 
 7500     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7501     __ mov(r0, zr); // return 0
 7502     __ ret(lr);
 7503 
 7504     // record the stub entry and end
 7505     store_archive_data(stub_id, start, __ pc());
 7506 
 7507     return start;
 7508   }
 7509 
 7510   // Dilithium decompose poly.
 7511   // Implements the method
  // static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
  //              int[] highPart, int twoGamma2, int multiplier) {}
 7513   // of the sun.security.provider.ML_DSA class
 7514   //
 7515   // input (int[256]) = c_rarg0
 7516   // lowPart (int[256]) = c_rarg1
 7517   // highPart (int[256]) = c_rarg2
 7518   // twoGamma2  (int) = c_rarg3
 7519   // multiplier (int) = c_rarg4
 7520   address generate_dilithiumDecomposePoly() {
 7521     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 7522     int entry_count = StubInfo::entry_count(stub_id);
 7523     assert(entry_count == 1, "sanity check");
 7524     address start = load_archive_data(stub_id);
 7525     if (start != nullptr) {
 7526       return start;
 7527     }
 7528     __ align(CodeEntryAlignment);
 7529     StubCodeMark mark(this, stub_id);
 7530     start = __ pc();
 7531     Label L_loop;
 7532 
 7533     const Register input = c_rarg0;
 7534     const Register lowPart = c_rarg1;
 7535     const Register highPart = c_rarg2;
 7536     const Register twoGamma2 = c_rarg3;
 7537     const Register multiplier = c_rarg4;
 7538 
 7539     const Register len = r9;
 7540     const Register dilithiumConsts = r10;
 7541     const Register tmp = r11;
 7542 
 7543     // 6 independent sets of 4x4s values
 7544     VSeq<4> vs1(0), vs2(4), vs3(8);
 7545     VSeq<4> vs4(12), vs5(16), vtmp(20);
 7546 
 7547     // 7 constants for cross-multiplying
 7548     VSeq<4> one(25, 0);
 7549     VSeq<4> qminus1(26, 0);
 7550     VSeq<4> g2(27, 0);
 7551     VSeq<4> twog2(28, 0);
 7552     VSeq<4> mult(29, 0);
 7553     VSeq<4> q(30, 0);
 7554     VSeq<4> qadd(31, 0);
 7555 
 7556     __ enter();
 7557 
 7558     __ lea(dilithiumConsts,
 7559              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7560 
 7561     // save callee-saved registers
 7562     __ stpd(v8, v9, __ pre(sp, -64));
 7563     __ stpd(v10, v11, Address(sp, 16));
 7564     __ stpd(v12, v13, Address(sp, 32));
 7565     __ stpd(v14, v15, Address(sp, 48));
 7566 
 7567     // populate constant registers
 7568     __ mov(tmp, zr);
 7569     __ add(tmp, tmp, 1);
 7570     __ dup(one[0], __ T4S, tmp); // 1
 7571     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 7572     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 7573     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 7574     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 7575     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 7576     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 7577 
 7578     __ mov(len, zr);
 7579     __ add(len, len, 1024);
 7580 
 7581     __ BIND(L_loop);
 7582 
 7583     // load next 4x4S inputs interleaved: rplus --> vs1
 7584     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7585 
 7586     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7587     vs_addv(vtmp, __ T4S, vs1, qadd);
 7588     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7589     vs_mulv(vtmp, __ T4S, vtmp, q);
 7590     vs_subv(vs1, __ T4S, vs1, vtmp);
 7591 
 7592     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7593     vs_sshr(vtmp, __ T4S, vs1, 31);
 7594     vs_andr(vtmp, vtmp, q);
 7595     vs_addv(vs1, __ T4S, vs1, vtmp);
 7596 
 7597     // quotient --> vs2
 7598     // int quotient = (rplus * multiplier) >> 22;
 7599     vs_mulv(vtmp, __ T4S, vs1, mult);
 7600     vs_sshr(vs2, __ T4S, vtmp, 22);
 7601 
 7602     // r0 --> vs3
 7603     // int r0 = rplus - quotient * twoGamma2;
 7604     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7605     vs_subv(vs3, __ T4S, vs1, vtmp);
 7606 
 7607     // mask --> vs4
 7608     // int mask = (twoGamma2 - r0) >> 22;
 7609     vs_subv(vtmp, __ T4S, twog2, vs3);
 7610     vs_sshr(vs4, __ T4S, vtmp, 22);
 7611 
 7612     // r0 -= (mask & twoGamma2);
 7613     vs_andr(vtmp, vs4, twog2);
 7614     vs_subv(vs3, __ T4S, vs3, vtmp);
 7615 
 7616     //  quotient += (mask & 1);
 7617     vs_andr(vtmp, vs4, one);
 7618     vs_addv(vs2, __ T4S, vs2, vtmp);
 7619 
 7620     // mask = (twoGamma2 / 2 - r0) >> 31;
 7621     vs_subv(vtmp, __ T4S, g2, vs3);
 7622     vs_sshr(vs4, __ T4S, vtmp, 31);
 7623 
 7624     // r0 -= (mask & twoGamma2);
 7625     vs_andr(vtmp, vs4, twog2);
 7626     vs_subv(vs3, __ T4S, vs3, vtmp);
 7627 
 7628     // quotient += (mask & 1);
 7629     vs_andr(vtmp, vs4, one);
 7630     vs_addv(vs2, __ T4S, vs2, vtmp);
 7631 
 7632     // r1 --> vs5
 7633     // int r1 = rplus - r0 - (dilithium_q - 1);
 7634     vs_subv(vtmp, __ T4S, vs1, vs3);
 7635     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7636 
 7637     // r1 --> vs1 (overwriting rplus)
 7638     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7639     vs_negr(vtmp, __ T4S, vs5);
 7640     vs_orr(vtmp, vs5, vtmp);
 7641     vs_sshr(vs1, __ T4S, vtmp, 31);
 7642 
 7643     // r0 += ~r1;
 7644     vs_notr(vtmp, vs1);
 7645     vs_addv(vs3, __ T4S, vs3, vtmp);
 7646 
 7647     // r1 = r1 & quotient;
 7648     vs_andr(vs1, vs2, vs1);
 7649 
 7650     // store results inteleaved
 7651     // lowPart[m] = r0;
 7652     // highPart[m] = r1;
 7653     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7654     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7655 
 7656     __ sub(len, len, 64);
 7657     __ cmp(len, (u1)64);
 7658     __ br(Assembler::GE, L_loop);
 7659 
 7660     // restore callee-saved vector registers
 7661     __ ldpd(v14, v15, Address(sp, 48));
 7662     __ ldpd(v12, v13, Address(sp, 32));
 7663     __ ldpd(v10, v11, Address(sp, 16));
 7664     __ ldpd(v8, v9, __ post(sp, 64));
 7665 
 7666     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7667     __ mov(r0, zr); // return 0
 7668     __ ret(lr);
 7669 
 7670     // record the stub entry and end
 7671     store_archive_data(stub_id, start, __ pc());
 7672 
 7673     return start;
 7674   }
 7675 
 7676   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7677              Register tmp0, Register tmp1, Register tmp2) {
 7678     __ bic(tmp0, a2, a1); // for a0
 7679     __ bic(tmp1, a3, a2); // for a1
 7680     __ bic(tmp2, a4, a3); // for a2
 7681     __ eor(a2, a2, tmp2);
 7682     __ bic(tmp2, a0, a4); // for a3
 7683     __ eor(a3, a3, tmp2);
 7684     __ bic(tmp2, a1, a0); // for a4
 7685     __ eor(a0, a0, tmp0);
 7686     __ eor(a1, a1, tmp1);
 7687     __ eor(a4, a4, tmp2);
 7688   }
 7689 
 7690   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7691                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7692                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7693                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7694                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7695                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7696                         Register tmp0, Register tmp1, Register tmp2) {
 7697     __ eor3(tmp1, a4, a9, a14);
 7698     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7699     __ eor3(tmp2, a1, a6, a11);
 7700     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7701     __ rax1(tmp2, tmp0, tmp1); // d0
 7702     {
 7703 
 7704       Register tmp3, tmp4;
 7705       if (can_use_fp && can_use_r18) {
 7706         tmp3 = rfp;
 7707         tmp4 = r18_tls;
 7708       } else {
 7709         tmp3 = a4;
 7710         tmp4 = a9;
 7711         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7712       }
 7713 
 7714       __ eor3(tmp3, a0, a5, a10);
 7715       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7716       __ eor(a0, a0, tmp2);
 7717       __ eor(a5, a5, tmp2);
 7718       __ eor(a10, a10, tmp2);
 7719       __ eor(a15, a15, tmp2);
 7720       __ eor(a20, a20, tmp2); // d0(tmp2)
 7721       __ eor3(tmp3, a2, a7, a12);
 7722       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7723       __ rax1(tmp3, tmp4, tmp2); // d1
 7724       __ eor(a1, a1, tmp3);
 7725       __ eor(a6, a6, tmp3);
 7726       __ eor(a11, a11, tmp3);
 7727       __ eor(a16, a16, tmp3);
 7728       __ eor(a21, a21, tmp3); // d1(tmp3)
 7729       __ rax1(tmp3, tmp2, tmp0); // d3
 7730       __ eor3(tmp2, a3, a8, a13);
 7731       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7732       __ eor(a3, a3, tmp3);
 7733       __ eor(a8, a8, tmp3);
 7734       __ eor(a13, a13, tmp3);
 7735       __ eor(a18, a18, tmp3);
 7736       __ eor(a23, a23, tmp3);
 7737       __ rax1(tmp2, tmp1, tmp0); // d2
 7738       __ eor(a2, a2, tmp2);
 7739       __ eor(a7, a7, tmp2);
 7740       __ eor(a12, a12, tmp2);
 7741       __ rax1(tmp0, tmp0, tmp4); // d4
 7742       if (!can_use_fp || !can_use_r18) {
 7743         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7744       }
 7745       __ eor(a17, a17, tmp2);
 7746       __ eor(a22, a22, tmp2);
 7747       __ eor(a4, a4, tmp0);
 7748       __ eor(a9, a9, tmp0);
 7749       __ eor(a14, a14, tmp0);
 7750       __ eor(a19, a19, tmp0);
 7751       __ eor(a24, a24, tmp0);
 7752     }
 7753 
 7754     __ rol(tmp0, a10, 3);
 7755     __ rol(a10, a1, 1);
 7756     __ rol(a1, a6, 44);
 7757     __ rol(a6, a9, 20);
 7758     __ rol(a9, a22, 61);
 7759     __ rol(a22, a14, 39);
 7760     __ rol(a14, a20, 18);
 7761     __ rol(a20, a2, 62);
 7762     __ rol(a2, a12, 43);
 7763     __ rol(a12, a13, 25);
 7764     __ rol(a13, a19, 8) ;
 7765     __ rol(a19, a23, 56);
 7766     __ rol(a23, a15, 41);
 7767     __ rol(a15, a4, 27);
 7768     __ rol(a4, a24, 14);
 7769     __ rol(a24, a21, 2);
 7770     __ rol(a21, a8, 55);
 7771     __ rol(a8, a16, 45);
 7772     __ rol(a16, a5, 36);
 7773     __ rol(a5, a3, 28);
 7774     __ rol(a3, a18, 21);
 7775     __ rol(a18, a17, 15);
 7776     __ rol(a17, a11, 10);
 7777     __ rol(a11, a7, 6);
 7778     __ mov(a7, tmp0);
 7779 
 7780     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7781     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7782     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7783     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7784     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7785 
 7786     __ ldr(tmp1, __ post(rc, 8));
 7787     __ eor(a0, a0, tmp1);
 7788 
 7789   }
 7790 
 7791   // Arguments:
 7792   //
 7793   // Inputs:
 7794   //   c_rarg0   - byte[]  source+offset
 7795   //   c_rarg1   - byte[]  SHA.state
 7796   //   c_rarg2   - int     block_size
 7797   //   c_rarg3   - int     offset
 7798   //   c_rarg4   - int     limit
 7799   //
 7800   address generate_sha3_implCompress_gpr(StubId stub_id) {
 7801     bool multi_block;
 7802     switch (stub_id) {
 7803     case StubId::stubgen_sha3_implCompress_id:
 7804       multi_block = false;
 7805       break;
 7806     case StubId::stubgen_sha3_implCompressMB_id:
 7807       multi_block = true;
 7808       break;
 7809     default:
 7810       ShouldNotReachHere();
 7811     }
 7812     int entry_count = StubInfo::entry_count(stub_id);
 7813     assert(entry_count == 1, "sanity check");
 7814     address start = load_archive_data(stub_id);
 7815     if (start != nullptr) {
 7816       return start;
 7817     }
 7818     __ align(CodeEntryAlignment);
 7819     StubCodeMark mark(this, stub_id);
 7820     start = __ pc();
 7821 
 7822     Register buf           = c_rarg0;
 7823     Register state         = c_rarg1;
 7824     Register block_size    = c_rarg2;
 7825     Register ofs           = c_rarg3;
 7826     Register limit         = c_rarg4;
 7827 
 7828     // use r3.r17,r19..r28 to keep a0..a24.
 7829     // a0..a24 are respective locals from SHA3.java
 7830     Register a0 = r25,
 7831              a1 = r26,
 7832              a2 = r27,
 7833              a3 = r3,
 7834              a4 = r4,
 7835              a5 = r5,
 7836              a6 = r6,
 7837              a7 = r7,
 7838              a8 = rscratch1, // r8
 7839              a9 = rscratch2, // r9
 7840              a10 = r10,
 7841              a11 = r11,
 7842              a12 = r12,
 7843              a13 = r13,
 7844              a14 = r14,
 7845              a15 = r15,
 7846              a16 = r16,
 7847              a17 = r17,
 7848              a18 = r28,
 7849              a19 = r19,
 7850              a20 = r20,
 7851              a21 = r21,
 7852              a22 = r22,
 7853              a23 = r23,
 7854              a24 = r24;
 7855 
 7856     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7857 
 7858     Label sha3_loop, rounds24_preloop, loop_body;
 7859     Label sha3_512_or_sha3_384, shake128;
 7860 
 7861     bool can_use_r18 = false;
 7862 #ifndef R18_RESERVED
 7863     can_use_r18 = true;
 7864 #endif
 7865     bool can_use_fp = !PreserveFramePointer;
 7866 
 7867     __ enter();
 7868 
 7869     // save almost all yet unsaved gpr registers on stack
 7870     __ str(block_size, __ pre(sp, -128));
 7871     if (multi_block) {
 7872       __ stpw(ofs, limit, Address(sp, 8));
 7873     }
 7874     // 8 bytes at sp+16 will be used to keep buf
 7875     __ stp(r19, r20, Address(sp, 32));
 7876     __ stp(r21, r22, Address(sp, 48));
 7877     __ stp(r23, r24, Address(sp, 64));
 7878     __ stp(r25, r26, Address(sp, 80));
 7879     __ stp(r27, r28, Address(sp, 96));
 7880     if (can_use_r18 && can_use_fp) {
 7881       __ stp(r18_tls, state, Address(sp, 112));
 7882     } else {
 7883       __ str(state, Address(sp, 112));
 7884     }
 7885 
 7886     // begin sha3 calculations: loading a0..a24 from state arrary
 7887     __ ldp(a0, a1, state);
 7888     __ ldp(a2, a3, Address(state, 16));
 7889     __ ldp(a4, a5, Address(state, 32));
 7890     __ ldp(a6, a7, Address(state, 48));
 7891     __ ldp(a8, a9, Address(state, 64));
 7892     __ ldp(a10, a11, Address(state, 80));
 7893     __ ldp(a12, a13, Address(state, 96));
 7894     __ ldp(a14, a15, Address(state, 112));
 7895     __ ldp(a16, a17, Address(state, 128));
 7896     __ ldp(a18, a19, Address(state, 144));
 7897     __ ldp(a20, a21, Address(state, 160));
 7898     __ ldp(a22, a23, Address(state, 176));
 7899     __ ldr(a24, Address(state, 192));
 7900 
 7901     __ BIND(sha3_loop);
 7902 
 7903     // load input
 7904     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7905     __ eor(a0, a0, tmp3);
 7906     __ eor(a1, a1, tmp2);
 7907     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7908     __ eor(a2, a2, tmp3);
 7909     __ eor(a3, a3, tmp2);
 7910     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7911     __ eor(a4, a4, tmp3);
 7912     __ eor(a5, a5, tmp2);
 7913     __ ldr(tmp3, __ post(buf, 8));
 7914     __ eor(a6, a6, tmp3);
 7915 
 7916     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7917     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7918 
 7919     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7920     __ eor(a7, a7, tmp3);
 7921     __ eor(a8, a8, tmp2);
 7922     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7923     __ eor(a9, a9, tmp3);
 7924     __ eor(a10, a10, tmp2);
 7925     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7926     __ eor(a11, a11, tmp3);
 7927     __ eor(a12, a12, tmp2);
 7928     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7929     __ eor(a13, a13, tmp3);
 7930     __ eor(a14, a14, tmp2);
 7931     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7932     __ eor(a15, a15, tmp3);
 7933     __ eor(a16, a16, tmp2);
 7934 
 7935     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7936     __ andw(tmp2, block_size, 48);
 7937     __ cbzw(tmp2, rounds24_preloop);
 7938     __ tbnz(block_size, 5, shake128);
 7939     // block_size == 144, bit5 == 0, SHA3-244
 7940     __ ldr(tmp3, __ post(buf, 8));
 7941     __ eor(a17, a17, tmp3);
 7942     __ b(rounds24_preloop);
 7943 
 7944     __ BIND(shake128);
 7945     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7946     __ eor(a17, a17, tmp3);
 7947     __ eor(a18, a18, tmp2);
 7948     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7949     __ eor(a19, a19, tmp3);
 7950     __ eor(a20, a20, tmp2);
 7951     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7952 
 7953     __ BIND(sha3_512_or_sha3_384);
 7954     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7955     __ eor(a7, a7, tmp3);
 7956     __ eor(a8, a8, tmp2);
 7957     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7958 
 7959     // SHA3-384
 7960     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7961     __ eor(a9, a9, tmp3);
 7962     __ eor(a10, a10, tmp2);
 7963     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7964     __ eor(a11, a11, tmp3);
 7965     __ eor(a12, a12, tmp2);
 7966 
 7967     __ BIND(rounds24_preloop);
 7968     __ fmovs(v0, 24.0); // float loop counter,
 7969     __ fmovs(v1, 1.0);  // exact representation
 7970 
 7971     __ str(buf, Address(sp, 16));
 7972     __ lea(tmp3, ExternalAddress((address) _sha3_round_consts));
 7973 
 7974     __ BIND(loop_body);
 7975     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7976                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7977                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7978                      tmp0, tmp1, tmp2);
 7979     __ fsubs(v0, v0, v1);
 7980     __ fcmps(v0, 0.0);
 7981     __ br(__ NE, loop_body);
 7982 
 7983     if (multi_block) {
 7984       __ ldrw(block_size, sp); // block_size
 7985       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7986       __ addw(tmp2, tmp2, block_size);
 7987       __ cmpw(tmp2, tmp1);
 7988       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7989       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7990       __ br(Assembler::LE, sha3_loop);
 7991       __ movw(c_rarg0, tmp2); // return offset
 7992     }
 7993     if (can_use_fp && can_use_r18) {
 7994       __ ldp(r18_tls, state, Address(sp, 112));
 7995     } else {
 7996       __ ldr(state, Address(sp, 112));
 7997     }
 7998     // save calculated sha3 state
 7999     __ stp(a0, a1, Address(state));
 8000     __ stp(a2, a3, Address(state, 16));
 8001     __ stp(a4, a5, Address(state, 32));
 8002     __ stp(a6, a7, Address(state, 48));
 8003     __ stp(a8, a9, Address(state, 64));
 8004     __ stp(a10, a11, Address(state, 80));
 8005     __ stp(a12, a13, Address(state, 96));
 8006     __ stp(a14, a15, Address(state, 112));
 8007     __ stp(a16, a17, Address(state, 128));
 8008     __ stp(a18, a19, Address(state, 144));
 8009     __ stp(a20, a21, Address(state, 160));
 8010     __ stp(a22, a23, Address(state, 176));
 8011     __ str(a24, Address(state, 192));
 8012 
 8013     // restore required registers from stack
 8014     __ ldp(r19, r20, Address(sp, 32));
 8015     __ ldp(r21, r22, Address(sp, 48));
 8016     __ ldp(r23, r24, Address(sp, 64));
 8017     __ ldp(r25, r26, Address(sp, 80));
 8018     __ ldp(r27, r28, Address(sp, 96));
 8019     if (can_use_fp && can_use_r18) {
 8020       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 8021     } // else no need to recalculate rfp, since it wasn't changed
 8022 
 8023     __ leave();
 8024 
 8025     __ ret(lr);
 8026 
 8027     // record the stub entry and end
 8028     store_archive_data(stub_id, start, __ pc());
 8029 
 8030     return start;
 8031   }
 8032 
 8033   /**
 8034    *  Arguments:
 8035    *
 8036    * Inputs:
 8037    *   c_rarg0   - int crc
 8038    *   c_rarg1   - byte* buf
 8039    *   c_rarg2   - int length
 8040    *
 8041    * Output:
   *       r0   - int crc result
 8043    */
 8044   address generate_updateBytesCRC32() {
 8045     assert(UseCRC32Intrinsics, "what are we doing here?");
 8046     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 8047     int entry_count = StubInfo::entry_count(stub_id);
 8048     assert(entry_count == 1, "sanity check");
 8049     address start = load_archive_data(stub_id);
 8050     if (start != nullptr) {
 8051       return start;
 8052     }
 8053     __ align(CodeEntryAlignment);
 8054     StubCodeMark mark(this, stub_id);
 8055 
 8056     start = __ pc();
 8057 
 8058     const Register crc   = c_rarg0;  // crc
 8059     const Register buf   = c_rarg1;  // source java byte array address
 8060     const Register len   = c_rarg2;  // length
 8061     const Register table0 = c_rarg3; // crc_table address
 8062     const Register table1 = c_rarg4;
 8063     const Register table2 = c_rarg5;
 8064     const Register table3 = c_rarg6;
 8065     const Register tmp3 = c_rarg7;
 8066 
 8067     BLOCK_COMMENT("Entry:");
 8068     __ enter(); // required for proper stackwalking of RuntimeStub frame
 8069 
 8070     __ kernel_crc32(crc, buf, len,
 8071               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 8072 
 8073     __ leave(); // required for proper stackwalking of RuntimeStub frame
 8074     __ ret(lr);
 8075 
 8076     // record the stub entry and end
 8077     store_archive_data(stub_id, start, __ pc());
 8078 
 8079     return start;
 8080   }
 8081 
 8082   /**
 8083    *  Arguments:
 8084    *
 8085    * Inputs:
 8086    *   c_rarg0   - int crc
 8087    *   c_rarg1   - byte* buf
 8088    *   c_rarg2   - int length
 8089    *   c_rarg3   - int* table
 8090    *
 8091    * Output:
 8092    *       r0   - int crc result
 8093    */
 8094   address generate_updateBytesCRC32C() {
 8095     assert(UseCRC32CIntrinsics, "what are we doing here?");
 8096     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 8097     int entry_count = StubInfo::entry_count(stub_id);
 8098     assert(entry_count == 1, "sanity check");
 8099     address start = load_archive_data(stub_id);
 8100     if (start != nullptr) {
 8101       return start;
 8102     }
 8103     __ align(CodeEntryAlignment);
 8104     StubCodeMark mark(this, stub_id);
 8105 
 8106     start = __ pc();
 8107 
 8108     const Register crc   = c_rarg0;  // crc
 8109     const Register buf   = c_rarg1;  // source java byte array address
 8110     const Register len   = c_rarg2;  // length
 8111     const Register table0 = c_rarg3; // crc_table address
 8112     const Register table1 = c_rarg4;
 8113     const Register table2 = c_rarg5;
 8114     const Register table3 = c_rarg6;
 8115     const Register tmp3 = c_rarg7;
 8116 
 8117     BLOCK_COMMENT("Entry:");
 8118     __ enter(); // required for proper stackwalking of RuntimeStub frame
 8119 
 8120     __ kernel_crc32c(crc, buf, len,
 8121               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 8122 
 8123     __ leave(); // required for proper stackwalking of RuntimeStub frame
 8124     __ ret(lr);
 8125 
 8126     // record the stub entry and end
 8127     store_archive_data(stub_id, start, __ pc());
 8128 
 8129     return start;
 8130   }
 8131 
 8132   /***
 8133    *  Arguments:
 8134    *
 8135    *  Inputs:
 8136    *   c_rarg0   - int   adler
 8137    *   c_rarg1   - byte* buff
 8138    *   c_rarg2   - int   len
 8139    *
 8140    * Output:
 8141    *   c_rarg0   - int adler result
 8142    */
 8143   address generate_updateBytesAdler32() {
 8144     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 8145     int entry_count = StubInfo::entry_count(stub_id);
 8146     assert(entry_count == 1, "sanity check");
 8147     address start = load_archive_data(stub_id);
 8148     if (start != nullptr) {
 8149       return start;
 8150     }
 8151     __ align(CodeEntryAlignment);
 8152     StubCodeMark mark(this, stub_id);
 8153     start = __ pc();
 8154 
 8155     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 8156 
 8157     // Aliases
 8158     Register adler  = c_rarg0;
 8159     Register s1     = c_rarg0;
 8160     Register s2     = c_rarg3;
 8161     Register buff   = c_rarg1;
 8162     Register len    = c_rarg2;
 8163     Register nmax  = r4;
 8164     Register base  = r5;
 8165     Register count = r6;
 8166     Register temp0 = rscratch1;
 8167     Register temp1 = rscratch2;
 8168     FloatRegister vbytes = v0;
 8169     FloatRegister vs1acc = v1;
 8170     FloatRegister vs2acc = v2;
 8171     FloatRegister vtable = v3;
 8172 
 8173     // Max number of bytes we can process before having to take the mod
 8174     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 8175     uint64_t BASE = 0xfff1;
 8176     uint64_t NMAX = 0x15B0;
 8177 
 8178     __ mov(base, BASE);
 8179     __ mov(nmax, NMAX);
 8180 
 8181     // Load accumulation coefficients for the upper 16 bits
 8182     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 8183     __ ld1(vtable, __ T16B, Address(temp0));
 8184 
 8185     // s1 is initialized to the lower 16 bits of adler
 8186     // s2 is initialized to the upper 16 bits of adler
 8187     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 8188     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 8189 
 8190     // The pipelined loop needs at least 16 elements for 1 iteration
 8191     // It does check this, but it is more effective to skip to the cleanup loop
 8192     __ cmp(len, (u1)16);
 8193     __ br(Assembler::HS, L_nmax);
 8194     __ cbz(len, L_combine);
 8195 
 8196     __ bind(L_simple_by1_loop);
 8197     __ ldrb(temp0, Address(__ post(buff, 1)));
 8198     __ add(s1, s1, temp0);
 8199     __ add(s2, s2, s1);
 8200     __ subs(len, len, 1);
 8201     __ br(Assembler::HI, L_simple_by1_loop);
 8202 
 8203     // s1 = s1 % BASE
 8204     __ subs(temp0, s1, base);
 8205     __ csel(s1, temp0, s1, Assembler::HS);
 8206 
 8207     // s2 = s2 % BASE
 8208     __ lsr(temp0, s2, 16);
 8209     __ lsl(temp1, temp0, 4);
 8210     __ sub(temp1, temp1, temp0);
 8211     __ add(s2, temp1, s2, ext::uxth);
 8212 
 8213     __ subs(temp0, s2, base);
 8214     __ csel(s2, temp0, s2, Assembler::HS);
 8215 
 8216     __ b(L_combine);
 8217 
 8218     __ bind(L_nmax);
 8219     __ subs(len, len, nmax);
 8220     __ sub(count, nmax, 16);
 8221     __ br(Assembler::LO, L_by16);
 8222 
 8223     __ bind(L_nmax_loop);
 8224 
 8225     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 8226                                       vbytes, vs1acc, vs2acc, vtable);
 8227 
 8228     __ subs(count, count, 16);
 8229     __ br(Assembler::HS, L_nmax_loop);
 8230 
 8231     // s1 = s1 % BASE
 8232     __ lsr(temp0, s1, 16);
 8233     __ lsl(temp1, temp0, 4);
 8234     __ sub(temp1, temp1, temp0);
 8235     __ add(temp1, temp1, s1, ext::uxth);
 8236 
 8237     __ lsr(temp0, temp1, 16);
 8238     __ lsl(s1, temp0, 4);
 8239     __ sub(s1, s1, temp0);
 8240     __ add(s1, s1, temp1, ext:: uxth);
 8241 
 8242     __ subs(temp0, s1, base);
 8243     __ csel(s1, temp0, s1, Assembler::HS);
 8244 
 8245     // s2 = s2 % BASE
 8246     __ lsr(temp0, s2, 16);
 8247     __ lsl(temp1, temp0, 4);
 8248     __ sub(temp1, temp1, temp0);
 8249     __ add(temp1, temp1, s2, ext::uxth);
 8250 
 8251     __ lsr(temp0, temp1, 16);
 8252     __ lsl(s2, temp0, 4);
 8253     __ sub(s2, s2, temp0);
 8254     __ add(s2, s2, temp1, ext:: uxth);
 8255 
 8256     __ subs(temp0, s2, base);
 8257     __ csel(s2, temp0, s2, Assembler::HS);
 8258 
 8259     __ subs(len, len, nmax);
 8260     __ sub(count, nmax, 16);
 8261     __ br(Assembler::HS, L_nmax_loop);
 8262 
 8263     __ bind(L_by16);
 8264     __ adds(len, len, count);
 8265     __ br(Assembler::LO, L_by1);
 8266 
 8267     __ bind(L_by16_loop);
 8268 
 8269     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 8270                                       vbytes, vs1acc, vs2acc, vtable);
 8271 
 8272     __ subs(len, len, 16);
 8273     __ br(Assembler::HS, L_by16_loop);
 8274 
 8275     __ bind(L_by1);
 8276     __ adds(len, len, 15);
 8277     __ br(Assembler::LO, L_do_mod);
 8278 
 8279     __ bind(L_by1_loop);
 8280     __ ldrb(temp0, Address(__ post(buff, 1)));
 8281     __ add(s1, temp0, s1);
 8282     __ add(s2, s2, s1);
 8283     __ subs(len, len, 1);
 8284     __ br(Assembler::HS, L_by1_loop);
 8285 
 8286     __ bind(L_do_mod);
 8287     // s1 = s1 % BASE
 8288     __ lsr(temp0, s1, 16);
 8289     __ lsl(temp1, temp0, 4);
 8290     __ sub(temp1, temp1, temp0);
 8291     __ add(temp1, temp1, s1, ext::uxth);
 8292 
 8293     __ lsr(temp0, temp1, 16);
 8294     __ lsl(s1, temp0, 4);
 8295     __ sub(s1, s1, temp0);
 8296     __ add(s1, s1, temp1, ext:: uxth);
 8297 
 8298     __ subs(temp0, s1, base);
 8299     __ csel(s1, temp0, s1, Assembler::HS);
 8300 
 8301     // s2 = s2 % BASE
 8302     __ lsr(temp0, s2, 16);
 8303     __ lsl(temp1, temp0, 4);
 8304     __ sub(temp1, temp1, temp0);
 8305     __ add(temp1, temp1, s2, ext::uxth);
 8306 
 8307     __ lsr(temp0, temp1, 16);
 8308     __ lsl(s2, temp0, 4);
 8309     __ sub(s2, s2, temp0);
 8310     __ add(s2, s2, temp1, ext:: uxth);
 8311 
 8312     __ subs(temp0, s2, base);
 8313     __ csel(s2, temp0, s2, Assembler::HS);
 8314 
 8315     // Combine lower bits and higher bits
 8316     __ bind(L_combine);
 8317     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 8318 
 8319     __ ret(lr);
 8320 
 8321     // record the stub entry and end
 8322     store_archive_data(stub_id, start, __ pc());
 8323 
 8324     return start;
 8325   }
 8326 
 8327   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 8328           Register temp0, Register temp1, FloatRegister vbytes,
 8329           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
          // Emits the instruction sequence that folds the next 16 input bytes
          // into the running Adler-32 sums.
          //
          //   s1, s2       - in/out: running sums; no modulo reduction is emitted
          //                  here, the caller is responsible for that
          //   buff         - in/out: input pointer, post-incremented by 16
          //   temp0, temp1 - clobbered general-purpose scratch registers
          //   vbytes, vs1acc, vs2acc - clobbered vector scratch registers
          //   vtable       - read only; multiplied lane-wise with the input bytes,
          //                  so it presumably holds the weights (16, 15, ... 1)
          //                  loaded by the caller -- TODO confirm table contents
          //
 8330     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 8331     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 8332     // In non-vectorized code, we update s1 and s2 as:
 8333     //   s1 <- s1 + b1
 8334     //   s2 <- s2 + s1
 8335     //   s1 <- s1 + b2
 8336     //   s2 <- s2 + s1
 8337     //   ...
 8338     //   s1 <- s1 + b16
 8339     //   s2 <- s2 + s1
 8340     // Putting above assignments together, we have:
 8341     //   s1_new = s1 + b1 + b2 + ... + b16
 8342     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 8343     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 8344     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 8345     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 8346 
 8347     // s2 = s2 + s1 * 16
          // (must use the old s1, so this is emitted before s1 is updated below)
 8348     __ add(s2, s2, s1, Assembler::LSL, 4);
 8349 
 8350     // vs1acc = b1 + b2 + b3 + ... + b16
 8351     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
          // NOTE(review): presumably the T8B multiply covers the low eight bytes and
          // the T16B multiply-accumulate adds the high eight (widening to 16-bit
          // lanes) -- confirm against the assembler's umullv/umlalv definitions.
 8352     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 8353     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 8354     __ uaddlv(vs1acc, __ T16B, vbytes);
 8355     __ uaddlv(vs2acc, __ T8H, vs2acc);
 8356 
 8357     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
          // The two reduced sums are moved to general-purpose registers first.
 8358     __ fmovd(temp0, vs1acc);
 8359     __ fmovd(temp1, vs2acc);
 8360     __ add(s1, s1, temp0);
 8361     __ add(s2, s2, temp1);
 8362   }
 8363 
 8364   /**
 8365    *  Arguments:
 8366    *
 8367    *  Input:
 8368    *    c_rarg0   - x address
 8369    *    c_rarg1   - x length
 8370    *    c_rarg2   - y address
 8371    *    c_rarg3   - y length
 8372    *    c_rarg4   - z address
         *
         *  Generates the multiplyToLen stub: a frame (enter/leave) around the
         *  MacroAssembler::multiply_to_len sequence, followed by a return.
         *  If the stub is already available from the archive
         *  (load_archive_data returns non-null) nothing is generated and the
         *  archived address is returned instead.
         *
         *  Returns the entry address of the stub.
 8373    */
 8374   address generate_multiplyToLen() {
 8375     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 8376     int entry_count = StubInfo::entry_count(stub_id);
 8377     assert(entry_count == 1, "sanity check");
          // Reuse a previously archived copy of the stub when there is one.
 8378     address start = load_archive_data(stub_id);
 8379     if (start != nullptr) {
 8380       return start;
 8381     }
 8382     __ align(CodeEntryAlignment);
 8383     StubCodeMark mark(this, stub_id);
 8384 
 8385     start = __ pc();
          // Inputs; presumably r0..r4 are c_rarg0..c_rarg4 as listed in the
          // header comment -- TODO confirm the register aliasing.
 8386     const Register x     = r0;
 8387     const Register xlen  = r1;
 8388     const Register y     = r2;
 8389     const Register ylen  = r3;
 8390     const Register z     = r4;
 8391 
          // Scratch registers handed to multiply_to_len.
 8392     const Register tmp0  = r5;
 8393     const Register tmp1  = r10;
 8394     const Register tmp2  = r11;
 8395     const Register tmp3  = r12;
 8396     const Register tmp4  = r13;
 8397     const Register tmp5  = r14;
 8398     const Register tmp6  = r15;
 8399     const Register tmp7  = r16;
 8400 
 8401     BLOCK_COMMENT("Entry:");
 8402     __ enter(); // required for proper stackwalking of RuntimeStub frame
 8403     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 8404     __ leave(); // required for proper stackwalking of RuntimeStub frame
 8405     __ ret(lr);
 8406 
 8407     // record the stub entry and end
 8408     store_archive_data(stub_id, start, __ pc());
 8409 
 8410     return start;
 8411   }
 8412 
 8413   address generate_squareToLen() {
          // Generates the squareToLen stub. Squaring is implemented as a
          // multiplication of x by itself: y/ylen are set to copies of x/xlen and
          // the same multiply_to_len sequence used by multiplyToLen is emitted.
          // Returns the entry address of the stub (the archived one when
          // load_archive_data finds it).
 8414     // squareToLen algorithm for sizes 1..127 described in java code works
 8415     // faster than multiply_to_len on some CPUs and slower on others, but
 8416     // multiply_to_len shows a bit better overall results
 8417     StubId stub_id = StubId::stubgen_squareToLen_id;
 8418     int entry_count = StubInfo::entry_count(stub_id);
 8419     assert(entry_count == 1, "sanity check");
          // Reuse a previously archived copy of the stub when there is one.
 8420     address start = load_archive_data(stub_id);
 8421     if (start != nullptr) {
 8422       return start;
 8423     }
 8424     __ align(CodeEntryAlignment);
 8425     StubCodeMark mark(this, stub_id);
 8426     start = __ pc();
 8427 
          // Inputs are x (r0), xlen (r1) and z (r2); presumably these are the
          // first three C argument registers -- TODO confirm.
 8428     const Register x     = r0;
 8429     const Register xlen  = r1;
 8430     const Register z     = r2;
          // y/ylen are not inputs here; they are filled in from x/xlen below.
 8431     const Register y     = r4; // == x
 8432     const Register ylen  = r5; // == xlen
 8433 
 8434     const Register tmp0  = r3;
 8435     const Register tmp1  = r10;
 8436     const Register tmp2  = r11;
 8437     const Register tmp3  = r12;
 8438     const Register tmp4  = r13;
 8439     const Register tmp5  = r14;
 8440     const Register tmp6  = r15;
 8441     const Register tmp7  = r16;
 8442 
          // r4 and r5 are overwritten with the duplicated operands, so their
          // incoming values are saved on the stack and restored after the multiply.
 8443     RegSet spilled_regs = RegSet::of(y, ylen);
 8444     BLOCK_COMMENT("Entry:");
 8445     __ enter();
 8446     __ push(spilled_regs, sp);
 8447     __ mov(y, x);
 8448     __ mov(ylen, xlen);
 8449     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 8450     __ pop(spilled_regs, sp);
 8451     __ leave();
 8452     __ ret(lr);
 8453 
 8454     // record the stub entry and end
 8455     store_archive_data(stub_id, start, __ pc());
 8456 
 8457     return start;
 8458   }
 8459 
 8460   address generate_mulAdd() {
          // Generates the mulAdd stub: a frame (enter/leave) around the
          // MacroAssembler::mul_add sequence, followed by a return.
          // Returns the entry address of the stub (the archived one when
          // load_archive_data finds it).
 8461     StubId stub_id = StubId::stubgen_mulAdd_id;
 8462     int entry_count = StubInfo::entry_count(stub_id);
 8463     assert(entry_count == 1, "sanity check");
          // Reuse a previously archived copy of the stub when there is one.
 8464     address start = load_archive_data(stub_id);
 8465     if (start != nullptr) {
 8466       return start;
 8467     }
 8468     __ align(CodeEntryAlignment);
 8469     StubCodeMark mark(this, stub_id);
 8470 
 8471     start = __ pc();
 8472 
          // Operands passed to mul_add, in r0..r4; presumably these are the
          // first five C argument registers -- TODO confirm.
 8473     const Register out     = r0;
 8474     const Register in      = r1;
 8475     const Register offset  = r2;
 8476     const Register len     = r3;
 8477     const Register k       = r4;
 8478 
 8479     BLOCK_COMMENT("Entry:");
 8480     __ enter();
 8481     __ mul_add(out, in, offset, len, k);
 8482     __ leave();
 8483     __ ret(lr);
 8484 
 8485     // record the stub entry and end
 8486     store_archive_data(stub_id, start, __ pc());
 8487 
 8488     return start;
 8489   }
 8490 
 8491   // Arguments:
 8492   //
 8493   // Input:
 8494   //   c_rarg0   - newArr address
 8495   //   c_rarg1   - oldArr address
 8496   //   c_rarg2   - newIdx
 8497   //   c_rarg3   - shiftCount
 8498   //   c_rarg4   - numIter
 8499   //
        // Generates the right-shift worker stub. On 32-bit words, for
        // 0 <= i < numIter, the generated code stores
        //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
        //                      | (oldArr[i] << (32 - shiftCount))
        // so it reads oldArr[0 .. numIter]. Words are processed from the highest
        // index downwards: four at a time with SIMD, then two at a time, then a
        // final single word. For numIter < 4 a purely scalar path is used.
        //
        // The stub sets up no frame. It clobbers c_rarg0 (advanced by newIdx
        // words), c_rarg4 (used as the running index), rscratch1, rscratch2,
        // r10-r14 and v0-v4.
        // Assumes 0 < shiftCount < 32 -- TODO confirm against the callers.
        //
        // Returns the entry address of the stub (the archived one when
        // load_archive_data finds it).
 8500   address generate_bigIntegerRightShift() {
 8501     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 8502     int entry_count = StubInfo::entry_count(stub_id);
 8503     assert(entry_count == 1, "sanity check");
 8504     address start = load_archive_data(stub_id);
 8505     if (start != nullptr) {
 8506       return start;
 8507     }
 8508     __ align(CodeEntryAlignment);
 8509     StubCodeMark mark(this, stub_id);
 8510     start = __ pc();
 8511 
 8512     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8513 
 8514     Register newArr        = c_rarg0;
 8515     Register oldArr        = c_rarg1;
 8516     Register newIdx        = c_rarg2;
 8517     Register shiftCount    = c_rarg3;
 8518     Register numIter       = c_rarg4;
          // idx aliases numIter: the count doubles as the index of the next
          // (lower) group of words still to be processed.
 8519     Register idx           = numIter;
 8520 
 8521     Register newArrCur     = rscratch1;
 8522     Register shiftRevCount = rscratch2;
 8523     Register oldArrCur     = r13;
 8524     Register oldArrNext    = r14;
 8525 
 8526     FloatRegister oldElem0        = v0;
 8527     FloatRegister oldElem1        = v1;
 8528     FloatRegister newElem         = v2;
 8529     FloatRegister shiftVCount     = v3;
 8530     FloatRegister shiftVRevCount  = v4;
 8531 
          // Nothing to do for numIter == 0.
 8532     __ cbz(idx, Exit);
 8533 
          // newArr now points at newArr[newIdx] (4-byte elements).
 8534     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8535 
 8536     // left shift count
 8537     __ movw(shiftRevCount, 32);
 8538     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8539 
 8540     // numIter too small to allow a 4-words SIMD loop, rolling back
 8541     __ cmp(numIter, (u1)4);
 8542     __ br(Assembler::LT, ShiftThree);
 8543 
          // Broadcast both counts to all four lanes. shiftVCount is negated
          // because ushl takes a signed per-lane count and shifts right when
          // that count is negative.
 8544     __ dup(shiftVCount,    __ T4S, shiftCount);
 8545     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 8546     __ negr(shiftVCount,   __ T4S, shiftVCount);
 8547 
 8548     __ BIND(ShiftSIMDLoop);
 8549 
 8550     // Calculate the load addresses
          // oldArrNext = &oldArr[idx], oldArrCur = &oldArr[idx + 1],
          // newArrCur = &newArr[idx] (relative to the adjusted newArr).
 8551     __ sub(idx, idx, 4);
 8552     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 8553     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 8554     __ add(oldArrCur,  oldArrNext, 4);
 8555 
 8556     // Load 4 words and process
 8557     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 8558     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 8559     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8560     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8561     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8562     __ st1(newElem,   __ T4S,  Address(newArrCur));
 8563 
          // Keep going while at least four words remain.
 8564     __ cmp(idx, (u1)4);
 8565     __ br(Assembler::LT, ShiftTwoLoop);
 8566     __ b(ShiftSIMDLoop);
 8567 
          // Fewer than four words remain (idx is 0..3): finish with two-word
          // SIMD steps, handing a final odd word to ShiftOne.
 8568     __ BIND(ShiftTwoLoop);
 8569     __ cbz(idx, Exit);
 8570     __ cmp(idx, (u1)1);
 8571     __ br(Assembler::EQ, ShiftOne);
 8572 
 8573     // Calculate the load addresses
 8574     __ sub(idx, idx, 2);
 8575     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 8576     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 8577     __ add(oldArrCur,  oldArrNext, 4);
 8578 
 8579     // Load 2 words and process
 8580     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 8581     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 8582     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 8583     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 8584     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 8585     __ st1(newElem,   __ T2S, Address(newArrCur));
 8586     __ b(ShiftTwoLoop);
 8587 
          // Scalar path, reached only with idx in 1..3. Bit 1 clear means
          // idx == 1, bit 0 clear means idx == 2; otherwise idx == 3 and the
          // code falls through all three single-word steps (words 2, 1, 0).
 8588     __ BIND(ShiftThree);
 8589     __ tbz(idx, 1, ShiftOne);
 8590     __ tbz(idx, 0, ShiftTwo);
 8591     __ ldrw(r10,  Address(oldArr, 12));
 8592     __ ldrw(r11,  Address(oldArr, 8));
 8593     __ lsrvw(r10, r10, shiftCount);
 8594     __ lslvw(r11, r11, shiftRevCount);
 8595     __ orrw(r12,  r10, r11);
 8596     __ strw(r12,  Address(newArr, 8));
 8597 
          // Word 1: combines oldArr[2] and oldArr[1].
 8598     __ BIND(ShiftTwo);
 8599     __ ldrw(r10,  Address(oldArr, 8));
 8600     __ ldrw(r11,  Address(oldArr, 4));
 8601     __ lsrvw(r10, r10, shiftCount);
 8602     __ lslvw(r11, r11, shiftRevCount);
 8603     __ orrw(r12,  r10, r11);
 8604     __ strw(r12,  Address(newArr, 4));
 8605 
          // Word 0: combines oldArr[1] and oldArr[0]. Also the tail of the SIMD
          // paths when a single word is left.
 8606     __ BIND(ShiftOne);
 8607     __ ldrw(r10,  Address(oldArr, 4));
 8608     __ ldrw(r11,  Address(oldArr));
 8609     __ lsrvw(r10, r10, shiftCount);
 8610     __ lslvw(r11, r11, shiftRevCount);
 8611     __ orrw(r12,  r10, r11);
 8612     __ strw(r12,  Address(newArr));
 8613 
 8614     __ BIND(Exit);
 8615     __ ret(lr);
 8616 
 8617     // record the stub entry and end
 8618     store_archive_data(stub_id, start, __ pc());
 8619 
 8620     return start;
 8621   }
 8622 
 8623   // Arguments:
 8624   //
 8625   // Input:
 8626   //   c_rarg0   - newArr address
 8627   //   c_rarg1   - oldArr address
 8628   //   c_rarg2   - newIdx
 8629   //   c_rarg3   - shiftCount
 8630   //   c_rarg4   - numIter
 8631   //
        // Generates the left-shift worker stub. On 32-bit words, for
        // 0 <= i < numIter, the generated code stores
        //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
        //                      | (oldArr[i + 1] >>> (32 - shiftCount))
        // so it reads oldArr[0 .. numIter]. Words are processed in ascending
        // order with post-incremented pointers: four at a time with SIMD, then
        // two at a time, then a final single word. For numIter < 4 a purely
        // scalar path is used.
        //
        // The stub sets up no frame. It clobbers c_rarg0, c_rarg1 and c_rarg4
        // (advanced / counted down), rscratch1, rscratch2, r10-r12 and v0-v4.
        // Assumes 0 < shiftCount < 32 -- TODO confirm against the callers.
        //
        // Returns the entry address of the stub (the archived one when
        // load_archive_data finds it).
 8632   address generate_bigIntegerLeftShift() {
 8633     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 8634     int entry_count = StubInfo::entry_count(stub_id);
 8635     assert(entry_count == 1, "sanity check");
 8636     address start = load_archive_data(stub_id);
 8637     if (start != nullptr) {
 8638       return start;
 8639     }
 8640     __ align(CodeEntryAlignment);
 8641     StubCodeMark mark(this, stub_id);
 8642     start = __ pc();
 8643 
 8644     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8645 
 8646     Register newArr        = c_rarg0;
 8647     Register oldArr        = c_rarg1;
 8648     Register newIdx        = c_rarg2;
 8649     Register shiftCount    = c_rarg3;
 8650     Register numIter       = c_rarg4;
 8651 
 8652     Register shiftRevCount = rscratch1;
 8653     Register oldArrNext    = rscratch2;
 8654 
 8655     FloatRegister oldElem0        = v0;
 8656     FloatRegister oldElem1        = v1;
 8657     FloatRegister newElem         = v2;
 8658     FloatRegister shiftVCount     = v3;
 8659     FloatRegister shiftVRevCount  = v4;
 8660 
          // Nothing to do for numIter == 0.
 8661     __ cbz(numIter, Exit);
 8662 
          // oldArrNext runs one word ahead of oldArr; newArr now points at
          // newArr[newIdx] (4-byte elements).
 8663     __ add(oldArrNext, oldArr, 4);
 8664     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8665 
 8666     // right shift count
 8667     __ movw(shiftRevCount, 32);
 8668     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8669 
 8670     // numIter too small to allow a 4-words SIMD loop, rolling back
 8671     __ cmp(numIter, (u1)4);
 8672     __ br(Assembler::LT, ShiftThree);
 8673 
          // Broadcast both counts to all four lanes. shiftVRevCount is negated
          // because ushl takes a signed per-lane count and shifts right when
          // that count is negative.
 8674     __ dup(shiftVCount,     __ T4S, shiftCount);
 8675     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8676     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8677 
 8678     __ BIND(ShiftSIMDLoop);
 8679 
 8680     // load 4 words and process
 8681     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8682     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8683     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8684     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8685     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8686     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8687     __ sub(numIter,   numIter, 4);
 8688 
          // Keep going while at least four words remain.
 8689     __ cmp(numIter, (u1)4);
 8690     __ br(Assembler::LT, ShiftTwoLoop);
 8691     __ b(ShiftSIMDLoop);
 8692 
          // Fewer than four words remain (numIter is 0..3): finish with two-word
          // SIMD steps, handing a final odd word to ShiftOne.
 8693     __ BIND(ShiftTwoLoop);
 8694     __ cbz(numIter, Exit);
 8695     __ cmp(numIter, (u1)1);
 8696     __ br(Assembler::EQ, ShiftOne);
 8697 
 8698     // load 2 words and process
 8699     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8700     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8701     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8702     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8703     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8704     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8705     __ sub(numIter,   numIter, 2);
 8706     __ b(ShiftTwoLoop);
 8707 
          // Scalar path, reached only with numIter in 1..3. The first word is
          // always processed here; numIter itself is not decremented, its bits
          // select how many more words follow: bit 1 clear means numIter == 1
          // (done), bit 0 clear means numIter == 2 (one more word via ShiftOne),
          // otherwise numIter == 3 (fall through ShiftTwo and ShiftOne).
 8708     __ BIND(ShiftThree);
 8709     __ ldrw(r10,  __ post(oldArr, 4));
 8710     __ ldrw(r11,  __ post(oldArrNext, 4));
 8711     __ lslvw(r10, r10, shiftCount);
 8712     __ lsrvw(r11, r11, shiftRevCount);
 8713     __ orrw(r12,  r10, r11);
 8714     __ strw(r12,  __ post(newArr, 4));
 8715     __ tbz(numIter, 1, Exit);
 8716     __ tbz(numIter, 0, ShiftOne);
 8717 
 8718     __ BIND(ShiftTwo);
 8719     __ ldrw(r10,  __ post(oldArr, 4));
 8720     __ ldrw(r11,  __ post(oldArrNext, 4));
 8721     __ lslvw(r10, r10, shiftCount);
 8722     __ lsrvw(r11, r11, shiftRevCount);
 8723     __ orrw(r12,  r10, r11);
 8724     __ strw(r12,  __ post(newArr, 4));
 8725 
          // Last word; the pointers are not advanced any further. Also the tail
          // of the SIMD paths when a single word is left.
 8726     __ BIND(ShiftOne);
 8727     __ ldrw(r10,  Address(oldArr));
 8728     __ ldrw(r11,  Address(oldArrNext));
 8729     __ lslvw(r10, r10, shiftCount);
 8730     __ lsrvw(r11, r11, shiftRevCount);
 8731     __ orrw(r12,  r10, r11);
 8732     __ strw(r12,  Address(newArr));
 8733 
 8734     __ BIND(Exit);
 8735     __ ret(lr);
 8736 
 8737     // record the stub entry and end
 8738     store_archive_data(stub_id, start, __ pc());
 8739 
 8740     return start;
 8741   }
 8742 
 8743   address generate_count_positives(address &count_positives_long) {
          // Generates the count_positives stub and returns its primary entry
          // point. The address of a second entry point, which goes straight to
          // the len > 15 code, is handed back through count_positives_long.
          //
          // Registers used by the generated code (see the declarations below):
          //   r1 (ary1)   - address of the bytes to scan
          //   r2 (len)    - number of bytes
          //   r0 (result) - must already hold a copy of len on entry
          //
          // A byte counts as "negative" when its top bit is set
          // (UPPER_BIT_MASK selects that bit in each byte of a 64-bit word).
          //   len <= 15: result is left untouched when no byte is negative and
          //              set to 0 otherwise.
          //   len  > 15: result is left untouched when no byte is negative;
          //              otherwise the number of bytes not yet proven positive
          //              is subtracted from it, giving a lower bound on the
          //              count of leading positive bytes.
 8744     StubId stub_id = StubId::stubgen_count_positives_id;
 8745     GrowableArray<address> entries;
 8746     int entry_count = StubInfo::entry_count(stub_id);
 8747     // We have an extra entry for count_positives_long.
 8748     assert(entry_count == 2, "sanity check");
          // When the stub comes from the archive, the single extra entry that
          // was recorded with it is the count_positives_long address.
 8749     address start = load_archive_data(stub_id, &entries);
 8750     if (start != nullptr) {
 8751       assert(entries.length() == 1,
 8752              "unexpected extra entry count %d", entries.length());
 8753       count_positives_long = entries.at(0);
 8754       return start;
 8755     }
 8756     const u1 large_loop_size = 64;
 8757     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 8758     int dcache_line = VM_Version::dcache_line_size();
 8759 
 8760     Register ary1 = r1, len = r2, result = r0;
 8761 
 8762     __ align(CodeEntryAlignment);
 8763     StubCodeMark mark(this, stub_id);
 8764 
 8765     address entry = __ pc();
 8766 
 8767     __ enter();
 8768     // precondition: a copy of len is already in result
 8769     // __ mov(result, len);
 8770 
 8771   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8772         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8773 
        // Short path (len <= 15): the data is checked with one or two 8-byte
        // loads that END at the last byte of the array, so nothing beyond the
        // array is read.
 8774   __ cmp(len, (u1)15);
 8775   __ br(Assembler::GT, LEN_OVER_15);
 8776   // The only case when execution falls into this code is when pointer is near
 8777   // the end of memory page and we have to avoid reading next page
 8778   __ add(ary1, ary1, len);
 8779   __ subs(len, len, 8);
 8780   __ br(Assembler::GT, LEN_OVER_8);
        // len <= 8: load the 8 bytes ending at the array end. The load also
        // covers (8 - len) bytes in front of the array; rscratch1 is that
        // amount in bits and the right shift discards them (assumes the bytes
        // at lower addresses are the low-order ones, i.e. little-endian).
 8781   __ ldr(rscratch2, Address(ary1, -8));
 8782   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8783   __ lsrv(rscratch2, rscratch2, rscratch1);
 8784   __ tst(rscratch2, UPPER_BIT_MASK);
 8785   __ csel(result, zr, result, Assembler::NE);
 8786   __ leave();
 8787   __ ret(lr);
        // 8 < len <= 15: load the 16 bytes ending at the array end. rscratch2
        // (the upper 8) lies entirely inside the array; rscratch1 (the lower 8)
        // includes (16 - len) bytes in front of it, which are shifted out.
 8788   __ bind(LEN_OVER_8);
 8789   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 8790   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
 8791   __ tst(rscratch2, UPPER_BIT_MASK);
 8792   __ br(Assembler::NE, RET_NO_POP);
 8793   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8794   __ lsrv(rscratch1, rscratch1, rscratch2);
 8795   __ tst(rscratch1, UPPER_BIT_MASK);
        // Reached with the flags of either tst above; no registers were pushed
        // on the short path, hence no pop before returning.
 8796   __ bind(RET_NO_POP);
 8797   __ csel(result, zr, result, Assembler::NE);
 8798   __ leave();
 8799   __ ret(lr);
 8800 
 8801   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8802   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8803 
        // Second entry point. It is recorded in `entries` so that it is stored
        // alongside the stub by store_archive_data below.
 8804   count_positives_long = __ pc(); // 2nd entry point
 8805   entries.append(count_positives_long);
 8806 
        // This enter() belongs to the second entry only: the branch from the
        // primary entry lands on LEN_OVER_15, after it, with its own frame
        // already set up.
 8807   __ enter();
 8808 
 8809   __ bind(LEN_OVER_15);
 8810     __ push(spilled_regs, sp);
 8811     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8812     __ cbz(rscratch2, ALIGNED);
          // Unaligned start: check the first 16 bytes with an unaligned load,
          // then step ary1 forward to the next 16-byte boundary. On a hit, len
          // has not been reduced yet, so RET_ADJUST subtracts the full len.
 8813     __ ldp(tmp6, tmp1, Address(ary1));
 8814     __ mov(tmp5, 16);
 8815     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 8816     __ add(ary1, ary1, rscratch1);
 8817     __ orr(tmp6, tmp6, tmp1);
 8818     __ tst(tmp6, UPPER_BIT_MASK);
 8819     __ br(Assembler::NE, RET_ADJUST);
 8820     __ sub(len, len, rscratch1);
 8821 
 8822   __ bind(ALIGNED);
 8823     __ cmp(len, large_loop_size);
 8824     __ br(Assembler::LT, CHECK_16);
 8825     // Perform 16-byte load as early return in pre-loop to handle situation
 8826     // when initially aligned large array has negative values at starting bytes,
 8827     // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
 8828     // slower. Cases with negative bytes further ahead won't be affected that
 8829     // much. In fact, it'll be faster due to early loads, less instructions and
 8830     // less branches in LARGE_LOOP.
 8831     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8832     __ sub(len, len, 16);
 8833     __ orr(tmp6, tmp6, tmp1);
 8834     __ tst(tmp6, UPPER_BIT_MASK);
 8835     __ br(Assembler::NE, RET_ADJUST_16);
 8836     __ cmp(len, large_loop_size);
 8837     __ br(Assembler::LT, CHECK_16);
 8838 
 8839     if (SoftwarePrefetchHintDistance >= 0
 8840         && SoftwarePrefetchHintDistance >= dcache_line) {
 8841       // initial prefetch
 8842       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8843     }
        // Main loop: 64 bytes per iteration, all eight words OR-ed together and
        // tested once.
 8844   __ bind(LARGE_LOOP);
 8845     if (SoftwarePrefetchHintDistance >= 0) {
 8846       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8847     }
 8848     // Issue load instructions first, since it can save few CPU/MEM cycles, also
 8849     // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
 8850     // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
 8851     // instructions per cycle and have less branches, but this approach disables
 8852     // early return, thus, all 64 bytes are loaded and checked every time.
 8853     __ ldp(tmp2, tmp3, Address(ary1));
 8854     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8855     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8856     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8857     __ add(ary1, ary1, large_loop_size);
 8858     __ sub(len, len, large_loop_size);
 8859     __ orr(tmp2, tmp2, tmp3);
 8860     __ orr(tmp4, tmp4, tmp5);
 8861     __ orr(rscratch1, rscratch1, rscratch2);
 8862     __ orr(tmp6, tmp6, tmp1);
 8863     __ orr(tmp2, tmp2, tmp4);
 8864     __ orr(rscratch1, rscratch1, tmp6);
 8865     __ orr(tmp2, tmp2, rscratch1);
 8866     __ tst(tmp2, UPPER_BIT_MASK);
 8867     __ br(Assembler::NE, RET_ADJUST_LONG);
 8868     __ cmp(len, large_loop_size);
 8869     __ br(Assembler::GE, LARGE_LOOP);
 8870 
 8871   __ bind(CHECK_16); // small 16-byte load pre-loop
 8872     __ cmp(len, (u1)16);
 8873     __ br(Assembler::LT, POST_LOOP16);
 8874 
 8875   __ bind(LOOP16); // small 16-byte load loop
 8876     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8877     __ sub(len, len, 16);
 8878     __ orr(tmp2, tmp2, tmp3);
 8879     __ tst(tmp2, UPPER_BIT_MASK);
 8880     __ br(Assembler::NE, RET_ADJUST_16);
 8881     __ cmp(len, (u1)16);
 8882     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8883 
        // Fewer than 16 bytes remain. If more than 8, check one full word
        // first; on a hit len still includes that word when RET_ADJUST runs.
 8884   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8885     __ cmp(len, (u1)8);
 8886     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8887     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8888     __ tst(tmp3, UPPER_BIT_MASK);
 8889     __ br(Assembler::NE, RET_ADJUST);
 8890     __ sub(len, len, 8);
 8891 
        // 0..8 bytes remain: load a full word and shift left by (64 - 8 * len)
        // bits so that only the first len bytes of it are tested.
 8892   __ bind(POST_LOOP16_LOAD_TAIL);
 8893     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8894     __ ldr(tmp1, Address(ary1));
 8895     __ mov(tmp2, 64);
 8896     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8897     __ lslv(tmp1, tmp1, tmp4);
 8898     __ tst(tmp1, UPPER_BIT_MASK);
 8899     __ br(Assembler::NE, RET_ADJUST);
 8900     // Fallthrough
 8901 
        // No negative byte found: result is returned unchanged.
 8902   __ bind(RET_LEN);
 8903     __ pop(spilled_regs, sp);
 8904     __ leave();
 8905     __ ret(lr);
 8906 
 8907     // difference result - len is the count of guaranteed to be
 8908     // positive bytes
 8909 
        // The chunk containing the negative byte was already subtracted from
        // len in the loops; add it back (64 = 48 + 16 for the large loop, 16
        // for the 16-byte loops) so the whole chunk is treated as unverified.
 8910   __ bind(RET_ADJUST_LONG);
 8911     __ add(len, len, (u1)(large_loop_size - 16));
 8912   __ bind(RET_ADJUST_16);
 8913     __ add(len, len, 16);
 8914   __ bind(RET_ADJUST);
 8915     __ pop(spilled_regs, sp);
 8916     __ leave();
 8917     __ sub(result, result, len);
 8918     __ ret(lr);
 8919 
 8920     // record the stub entry and end plus the extra entry
 8921     store_archive_data(stub_id, entry, __ pc(), &entries);
 8922 
 8923     return entry;
 8924   }
 8925 
 8926   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8927         bool usePrefetch, Label &NOT_EQUAL) {
          // Emits a software-pipelined comparison loop over the memory at a1 (r1)
          // and a2 (r2) using general-purpose register pairs only.
          //
          //   loopThreshold - the loop repeats while cnt1 >= loopThreshold
          //                   (signed comparison) after each iteration
          //   usePrefetch   - when true, a prfm for each array is emitted at the
          //                   top of the loop body
          //   NOT_EQUAL     - branch target taken as soon as a difference is seen
          //
          // Each iteration loads 64 bytes from each array and subtracts 64 from
          // cnt1 (r10). 16 bytes are pre-loaded before the loop and the last
          // 16 bytes loaded are compared after it, where cnt1 is reduced by a
          // further 16. Both pointers are post-incremented past everything
          // loaded. Clobbers rscratch1, rscratch2, r3-r5 and r11-r13; `result`
          // is declared but not used here.
 8928     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8929         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8930         tmp7 = r12, tmp8 = r13;
 8931     Label LOOP;
 8932 
          // Prime the pipeline: first 16 bytes of each array.
 8933     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8934     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8935     __ bind(LOOP);
 8936     if (usePrefetch) {
 8937       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8938       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8939     }
          // From here on the loads of the next 16 bytes (into tmp5..tmp8 or
          // tmp1..tmp4, alternating) are interleaved with the xor/or/branch
          // check of the previously loaded 16 bytes.
 8940     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8941     __ eor(tmp1, tmp1, tmp2);
 8942     __ eor(tmp3, tmp3, tmp4);
 8943     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8944     __ orr(tmp1, tmp1, tmp3);
 8945     __ cbnz(tmp1, NOT_EQUAL);
 8946     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8947     __ eor(tmp5, tmp5, tmp6);
 8948     __ eor(tmp7, tmp7, tmp8);
 8949     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8950     __ orr(tmp5, tmp5, tmp7);
 8951     __ cbnz(tmp5, NOT_EQUAL);
 8952     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8953     __ eor(tmp1, tmp1, tmp2);
 8954     __ eor(tmp3, tmp3, tmp4);
 8955     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8956     __ orr(tmp1, tmp1, tmp3);
 8957     __ cbnz(tmp1, NOT_EQUAL);
 8958     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8959     __ eor(tmp5, tmp5, tmp6);
 8960     __ sub(cnt1, cnt1, 8 * wordSize);
 8961     __ eor(tmp7, tmp7, tmp8);
 8962     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8963     // tmp6 is not used. MacroAssembler::subs is used here (rather than
 8964     // cmp) because subs allows an unlimited range of immediate operand.
 8965     __ subs(tmp6, cnt1, loopThreshold);
          // The flags set by subs above are consumed by the br below; the orr
          // and cbnz in between do not modify them.
 8966     __ orr(tmp5, tmp5, tmp7);
 8967     __ cbnz(tmp5, NOT_EQUAL);
 8968     __ br(__ GE, LOOP);
 8969     // post-loop
          // Compare the 16 bytes loaded last inside the loop and account for
          // them in cnt1.
 8970     __ eor(tmp1, tmp1, tmp2);
 8971     __ eor(tmp3, tmp3, tmp4);
 8972     __ orr(tmp1, tmp1, tmp3);
 8973     __ sub(cnt1, cnt1, 2 * wordSize);
 8974     __ cbnz(tmp1, NOT_EQUAL);
 8975   }
 8976 
 8977   void generate_large_array_equals_loop_simd(int loopThreshold,
 8978         bool usePrefetch, Label &NOT_EQUAL) {
          // Emits a comparison loop over the memory at a1 (r1) and a2 (r2) using
          // vector registers.
          //
          //   loopThreshold - the loop repeats while cnt1 >= loopThreshold
          //                   (signed comparison) after each iteration
          //   usePrefetch   - when true, a prfm for each array is emitted at the
          //                   top of the loop body
          //   NOT_EQUAL     - branch target taken as soon as a difference is seen
          //
          // Each iteration loads 64 bytes from each array (post-incrementing both
          // pointers) and subtracts 64 from cnt1 (r10). Clobbers v0-v7,
          // rscratch1 and rscratch2; `result` is declared but not used here.
 8979     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8980         tmp2 = rscratch2;
 8981     Label LOOP;
 8982 
 8983     __ bind(LOOP);
 8984     if (usePrefetch) {
 8985       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8986       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8987     }
 8988     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8989     __ sub(cnt1, cnt1, 8 * wordSize);
 8990     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
          // Only the flags of this subs are needed (for the br at the end);
          // tmp1 is overwritten by the umov below.
 8991     __ subs(tmp1, cnt1, loopThreshold);
          // XOR corresponding vectors, then OR the four results down to v0:
          // v0 is all-zero only if the two 64-byte chunks are identical.
 8992     __ eor(v0, __ T16B, v0, v4);
 8993     __ eor(v1, __ T16B, v1, v5);
 8994     __ eor(v2, __ T16B, v2, v6);
 8995     __ eor(v3, __ T16B, v3, v7);
 8996     __ orr(v0, __ T16B, v0, v1);
 8997     __ orr(v1, __ T16B, v2, v3);
 8998     __ orr(v0, __ T16B, v0, v1);
          // Move both 64-bit halves of v0 to general-purpose registers so the
          // result can be tested with cbnz.
 8999     __ umov(tmp1, v0, __ D, 0);
 9000     __ umov(tmp2, v0, __ D, 1);
 9001     __ orr(tmp1, tmp1, tmp2);
 9002     __ cbnz(tmp1, NOT_EQUAL);
 9003     __ br(__ GE, LOOP);
 9004   }
 9005 
 9006   // a1 = r1 - array1 address
 9007   // a2 = r2 - array2 address
 9008   // result = r0 - return value. Already contains "false"
 9009   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 9010   // r3-r5 are reserved temporary registers
 9011   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
 9012   address generate_large_array_equals() {
          // Emits the stub registered as StubId::stubgen_large_array_equals_id and
          // returns its entry address. The emitted code walks a1 and a2 in
          // parallel, branches to a "not equal" exit on the first difference
          // (leaving result untouched) and sets result to true only when the
          // whole remaining range matched.
 9013     StubId stub_id = StubId::stubgen_large_array_equals_id;
 9014     int entry_count = StubInfo::entry_count(stub_id);
 9015     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, return it
          // and emit nothing.
 9016     address start = load_archive_data(stub_id);
 9017     if (start != nullptr) {
 9018       return start;
 9019     }
 9020     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 9021         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 9022         tmp7 = r12, tmp8 = r13;
 9023     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 9024         SMALL_LOOP, POST_LOOP;
 9025     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 9026     // calculate if at least 32 prefetched bytes are used
 9027     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 9028     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
          // tmp6..tmp8 (r11-r13) are pushed/popped only on the non-SIMD path below.
 9029     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 9030     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 9031         tmp5, tmp6, tmp7, tmp8);
 9032
 9033     __ align(CodeEntryAlignment);
 9034
 9035     StubCodeMark mark(this, stub_id);
 9036
 9037     address entry = __ pc();
 9038     __ enter();
 9039     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 9040     // also advance pointers to use post-increment instead of pre-increment
 9041     __ add(a1, a1, wordSize);
 9042     __ add(a2, a2, wordSize);
 9043     if (AvoidUnalignedAccesses) {
 9044       // Both implementations (SIMD/non-SIMD) use relatively large load
 9045       // instructions (ld1/ldp), which have a huge penalty (up to x2 exec time)
 9046       // on some CPUs when the address is not at least 16-byte aligned.
 9047       // Arrays are currently 8-byte aligned, so if needed we do one additional
 9048       // 8-byte load, at least for the 1st address, to make it 16-byte aligned.
 9049       Label ALIGNED16;
 9050       __ tbz(a1, 3, ALIGNED16);  // bit 3 clear: a1 is already 16-byte aligned
 9051       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 9052       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 9053       __ sub(cnt1, cnt1, wordSize);
 9054       __ eor(tmp1, tmp1, tmp2);
            // Nothing has been pushed yet, so a mismatch here must skip the pop.
 9055       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 9056       __ bind(ALIGNED16);
 9057     }
 9058     if (UseSIMDForArrayEquals) {
 9059       if (SoftwarePrefetchHintDistance >= 0) {
              // Use the prefetching loop only when cnt1 exceeds its threshold.
 9060         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 9061         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 9062         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 9063             /* prfm = */ true, NOT_EQUAL);
              // Too little left for the non-prefetching loop: go straight to TAIL.
 9064         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 9065         __ br(__ LT, TAIL);
 9066       }
 9067       __ bind(NO_PREFETCH_LARGE_LOOP);
 9068       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 9069           /* prfm = */ false, NOT_EQUAL);
 9070     } else {
            // Same structure as the SIMD branch, but r11-r13 are saved first; every
            // exit taken after this point goes through the pop at NOT_EQUAL.
 9071       __ push(spilled_regs, sp);
 9072       if (SoftwarePrefetchHintDistance >= 0) {
 9073         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 9074         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 9075         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 9076             /* prfm = */ true, NOT_EQUAL);
 9077         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 9078         __ br(__ LT, TAIL);
 9079       }
 9080       __ bind(NO_PREFETCH_LARGE_LOOP);
 9081       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 9082           /* prfm = */ false, NOT_EQUAL);
 9083     }
          // TAIL: compare what the large loops left over, 8 bytes per step.
 9084     __ bind(TAIL);
 9085       __ cbz(cnt1, EQUAL);  // nothing left to compare
 9086       __ subs(cnt1, cnt1, wordSize);
 9087       __ br(__ LE, POST_LOOP);  // at most one word left
 9088     __ bind(SMALL_LOOP);
 9089       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 9090       __ ldr(tmp2, Address(__ post(a2, wordSize)));
            // Flags for the loop branch are set here; eor/cbnz below do not
            // modify them.
 9091       __ subs(cnt1, cnt1, wordSize);
 9092       __ eor(tmp1, tmp1, tmp2);
 9093       __ cbnz(tmp1, NOT_EQUAL);
 9094       __ br(__ GT, SMALL_LOOP);
          // POST_LOOP: cnt1 is <= 0 on every path that reaches here, so the final
          // word is loaded from an address at or before the current a1/a2 and may
          // overlap bytes that were already compared.
 9095     __ bind(POST_LOOP);
 9096       __ ldr(tmp1, Address(a1, cnt1));
 9097       __ ldr(tmp2, Address(a2, cnt1));
 9098       __ eor(tmp1, tmp1, tmp2);
 9099       __ cbnz(tmp1, NOT_EQUAL);
 9100     __ bind(EQUAL);
 9101       __ mov(result, true);
          // NOT_EQUAL: result is not written on this path; EQUAL falls through
          // into the same epilogue.
 9102     __ bind(NOT_EQUAL);
 9103       if (!UseSIMDForArrayEquals) {
 9104         __ pop(spilled_regs, sp);
 9105       }
 9106     __ bind(NOT_EQUAL_NO_POP);
 9107     __ leave();
 9108     __ ret(lr);
 9109
 9110     // record the stub entry and end
 9111     store_archive_data(stub_id, entry, __ pc());
 9112
 9113     return entry;
 9114   }
 9115 
 9116   // result = r0 - return value. Contains initial hashcode value on entry.
 9117   // ary = r1 - array address
 9118   // cnt = r2 - elements count
 9119   // Clobbers: v0-v13, rscratch1, rscratch2
 9120   address generate_large_arrays_hashcode(BasicType eltype) {
          // Emits the array hash code stub for one element type and returns its
          // entry address. eltype must be T_BOOLEAN, T_BYTE, T_CHAR, T_SHORT or
          // T_INT; each has its own stub id. The emitted code folds elements into
          // a 31-based polynomial hash: a vectorized "small" loop (vf elements
          // per iteration), a 4x unrolled "large" loop (evf elements per
          // iteration) and a scalar tail for the last cnt % vf elements.
 9121     StubId stub_id;
 9122     switch (eltype) {
 9123     case T_BOOLEAN:
 9124       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 9125       break;
 9126     case T_BYTE:
 9127       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 9128       break;
 9129     case T_CHAR:
 9130       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 9131       break;
 9132     case T_SHORT:
 9133       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 9134       break;
 9135     case T_INT:
 9136       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 9137       break;
 9138     default:
 9139       stub_id = StubId::NO_STUBID;
 9140       ShouldNotReachHere();
 9141     };
 9142     int entry_count = StubInfo::entry_count(stub_id);
 9143     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, return it
          // and emit nothing.
 9144     address start = load_archive_data(stub_id);
 9145     if (start != nullptr) {
 9146       return start;
 9147     }
 9148     const Register result = r0, ary = r1, cnt = r2;
          // vdataN receive the loaded elements, vmulN are the per-vector accumulators.
 9149     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 9150     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 9151     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 9152     const FloatRegister vpowm = v13;  // per-iteration multiplier(s), set per loop below
 9153
 9154     ARRAYS_HASHCODE_REGISTERS;
 9155
 9156     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 9157
 9158     unsigned int vf; // vectorization factor
          // true when each loaded vector is accumulated as two halves of vf / 2 elements
 9159     bool multiply_by_halves;
 9160     Assembler::SIMD_Arrangement load_arrangement;
 9161     switch (eltype) {
 9162     case T_BOOLEAN:
 9163     case T_BYTE:
 9164       load_arrangement = Assembler::T8B;
 9165       multiply_by_halves = true;
 9166       vf = 8;
 9167       break;
 9168     case T_CHAR:
 9169     case T_SHORT:
 9170       load_arrangement = Assembler::T8H;
 9171       multiply_by_halves = true;
 9172       vf = 8;
 9173       break;
 9174     case T_INT:
 9175       load_arrangement = Assembler::T4S;
 9176       multiply_by_halves = false;
 9177       vf = 4;
 9178       break;
 9179     default:
 9180       ShouldNotReachHere();
 9181     }
 9182
 9183     // Unroll factor
 9184     const unsigned uf = 4;
 9185
 9186     // Effective vectorization factor
 9187     const unsigned evf = vf * uf;
 9188
 9189     __ align(CodeEntryAlignment);
 9190
 9191     StubCodeMark mark(this, stub_id);
 9192
 9193     address entry = __ pc();
 9194     __ enter();
 9195
 9196     // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in
 9197     // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
 9198     // value shouldn't change throughout both loops.
 9199     __ movw(rscratch1, intpow(31U, 3));
 9200     __ mov(vpow, Assembler::S, 0, rscratch1);
 9201     __ movw(rscratch1, intpow(31U, 2));
 9202     __ mov(vpow, Assembler::S, 1, rscratch1);
 9203     __ movw(rscratch1, intpow(31U, 1));
 9204     __ mov(vpow, Assembler::S, 2, rscratch1);
 9205     __ movw(rscratch1, intpow(31U, 0));
 9206     __ mov(vpow, Assembler::S, 3, rscratch1);
 9207
          // Seed accumulator 0: all lanes zero except lane 3, which gets the
          // incoming hash value (lane 3 is the one paired with 31^0 in vpow).
 9208     __ mov(vmul0, Assembler::T16B, 0);
 9209     __ mov(vmul0, Assembler::S, 3, result);
 9210
          // rscratch2 = cnt & ((uf - 1) * vf): the element count the small loop
          // consumes in steps of vf. If it is zero, go straight to the large loop.
 9211     __ andr(rscratch2, cnt, (uf - 1) * vf);
 9212     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 9213
          // Small-loop multiplier: 31^(vf / 2) when accumulating by halves, else 31^vf.
 9214     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 9215     __ mov(vpowm, Assembler::S, 0, rscratch1);
 9216
 9217     // SMALL LOOP
 9218     __ bind(SMALL_LOOP);
 9219
 9220     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 9221     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
          // Sets the flags consumed by the br(HI, SMALL_LOOP) at the loop end.
 9222     __ subsw(rscratch2, rscratch2, vf);
 9223
 9224     if (load_arrangement == Assembler::T8B) {
 9225       // Extend 8B to 8H to be able to use vector multiply
 9226       // instructions
 9227       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 9228       if (is_signed_subword_type(eltype)) {
 9229         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9230       } else {
 9231         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9232       }
 9233     }
 9234
          // Add the new elements (the lower half for subword types) to the accumulator.
 9235     switch (load_arrangement) {
 9236     case Assembler::T4S:
 9237       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 9238       break;
 9239     case Assembler::T8B:
 9240     case Assembler::T8H:
 9241       assert(is_subword_type(eltype), "subword type expected");
 9242       if (is_signed_subword_type(eltype)) {
 9243         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9244       } else {
 9245         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9246       }
 9247       break;
 9248     default:
 9249       __ should_not_reach_here();
 9250     }
 9251
 9252     // Process the upper half of a vector
 9253     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 9254       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 9255       if (is_signed_subword_type(eltype)) {
 9256         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9257       } else {
 9258         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9259       }
 9260     }
 9261
 9262     __ br(Assembler::HI, SMALL_LOOP);
 9263
 9264     // SMALL LOOP'S EPILOGUE
          // If cnt holds at least one full block of evf elements, continue with the
          // large loop; vmul0 is carried over as is.
 9265     __ lsr(rscratch2, cnt, exact_log2(evf));
 9266     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 9267
          // Otherwise reduce vmul0 to a scalar: weight the lanes with vpow, sum
          // across lanes and move lane 0 into result.
 9268     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 9269     __ addv(vmul0, Assembler::T4S, vmul0);
 9270     __ umov(result, vmul0, Assembler::S, 0);
 9271
 9272     // TAIL
 9273     __ bind(TAIL);
 9274
 9275     // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
 9276     // of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
 9277     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 9278     __ andr(rscratch2, cnt, vf - 1);
          // TAIL_SHORTCUT: entered from the large loop's epilogue with
          // rscratch2 == cnt % vf already computed.
 9279     __ bind(TAIL_SHORTCUT);
 9280     __ adr(rscratch1, BR_BASE);
 9281     // For Cortex-A53 offset is 4 because 2 nops are generated.
 9282     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 9283     __ movw(rscratch2, 0x1f);  // 0x1f == 31, the multiplier for the maddw chain below
 9284     __ br(rscratch1);
 9285
          // vf - 1 unrolled scalar steps, each computing result = result * 31 + element.
          // The computed branch above enters this sequence part-way through, so
          // every step must keep the same instruction count.
 9286     for (size_t i = 0; i < vf - 1; ++i) {
 9287       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 9288                                    eltype);
 9289       __ maddw(result, result, rscratch2, rscratch1);
 9290       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 9291       // Generate 2nd nop to have 4 instructions per iteration.
 9292       if (VM_Version::supports_a53mac()) {
 9293         __ nop();
 9294       }
 9295     }
 9296     __ bind(BR_BASE);
 9297
 9298     __ leave();
 9299     __ ret(lr);
 9300
 9301     // LARGE LOOP
 9302     __ bind(LARGE_LOOP_PREHEADER);
 9303
          // rscratch2 = number of full evf-element blocks = large loop trip count.
 9304     __ lsr(rscratch2, cnt, exact_log2(evf));
 9305
 9306     if (multiply_by_halves) {
 9307       // 31^4 - multiplier between lower and upper parts of a register
 9308       __ movw(rscratch1, intpow(31U, vf / 2));
 9309       __ mov(vpowm, Assembler::S, 1, rscratch1);
 9310       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 9311       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 9312       __ mov(vpowm, Assembler::S, 0, rscratch1);
 9313     } else {
 9314       // 31^16
 9315       __ movw(rscratch1, intpow(31U, evf));
 9316       __ mov(vpowm, Assembler::S, 0, rscratch1);
 9317     }
 9318
          // vmul0 keeps the value accumulated so far; the other three start at zero.
 9319     __ mov(vmul3, Assembler::T16B, 0);
 9320     __ mov(vmul2, Assembler::T16B, 0);
 9321     __ mov(vmul1, Assembler::T16B, 0);
 9322
 9323     __ bind(LARGE_LOOP);
 9324
 9325     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 9326     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 9327     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 9328     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 9329
          // Load four vectors (evf elements); the first lands in vdata3, the last in vdata0.
 9330     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 9331            Address(__ post(ary, evf * type2aelembytes(eltype))));
 9332
 9333     if (load_arrangement == Assembler::T8B) {
 9334       // Extend 8B to 8H to be able to use vector multiply
 9335       // instructions
 9336       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 9337       if (is_signed_subword_type(eltype)) {
 9338         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 9339         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 9340         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 9341         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9342       } else {
 9343         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 9344         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 9345         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 9346         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 9347       }
 9348     }
 9349
 9350     switch (load_arrangement) {
 9351     case Assembler::T4S:
 9352       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 9353       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 9354       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 9355       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 9356       break;
 9357     case Assembler::T8B:
 9358     case Assembler::T8H:
 9359       assert(is_subword_type(eltype), "subword type expected");
 9360       if (is_signed_subword_type(eltype)) {
 9361         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 9362         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 9363         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 9364         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9365       } else {
 9366         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 9367         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 9368         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 9369         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 9370       }
 9371       break;
 9372     default:
 9373       __ should_not_reach_here();
 9374     }
 9375
 9376     // Process the upper half of a vector
 9377     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
            // Lane 1 of vpowm holds the lower/upper-half multiplier set in the preheader.
 9378       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 9379       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 9380       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 9381       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 9382       if (is_signed_subword_type(eltype)) {
 9383         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 9384         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 9385         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 9386         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9387       } else {
 9388         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 9389         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 9390         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 9391         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 9392       }
 9393     }
 9394
 9395     __ subsw(rscratch2, rscratch2, 1);
 9396     __ br(Assembler::HI, LARGE_LOOP);
 9397
          // LARGE LOOP'S EPILOGUE: reduce each accumulator to a scalar hN (weight
          // with vpow, sum across lanes) and chain them with rscratch2 = 31^vf:
          //   result = ((h3 * 31^vf + h2) * 31^vf + h1) * 31^vf + h0
 9398     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 9399     __ addv(vmul3, Assembler::T4S, vmul3);
 9400     __ umov(result, vmul3, Assembler::S, 0);
 9401
 9402     __ mov(rscratch2, intpow(31U, vf));
 9403
 9404     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 9405     __ addv(vmul2, Assembler::T4S, vmul2);
 9406     __ umov(rscratch1, vmul2, Assembler::S, 0);
 9407     __ maddw(result, result, rscratch2, rscratch1);
 9408
 9409     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 9410     __ addv(vmul1, Assembler::T4S, vmul1);
 9411     __ umov(rscratch1, vmul1, Assembler::S, 0);
 9412     __ maddw(result, result, rscratch2, rscratch1);
 9413
 9414     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 9415     __ addv(vmul0, Assembler::T4S, vmul0);
 9416     __ umov(rscratch1, vmul0, Assembler::S, 0);
 9417     __ maddw(result, result, rscratch2, rscratch1);
 9418
          // Any remaining cnt % vf elements are handled by the scalar tail.
 9419     __ andr(rscratch2, cnt, vf - 1);
 9420     __ cbnz(rscratch2, TAIL_SHORTCUT);
 9421
 9422     __ leave();
 9423     __ ret(lr);
 9424
 9425     // record the stub entry and end
 9426     store_archive_data(stub_id, entry, __ pc());
 9427
 9428     return entry;
 9429   }
 9430 
 9431   address generate_dsin_dcos(bool isCos) {
          // Emits the dcos stub (isCos == true) or the dsin stub (isCos == false)
          // and returns its entry address. The instruction sequence itself comes
          // from the assembler's generate_dsin_dcos(); this wrapper only picks the
          // stub id, aligns the entry and records the stub bounds.
 9432     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 9433     int entry_count = StubInfo::entry_count(stub_id);
 9434     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, return it
          // and emit nothing.
 9435     address start = load_archive_data(stub_id);
 9436     if (start != nullptr) {
 9437       return start;
 9438     }
 9439     __ align(CodeEntryAlignment);
 9440     StubCodeMark mark(this, stub_id);
 9441     start = __ pc();
          // The arguments after isCos are addresses of StubRoutines::aarch64 tables.
 9442     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 9443         (address)StubRoutines::aarch64::_two_over_pi,
 9444         (address)StubRoutines::aarch64::_pio2,
 9445         (address)StubRoutines::aarch64::_dsin_coef,
 9446         (address)StubRoutines::aarch64::_dcos_coef);
 9447
 9448     // record the stub entry and end
 9449     store_archive_data(stub_id, start, __ pc());
 9450
 9451     return start;
 9452   }
 9453 
 9454   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
 9455   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 9456       Label &DIFF2) {
          // Emits four 8-byte comparisons between Latin1 data widened to 16-bit
          // characters and UTF-16 data. Besides the parameters, the emitted code
          // uses fixed registers shared with the caller:
          //   tmp2 (r11) - pointer to the Latin1 data, advanced by 16
          //   cnt1 (r2)  - pointer to the UTF-16 data, advanced by 32
          //   tmp3 (r12) - must already hold the next 8 UTF-16 bytes on entry and
          //                holds the 8 bytes following the compared range on exit
          //   vtmpZ (v0) - used as the zip source for the zero bytes; the caller
          //                clears it beforehand
          // A mismatch branches to DIFF2 when the UTF-16 word is in tmp3 and to
          // DIFF1 when it is in tmpU; the widened Latin1 word is in tmpL either way.
 9457     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 9458     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 9459
 9460     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 9461     __ ldr(tmpU, Address(__ post(cnt1, 8)));
          // Interleave the low 8 Latin1 bytes with bytes from vtmpZ.
 9462     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 9463     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 9464
          // Characters 0-3: low half of vtmp3 against the preloaded tmp3.
 9465     __ fmovd(tmpL, vtmp3);
 9466     __ eor(rscratch2, tmp3, tmpL);
 9467     __ cbnz(rscratch2, DIFF2);
 9468
          // Characters 4-7: high half of vtmp3 against tmpU; tmp3 is reloaded meanwhile.
 9469     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 9470     __ umov(tmpL, vtmp3, __ D, 1);
 9471     __ eor(rscratch2, tmpU, tmpL);
 9472     __ cbnz(rscratch2, DIFF1);
 9473
          // Characters 8-11: widen the high 8 Latin1 bytes in place, compare with tmp3.
 9474     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 9475     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 9476     __ fmovd(tmpL, vtmp);
 9477     __ eor(rscratch2, tmp3, tmpL);
 9478     __ cbnz(rscratch2, DIFF2);
 9479
          // Characters 12-15; the ldr preloads tmp3 for whatever code comes next.
 9480     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 9481     __ umov(tmpL, vtmp, __ D, 1);
 9482     __ eor(rscratch2, tmpU, tmpL);
 9483     __ cbnz(rscratch2, DIFF1);
 9484   }
 9485 
 9486   // r0  = result
 9487   // r1  = str1
 9488   // r2  = cnt1
 9489   // r3  = str2
 9490   // r4  = cnt2
 9491   // r10 = tmp1
 9492   // r11 = tmp2
 9493   address generate_compare_long_string_different_encoding(bool isLU) {
          // Emits the long-string compare stub for strings with different
          // encodings and returns its entry address: isLU selects the LU stub id
          // (Latin1 data at str1, UTF-16 data at str2), otherwise the UL stub id
          // (the reverse). The first 4 characters are expected to be loaded
          // before the stub is entered (see the comments at the start of the
          // emitted code). On a mismatch result receives the difference of the
          // first differing characters; otherwise result is not written here.
 9494     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 9495     int entry_count = StubInfo::entry_count(stub_id);
 9496     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, return it
          // and emit nothing.
 9497     address start = load_archive_data(stub_id);
 9498     if (start != nullptr) {
 9499       return start;
 9500     }
 9501     __ align(CodeEntryAlignment);
 9502     StubCodeMark mark(this, stub_id);
 9503     address entry = __ pc();
 9504     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 9505         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 9506         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 9507     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9508         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 9509     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
          // tmp3/tmp4 (r12/r14) are saved around the loops and restored on every exit.
 9510     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 9511
 9512     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 9513
          // Zero vector: the zip source that widens Latin1 bytes to 16-bit characters.
 9514     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
 9515     // cnt2 == amount of characters left to compare
 9516     // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
 9517     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
          // Step past the 4 preloaded characters: 4 bytes of Latin1, 8 bytes of UTF-16.
 9518     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 9519     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 9520     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 9521     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
 9522     __ eor(rscratch2, tmp1, tmp2);
          // CALCULATE_DIFFERENCE compares tmp1 with rscratch1, so copy tmp2 there.
 9523     __ mov(rscratch1, tmp2);
 9524     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 9525     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 9526              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 9527     __ push(spilled_regs, sp);
 9528     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 9529     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 9530
          // Preload the first UTF-16 word, as compare_string_16_x_LU expects.
 9531     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 9532
 9533     if (SoftwarePrefetchHintDistance >= 0) {
 9534       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 9535       __ br(__ LT, NO_PREFETCH);
            // Each pass compares 64 characters as 2 + 2 groups of 16, issuing a
            // second UTF-16 prefetch between the two pairs.
 9536       __ bind(LARGE_LOOP_PREFETCH);
 9537         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 9538         __ mov(tmp4, 2);
 9539         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 9540         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 9541           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 9542           __ subs(tmp4, tmp4, 1);
 9543           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 9544           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 9545           __ mov(tmp4, 2);
 9546         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 9547           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 9548           __ subs(tmp4, tmp4, 1);
 9549           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 9550           __ sub(cnt2, cnt2, 64);
 9551           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 9552           __ br(__ GE, LARGE_LOOP_PREFETCH);
 9553     }
 9554     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 9555     __ bind(NO_PREFETCH);
 9556     __ subs(cnt2, cnt2, 16);
 9557     __ br(__ LT, TAIL);
 9558     __ align(OptoLoopAlignment);
 9559     __ bind(SMALL_LOOP); // smaller loop
            // The flags set by this subs drive the br(GE) after the compare, so the
            // compare code emitted in between must leave them intact.
 9560       __ subs(cnt2, cnt2, 16);
 9561       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 9562       __ br(__ GE, SMALL_LOOP);
            // cnt2 == -16 means the loop consumed exactly what was left.
 9563       __ cmn(cnt2, (u1)16);
 9564       __ br(__ EQ, LOAD_LAST);
 9565     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
            // cnt2 is negative here, so both pointers step back and the final group
            // of 16 overlaps characters that were already compared.
 9566       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 9567       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
            // Re-establish the tmp3 preload for the adjusted UTF-16 pointer.
 9568       __ ldr(tmp3, Address(cnt1, -8));
 9569       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 9570       __ b(LOAD_LAST);
          // DIFF2: the mismatching UTF-16 word is in tmp3; move it to tmpU so that
          // CALCULATE_DIFFERENCE finds both words in tmp1/rscratch1.
 9571     __ bind(DIFF2);
 9572       __ mov(tmpU, tmp3);
 9573     __ bind(DIFF1);
 9574       __ pop(spilled_regs, sp);
 9575       __ b(CALCULATE_DIFFERENCE);
 9576     __ bind(LOAD_LAST);
 9577       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
 9578       // No need to load it again
 9579       __ mov(tmpU, tmp3);
 9580       __ pop(spilled_regs, sp);
 9581
 9582       // tmp2 points to the address of the last 4 Latin1 characters right now
 9583       __ ldrs(vtmp, Address(tmp2));
 9584       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 9585       __ fmovd(tmpL, vtmp);
 9586
 9587       __ eor(rscratch2, tmpU, tmpL);
 9588       __ cbz(rscratch2, DONE);
 9589
 9590     // Find the first different characters in the longwords and
 9591     // compute their difference.
          // On entry rscratch2 holds the XOR of the two words held in tmp1 and
          // rscratch1.
 9592     __ bind(CALCULATE_DIFFERENCE);
            // rev + clz give the bit offset of the first differing byte counted
            // from the least significant end; the andr rounds it down to a
            // 16-bit character boundary.
 9593       __ rev(rscratch2, rscratch2);
 9594       __ clz(rscratch2, rscratch2);
 9595       __ andr(rscratch2, rscratch2, -16);
 9596       __ lsrv(tmp1, tmp1, rscratch2);
 9597       __ uxthw(tmp1, tmp1);
 9598       __ lsrv(rscratch1, rscratch1, rscratch2);
 9599       __ uxthw(rscratch1, rscratch1);
 9600       __ subw(result, tmp1, rscratch1);
 9601     __ bind(DONE);
 9602       __ ret(lr);
 9603
 9604       // record the stub entry and end
 9605       store_archive_data(stub_id, entry, __ pc());
 9606
 9607       return entry;
 9608   }
 9609 
 9610   // r0 = input (float16)
 9611   // v0 = result (float)
 9612   // v1 = temporary float register
 9613   address generate_float16ToFloat() {
          // Emits the stub for StubId::stubgen_hf2f_id and returns its entry
          // address. The stub body is a single flt16_to_flt(v0, r0, v1) followed
          // by a return.
 9614     StubId stub_id = StubId::stubgen_hf2f_id;
 9615     int entry_count = StubInfo::entry_count(stub_id);
 9616     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, return it
          // and emit nothing.
 9617     address start = load_archive_data(stub_id);
 9618     if (start != nullptr) {
 9619       return start;
 9620     }
 9621     __ align(CodeEntryAlignment);
 9622     StubCodeMark mark(this, stub_id);
 9623     address entry = __ pc();
 9624     BLOCK_COMMENT("Entry:");
 9625     __ flt16_to_flt(v0, r0, v1);
 9626     __ ret(lr);
 9627
 9628     // record the stub entry and end
 9629     store_archive_data(stub_id, entry, __ pc());
 9630
 9631     return entry;
 9632   }
 9633 
 9634   // v0 = input (float)
 9635   // r0 = result (float16)
 9636   // v1 = temporary float register
 9637   address generate_floatToFloat16() {
          // Emits the stub for StubId::stubgen_f2hf_id and returns its entry
          // address. The stub body is a single flt_to_flt16(r0, v0, v1) followed
          // by a return.
 9638     StubId stub_id = StubId::stubgen_f2hf_id;
 9639     int entry_count = StubInfo::entry_count(stub_id);
 9640     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, return it
          // and emit nothing.
 9641     address start = load_archive_data(stub_id);
 9642     if (start != nullptr) {
 9643       return start;
 9644     }
 9645     __ align(CodeEntryAlignment);
 9646     StubCodeMark mark(this, stub_id);
 9647     address entry = __ pc();
 9648     BLOCK_COMMENT("Entry:");
 9649     __ flt_to_flt16(r0, v0, v1);
 9650     __ ret(lr);
 9651
 9652     // record the stub entry and end
 9653     store_archive_data(stub_id, entry, __ pc());
 9654
 9655     return entry;
 9656   }
 9657 
 9658   address generate_method_entry_barrier() {
          // Emits the stub for StubId::stubgen_method_entry_barrier_id and returns
          // its entry address. The emitted code calls
          // BarrierSetNMethod::nmethod_stub_entry_barrier with a pointer to the
          // saved lr. A zero result returns normally; a non-zero result switches
          // to the {sp, fp, lr, pc} found in the four stack words reserved for
          // that purpose.
 9659     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 9660     int entry_count = StubInfo::entry_count(stub_id);
 9661     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, return it
          // and emit nothing.
 9662     address start = load_archive_data(stub_id);
 9663     if (start != nullptr) {
 9664       return start;
 9665     }
 9666     __ align(CodeEntryAlignment);
 9667     StubCodeMark mark(this, stub_id);
 9668
 9669     Label deoptimize_label;
 9670
 9671     start = __ pc();
 9672
 9673     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 9674
 9675     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 9676       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 9677       // We can get here despite the nmethod being good, if we have not
 9678       // yet applied our cross modification fence (or data fence).
            // The per-thread epoch word sits 4 bytes past the thread's disarmed
            // guard value; copy the global patching epoch into it, then fence.
 9679       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 9680       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 9681       __ ldrw(rscratch2, rscratch2);
 9682       __ strw(rscratch2, thread_epoch_addr);
 9683       __ isb();
 9684       __ membar(__ LoadLoad);
 9685     }
 9686
 9687     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 9688
 9689     __ enter();
 9690     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 9691
 9692     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 9693
 9694     __ push_call_clobbered_registers();
 9695
          // Single argument: the address of the saved lr computed above.
 9696     __ mov(c_rarg0, rscratch2);
 9697     __ call_VM_leaf
 9698          (CAST_FROM_FN_PTR
 9699           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 9700
 9701     __ reset_last_Java_frame(true);
 9702
          // Stash the call result before r0 is restored by the pop below.
 9703     __ mov(rscratch1, r0);
 9704
 9705     __ pop_call_clobbered_registers();
 9706
 9707     __ cbnz(rscratch1, deoptimize_label);
 9708
 9709     __ leave();
 9710     __ ret(lr);
 9711
 9712     __ BIND(deoptimize_label);
 9713
          // Read back the four reserved words (presumably filled in by the
          // runtime call - confirm against nmethod_stub_entry_barrier) and
          // continue at the new pc with the new sp, rfp and lr.
 9714     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 9715     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 9716
 9717     __ mov(sp, rscratch1);
 9718     __ br(rscratch2);
 9719
 9720     // record the stub entry and end
 9721     store_archive_data(stub_id, start, __ pc());
 9722
 9723     return start;
 9724   }
 9725 
 9726   // r0  = result
 9727   // r1  = str1
 9728   // r2  = cnt1
 9729   // r3  = str2
 9730   // r4  = cnt2
 9731   // r10 = tmp1
 9732   // r11 = tmp2
 9733   address generate_compare_long_string_same_encoding(bool isLL) {
          // Emits the long-string compare stub for two strings with the same
          // encoding and returns its entry address: isLL selects the LL stub id
          // (1-byte characters), otherwise the UU stub id (2-byte characters).
          // cnt2 counts characters while str1/str2 advance in bytes, hence the
          // isLL-dependent constants below. On a mismatch result receives the
          // difference of the first differing characters; otherwise result is
          // not written here.
 9734     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 9735     int entry_count = StubInfo::entry_count(stub_id);
 9736     assert(entry_count == 1, "sanity check");
          // If load_archive_data() supplies an address for this stub, return it
          // and emit nothing.
 9737     address start = load_archive_data(stub_id);
 9738     if (start != nullptr) {
 9739       return start;
 9740     }
 9741     __ align(CodeEntryAlignment);
 9742     StubCodeMark mark(this, stub_id);
 9743     address entry = __ pc();
 9744     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9745         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9746
 9747     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9748
 9749     // exit from large loop when less than 64 bytes left to read or we're about
 9750     // to prefetch memory behind array border
 9751     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 9752
 9753     // before jumping to stub, pre-load 8 bytes already, so do comparison directly
 9754     __ eor(rscratch2, tmp1, tmp2);
 9755     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9756
          // Account for the 8 preloaded bytes: 8 characters (LL) or 4 (UU).
 9757     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9758     // update pointers, because of previous read
 9759     __ add(str1, str1, wordSize);
 9760     __ add(str2, str2, wordSize);
 9761     if (SoftwarePrefetchHintDistance >= 0) {
 9762       __ align(OptoLoopAlignment);
            // NOTE(review): this loop is entered without a preceding cnt2 check, so
            // at least 64 readable bytes per string are presumably guaranteed by
            // the caller - confirm.
 9763       __ bind(LARGE_LOOP_PREFETCH);
 9764         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9765         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9766
              // Four 16-byte compares per pass. cmp + ccmp leave NE set if either
              // the low or the high 8-byte pair differs.
 9767         for (int i = 0; i < 4; i++) {
 9768           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9769           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9770           __ cmp(tmp1, tmp2);
 9771           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9772           __ br(Assembler::NE, DIFF);
 9773         }
 9774         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9775         __ add(str1, str1, 64);
 9776         __ add(str2, str2, 64);
 9777         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9778         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9779         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9780     }
 9781
 9782     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9783     __ br(Assembler::LE, LESS16);
 9784     __ align(OptoLoopAlignment);
          // 16 bytes per step, unrolled twice, with an exit check after each step.
 9785     __ bind(LOOP_COMPARE16);
 9786       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9787       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9788       __ cmp(tmp1, tmp2);
 9789       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9790       __ br(Assembler::NE, DIFF);
 9791       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9792       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9793       __ br(Assembler::LT, LESS16);
 9794
 9795       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9796       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9797       __ cmp(tmp1, tmp2);
 9798       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9799       __ br(Assembler::NE, DIFF);
 9800       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9801       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9802       __ br(Assembler::GE, LOOP_COMPARE16);
 9803       __ cbz(cnt2, LENGTH_DIFF);
 9804
 9805     __ bind(LESS16);
 9806       // each 8 compare
 9807       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9808       __ br(Assembler::LE, LESS8);
 9809       __ ldr(tmp1, Address(__ post(str1, 8)));
 9810       __ ldr(tmp2, Address(__ post(str2, 8)));
 9811       __ eor(rscratch2, tmp1, tmp2);
 9812       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9813       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9814
 9815     __ bind(LESS8); // directly load last 8 bytes
            // cnt2 is <= 0 on every path here. For UU it is doubled to turn the
            // character count into a byte offset; the loads below then read from
            // at or before the current pointers and may overlap compared bytes.
 9816       if (!isLL) {
 9817         __ add(cnt2, cnt2, cnt2);
 9818       }
 9819       __ ldr(tmp1, Address(str1, cnt2));
 9820       __ ldr(tmp2, Address(str2, cnt2));
 9821       __ eor(rscratch2, tmp1, tmp2);
 9822       __ cbz(rscratch2, LENGTH_DIFF);
 9823       __ b(CAL_DIFFERENCE);
 9824
          // DIFF: one of the two 8-byte pairs differs. Keep the low pair if it is
          // the differing one, otherwise move the high pair into tmp1/tmp2.
 9825     __ bind(DIFF);
 9826       __ cmp(tmp1, tmp2);
 9827       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9828       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9829       // reuse rscratch2 register for the result of eor instruction
 9830       __ eor(rscratch2, tmp1, tmp2);
 9831
          // CAL_DIFFERENCE: on entry rscratch2 == tmp1 ^ tmp2 (non-zero).
 9832     __ bind(CAL_DIFFERENCE);
            // rev + clz give the bit offset of the first differing byte counted
            // from the least significant end; the andr rounds it down to a
            // character boundary (8 bits for LL, 16 bits for UU).
 9833       __ rev(rscratch2, rscratch2);
 9834       __ clz(rscratch2, rscratch2);
 9835       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9836       __ lsrv(tmp1, tmp1, rscratch2);
 9837       __ lsrv(tmp2, tmp2, rscratch2);
 9838       if (isLL) {
 9839         __ uxtbw(tmp1, tmp1);
 9840         __ uxtbw(tmp2, tmp2);
 9841       } else {
 9842         __ uxthw(tmp1, tmp1);
 9843         __ uxthw(tmp2, tmp2);
 9844       }
 9845       __ subw(result, tmp1, tmp2);
 9846
          // LENGTH_DIFF: reached directly when no differing character was found;
          // result is then returned as it was on entry (presumably preset by the
          // caller - confirm).
 9847     __ bind(LENGTH_DIFF);
 9848       __ ret(lr);
 9849
 9850     // record the stub entry and end
 9851     store_archive_data(stub_id, entry, __ pc());
 9852
 9853     return entry;
 9854   }
 9855 
 9856   enum string_compare_mode {
          // Selects which pair of element widths generate_compare_long_string_sve
          // uses when loading (str1, str2); see its LOAD_PAIR macro.
 9857     LL,  // str1 and str2 are both loaded as bytes
 9858     LU,  // str1 is loaded as bytes, str2 as halfwords
 9859     UL,  // str1 is loaded as halfwords, str2 as bytes
 9860     UU,  // str1 and str2 are both loaded as halfwords
 9861   };
 9862 
 9863   // The following registers are declared in aarch64.ad
 9864   // r0  = result
 9865   // r1  = str1
 9866   // r2  = cnt1
 9867   // r3  = str2
 9868   // r4  = cnt2
 9869   // r10 = tmp1
 9870   // r11 = tmp2
 9871   // z0  = ztmp1
 9872   // z1  = ztmp2
 9873   // p0  = pgtmp1
 9874   // p1  = pgtmp2
        //
        // Generates the SVE stub that compares two long strings for the given
        // encoding pair, or returns the archived copy when load_archive_data()
        // finds one.
        //
        // The emitted code walks both strings one vector at a time, using cnt2
        // (the minimum of the two lengths) as the element count. At the first
        // vector containing a mismatch it writes
        //   result = (first differing element of str1) - (that of str2).
        // When no mismatch is found the stub returns without writing result.
        // rscratch1, rscratch2, tmp1, tmp2, ztmp1, ztmp2, pgtmp1 and pgtmp2 are
        // overwritten by the emitted code.
        //
        // Returns the entry address of the stub.
 9875   address generate_compare_long_string_sve(string_compare_mode mode) {
 9876     StubId stub_id;
 9877     switch (mode) {
 9878       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9879       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9880       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9881       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9882       default: ShouldNotReachHere();
 9883     }
 9884     int entry_count = StubInfo::entry_count(stub_id);
 9885     assert(entry_count == 1, "sanity check");
          // Reuse a previously archived stub instead of emitting a new one.
 9886     address start = load_archive_data(stub_id);
 9887     if (start != nullptr) {
 9888       return start;
 9889     }
 9890     __ align(CodeEntryAlignment);
 9891     StubCodeMark mark(this, stub_id);
 9892     address entry = __ pc();
 9893     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9894              tmp1 = r10, tmp2 = r11;
 9895
 9896     Label LOOP, DONE, MISMATCH;
 9897     Register vec_len = tmp1;
 9898     Register idx = tmp2;
 9899     // The minimum of the string lengths has been stored in cnt2.
 9900     Register cnt = cnt2;
 9901     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9902     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9903
          // LOAD_PAIR emits one predicated load from each string at element index
          // idx: src1 into ztmp1 and src2 into ztmp2. The load width per string
          // follows 'mode'; halfword loads scale idx by 2 (lsl 1), and in the
          // mixed modes the byte-wide string is loaded into halfword lanes so both
          // vectors can be compared lane by lane.
 9904 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9905     switch (mode) {                                                            \
 9906       case LL:                                                                 \
 9907         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(src1, idx));                  \
 9908         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(src2, idx));                  \
 9909         break;                                                                 \
 9910       case LU:                                                                 \
 9911         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(src1, idx));                  \
 9912         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(src2, idx, Address::lsl(1))); \
 9913         break;                                                                 \
 9914       case UL:                                                                 \
 9915         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(src1, idx, Address::lsl(1))); \
 9916         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(src2, idx));                  \
 9917         break;                                                                 \
 9918       case UU:                                                                 \
 9919         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(src1, idx, Address::lsl(1))); \
 9920         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(src2, idx, Address::lsl(1))); \
 9921         break;                                                                 \
 9922       default:                                                                 \
 9923         ShouldNotReachHere();                                                  \
 9924     }
 9925
          // Start at element 0 and build the governing predicate from idx and cnt.
 9926     __ mov(idx, 0);
 9927     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9928
          // vec_len = number of compared elements per vector (bytes for LL,
          // halfwords otherwise); it is the per-iteration step of idx.
 9929     if (mode == LL) {
 9930       __ sve_cntb(vec_len);
 9931     } else {
 9932       __ sve_cnth(vec_len);
 9933     }
 9934
          // rscratch1 = cnt - vec_len: the main loop continues while idx is below it.
 9935     __ sub(rscratch1, cnt, vec_len);
 9936
 9937     __ bind(LOOP);
 9938
 9939       // main loop
 9940       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, str1, str2, idx);
 9941       __ add(idx, idx, vec_len);
 9942       // Compare strings.
 9943       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9944       __ br(__ NE, MISMATCH);
 9945       __ cmp(idx, rscratch1);
 9946       __ br(__ LT, LOOP);
 9947
 9948     // post loop, last iteration
          // Rebuild the predicate from the advanced idx so the final load is
          // governed by the remaining element count.
 9949     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9950
 9951     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, str1, str2, idx);
 9952     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
          // No differing element in the tail either: return, leaving result as is.
 9953     __ br(__ EQ, DONE);
 9954
 9955     __ bind(MISMATCH);
 9956
 9957     // Crop the vector to find its location.
 9958     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9959     // Extract the first different characters of each string.
 9960     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9961     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9962
 9963     // Compute the difference of the first different characters.
 9964     __ sub(result, rscratch1, rscratch2);
 9965
 9966     __ bind(DONE);
 9967     __ ret(lr);
 9968 #undef LOAD_PAIR
 9969
 9970     // record the stub entry and end
 9971     store_archive_data(stub_id, entry, __ pc());
 9972
 9973     return entry;
 9974   }
 9975 
 9976   void generate_compare_long_strings() {
          // Installs the four long-string compare stubs. With UseSVE == 0 the
          // same-encoding / different-encoding generators are used; otherwise the
          // SVE generator produces every variant. In both cases the stubs are
          // generated in the order LL, UU, LU, UL.
          const bool use_sve = (UseSVE != 0);

          StubRoutines::aarch64::_compare_long_string_LL = use_sve
              ? generate_compare_long_string_sve(LL)
              : generate_compare_long_string_same_encoding(true);

          StubRoutines::aarch64::_compare_long_string_UU = use_sve
              ? generate_compare_long_string_sve(UU)
              : generate_compare_long_string_same_encoding(false);

          StubRoutines::aarch64::_compare_long_string_LU = use_sve
              ? generate_compare_long_string_sve(LU)
              : generate_compare_long_string_different_encoding(true);

          StubRoutines::aarch64::_compare_long_string_UL = use_sve
              ? generate_compare_long_string_sve(UL)
              : generate_compare_long_string_different_encoding(false);
 9996   }
 9997 
 9998   // R0 = result
 9999   // R1 = str2
10000   // R2 = cnt1
10001   // R3 = str1
10002   // R4 = cnt2
10003   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
10004   //
10005   // This generic linear code uses a few additional ideas, which make it faster:
10006   // 1) we can safely keep at least the 1st register of the pattern (since
10007   // length >= 8) in order to skip initial loading (helps on systems with 1 ld pipeline)
10008   // 2) we can use a "fast" algorithm for finding a single character to search for
10009   // the first symbol with fewer branches (1 branch per loaded register instead
10010   // of a branch for each symbol), so, this is where constants like
10011   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
10012   // 3) after loading and analyzing the 1st register of the source string, it can be
10013   // used to search for every 1st character entry, saving a few loads in
10014   // comparison with a "simpler-but-slower" implementation
10015   // 4) in order to avoid lots of push/pop operations, the code below is heavily
10016   // re-using/re-initializing/compressing register values, which makes the code
10017   // larger and a bit less readable, however, most of the extra operations are
10018   // issued during loads or branches, so, the penalty is minimal
        //
        // Generates the linear indexOf stub for one encoding pair, or returns the
        // archived copy when load_archive_data() finds one.
        //   str1_isL / str2_isL - true selects 1-byte characters for str1 / str2,
        //                         false selects 2-byte characters.
        // The combination (str1_isL == false, str2_isL == true) is not supported.
        // In the emitted code str1 supplies the characters being searched for and
        // str2 is the string being scanned; result is set to -1 when the NOMATCH
        // label is reached. r20-r23 are used as temporaries and are pushed on
        // entry and popped before returning.
        // Returns the entry address of the stub.
10019   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
10020     StubId stub_id;
10021     if (str1_isL) {
10022       if (str2_isL) {
10023         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
10024       } else {
10025         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
10026       }
10027     } else {
10028       if (str2_isL) {
10029         ShouldNotReachHere();
10030       } else {
10031         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
10032       }
10033     }
10034     int entry_count = StubInfo::entry_count(stub_id);
10035     assert(entry_count == 1, "sanity check");
          // Reuse a previously archived stub instead of emitting a new one.
10036     address start = load_archive_data(stub_id);
10037     if (start != nullptr) {
10038       return start;
10039     }
10040     __ align(CodeEntryAlignment);
10041     StubCodeMark mark(this, stub_id);
10042     address entry = __ pc();
10043
          // Per-string character size in bytes and the matching log2 shift.
10044     int str1_chr_size = str1_isL ? 1 : 2;
10045     int str2_chr_size = str2_isL ? 1 : 2;
10046     int str1_chr_shift = str1_isL ? 0 : 1;
10047     int str2_chr_shift = str2_isL ? 0 : 1;
10048     bool isL = str1_isL && str2_isL;
10049    // parameters
10050     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
10051     // temporary registers
10052     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
10053     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
10054     // redefinitions
10055     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
10056
10057     __ push(spilled_regs, sp);
10058     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
10059         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
10060         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
10061         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
10062         L_SMALL_CMP_LOOP_LAST_CMP2, L_SMALL_CMP_LOOP_LAST_CMP2,
10063         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
10064     // Read whole register from str1. It is safe, because length >=8 here
10065     __ ldr(ch1, Address(str1));
10066     // Read whole register from str2. It is safe, because length >=8 here
10067     __ ldr(ch2, Address(str2));
10068     __ sub(cnt2, cnt2, cnt1);
          // Isolate the first character of str1 ...
10069     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
10070     if (str1_isL != str2_isL) {
            // Mixed encodings: v0 is zeroed so zip1 below can interleave str1's
            // bytes with zero bytes.
10071       __ eor(v0, __ T16B, v0, v0);
10072     }
          // ... and replicate it into every str2-sized lane of 'first' by
          // multiplying with the 0x01..01 / 0x0001..0001 constant kept in tmp1.
10073     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
10074     __ mul(first, first, tmp1);
10075     // check if we have less than 1 register to check
10076     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
10077     if (str1_isL != str2_isL) {
10078       __ fmovd(v1, ch1);
10079     }
10080     __ br(__ LE, L_SMALL);
          // ch2 ^= first, then (x - 0x01..) & ~(x | 0x7f..) is computed by the
          // sub/orr/bics sequence; a non-zero result (NE) branches to L_HAS_ZERO.
10081     __ eor(ch2, first, ch2);
10082     if (str1_isL != str2_isL) {
10083       __ zip1(v1, __ T16B, v1, v0);
10084     }
10085     __ sub(tmp2, ch2, tmp1);
10086     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10087     __ bics(tmp2, tmp2, ch2);
10088     if (str1_isL != str2_isL) {
10089       __ fmovd(ch1, v1);
10090     }
10091     __ br(__ NE, L_HAS_ZERO);
10092     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
10093     __ add(result, result, wordSize/str2_chr_size);
10094     __ add(str2, str2, wordSize);
10095     __ br(__ LT, L_POST_LOOP);
          // Main loop: one 8-byte register of str2 per iteration.
10096     __ BIND(L_LOOP);
10097       __ ldr(ch2, Address(str2));
10098       __ eor(ch2, first, ch2);
10099       __ sub(tmp2, ch2, tmp1);
10100       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10101       __ bics(tmp2, tmp2, ch2);
10102       __ br(__ NE, L_HAS_ZERO);
10103     __ BIND(L_LOOP_PROCEED);
10104       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
10105       __ add(str2, str2, wordSize);
10106       __ add(result, result, wordSize/str2_chr_size);
10107       __ br(__ GE, L_LOOP);
10108     __ BIND(L_POST_LOOP);
10109       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
10110       __ br(__ LE, NOMATCH);
10111       __ ldr(ch2, Address(str2));
            // cnt2 = -(cnt2 << (3 + str2_chr_shift)): used below as the shift
            // amount that builds the mask in tmp4.
10112       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
10113       __ eor(ch2, first, ch2);
10114       __ sub(tmp2, ch2, tmp1);
10115       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10116       __ mov(tmp4, -1); // all bits set
10117       __ b(L_SMALL_PROCEED);
10118     __ align(OptoLoopAlignment);
          // Less than one full register of str2 positions to check.
10119     __ BIND(L_SMALL);
10120       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
10121       __ eor(ch2, first, ch2);
10122       if (str1_isL != str2_isL) {
10123         __ zip1(v1, __ T16B, v1, v0);
10124       }
10125       __ sub(tmp2, ch2, tmp1);
10126       __ mov(tmp4, -1); // all bits set
10127       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10128       if (str1_isL != str2_isL) {
10129         __ fmovd(ch1, v1); // move converted 4 symbols
10130       }
10131     __ BIND(L_SMALL_PROCEED);
10132       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
10133       __ bic(tmp2, tmp2, ch2);
10134       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
            // rbit does not touch the flags, so the branch below still tests the
            // result of the ands above.
10135       __ rbit(tmp2, tmp2);
10136       __ br(__ EQ, NOMATCH);
10137     __ BIND(L_SMALL_HAS_ZERO_LOOP);
10138       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
10139       __ cmp(cnt1, u1(wordSize/str2_chr_size));
10140       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
10141       if (str2_isL) { // LL
10142         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
10143         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
10144         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
10145         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
10146         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10147       } else {
10148         __ mov(ch2, 0xE); // all bits in byte set except last one
10149         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10150         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10151         __ lslv(tmp2, tmp2, tmp4);
10152         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10153         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10154         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10155         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10156       }
            // First register of the candidate vs. first register of str1; tmp4
            // then becomes the character index where L_SMALL_CMP_LOOP continues.
10157       __ cmp(ch1, ch2);
10158       __ mov(tmp4, wordSize/str2_chr_size);
10159       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
          // Character-by-character comparison of the remaining characters.
10160     __ BIND(L_SMALL_CMP_LOOP);
10161       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
10162                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
10163       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
10164                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
10165       __ add(tmp4, tmp4, 1);
10166       __ cmp(tmp4, cnt1);
10167       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
10168       __ cmp(first, ch2);
10169       __ br(__ EQ, L_SMALL_CMP_LOOP);
10170     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
10171       __ cbz(tmp2, NOMATCH); // no more matches. exit
10172       __ clz(tmp4, tmp2);
10173       __ add(result, result, 1); // advance index
10174       __ add(str2, str2, str2_chr_size); // advance pointer
10175       __ b(L_SMALL_HAS_ZERO_LOOP);
10176     __ align(OptoLoopAlignment);
10177     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
10178       __ cmp(first, ch2);
10179       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10180       __ b(DONE);
10181     __ align(OptoLoopAlignment);
          // Taken when cnt1 <= wordSize/str2_chr_size: only the single register
          // comparison below decides the match.
10182     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
10183       if (str2_isL) { // LL
10184         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
10185         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
10186         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
10187         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
10188         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10189       } else {
10190         __ mov(ch2, 0xE); // all bits in byte set except last one
10191         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10192         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10193         __ lslv(tmp2, tmp2, tmp4);
10194         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10195         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10196         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10197         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10198       }
10199       __ cmp(ch1, ch2);
10200       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10201       __ b(DONE);
10202     __ align(OptoLoopAlignment);
          // Reached from the main loop when the current str2 register contains at
          // least one candidate position for the first character.
10203     __ BIND(L_HAS_ZERO);
10204       __ rbit(tmp2, tmp2);
10205       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
10206       // Now, perform compression of counters(cnt2 and cnt1) into one register.
10207       // It's fine because both counters are 32bit and are not changed in this
10208       // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
10209       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
10210       __ sub(result, result, 1);
10211     __ BIND(L_HAS_ZERO_LOOP);
10212       __ mov(cnt1, wordSize/str2_chr_size);
            // The original cnt1 now lives in the upper 32 bits of cnt2.
10213       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
10214       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
10215       if (str2_isL) {
10216         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
10217         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10218         __ lslv(tmp2, tmp2, tmp4);
10219         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10220         __ add(tmp4, tmp4, 1);
10221         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10222         __ lsl(tmp2, tmp2, 1);
10223         __ mov(tmp4, wordSize/str2_chr_size);
10224       } else {
10225         __ mov(ch2, 0xE);
10226         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10227         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10228         __ lslv(tmp2, tmp2, tmp4);
10229         __ add(tmp4, tmp4, 1);
10230         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10231         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
10232         __ lsl(tmp2, tmp2, 1);
10233         __ mov(tmp4, wordSize/str2_chr_size);
10234         __ sub(str2, str2, str2_chr_size);
10235       }
10236       __ cmp(ch1, ch2);
            // NOTE(review): both branches above already set tmp4 to this value, so
            // this mov looks redundant - confirm before removing.
10237       __ mov(tmp4, wordSize/str2_chr_size);
10238       __ br(__ NE, L_CMP_LOOP_NOMATCH);
          // Character-by-character comparison; cnt1 is used as a scratch register
          // here because its value is preserved in the upper half of cnt2.
10239     __ BIND(L_CMP_LOOP);
10240       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
10241                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
10242       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
10243                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
10244       __ add(tmp4, tmp4, 1);
10245       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
10246       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
10247       __ cmp(cnt1, ch2);
10248       __ br(__ EQ, L_CMP_LOOP);
10249     __ BIND(L_CMP_LOOP_NOMATCH);
10250       // here we're not matched
10251       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
10252       __ clz(tmp4, tmp2);
10253       __ add(str2, str2, str2_chr_size); // advance pointer
10254       __ b(L_HAS_ZERO_LOOP);
10255     __ align(OptoLoopAlignment);
10256     __ BIND(L_CMP_LOOP_LAST_CMP);
10257       __ cmp(cnt1, ch2);
10258       __ br(__ NE, L_CMP_LOOP_NOMATCH);
10259       __ b(DONE);
10260     __ align(OptoLoopAlignment);
10261     __ BIND(L_CMP_LOOP_LAST_CMP2);
10262       if (str2_isL) {
10263         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
10264         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10265         __ lslv(tmp2, tmp2, tmp4);
10266         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10267         __ add(tmp4, tmp4, 1);
10268         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10269         __ lsl(tmp2, tmp2, 1);
10270       } else {
10271         __ mov(ch2, 0xE);
10272         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10273         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10274         __ lslv(tmp2, tmp2, tmp4);
10275         __ add(tmp4, tmp4, 1);
10276         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10277         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
10278         __ lsl(tmp2, tmp2, 1);
10279         __ sub(str2, str2, str2_chr_size);
10280       }
10281       __ cmp(ch1, ch2);
10282       __ br(__ NE, L_CMP_LOOP_NOMATCH);
10283       __ b(DONE);
10284     __ align(OptoLoopAlignment);
10285     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
10286       // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
10287       // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
10288       // so, result was increased at max by wordSize/str2_chr_size - 1, so,
10289       // respective high bit wasn't changed. L_LOOP_PROCEED will increase
10290       // result by analyzed characters value, so, we can just reset lower bits
10291       // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
10292       // 2) restore cnt1 and cnt2 values from "compressed" cnt2
10293       // 3) advance str2 value to represent next str2 octet. result & 7/3 is
10294       // index of last analyzed substring inside current octet. So, str2 is at
10295       // respective start address. We need to advance it to next octet
10296       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
10297       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
10298       __ bfm(result, zr, 0, 2 - str2_chr_shift);
10299       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
10300       __ movw(cnt2, cnt2);
10301       __ b(L_LOOP_PROCEED);
10302     __ align(OptoLoopAlignment);
10303     __ BIND(NOMATCH);
10304       __ mov(result, -1);
          // Common exit: restore the spilled temporaries and return.
10305     __ BIND(DONE);
10306       __ pop(spilled_regs, sp);
10307       __ ret(lr);
10308
10309     // record the stub entry and end
10310     store_archive_data(stub_id, entry, __ pc());
10311
10312     return entry;
10313   }
10314 
10315   void generate_string_indexof_stubs() {
          // Emit the three supported (str1_isL, str2_isL) variants of the linear
          // indexOf stub, in the order ll, uu, ul, publishing each entry address
          // as soon as its stub has been generated.
          const address ll_entry = generate_string_indexof_linear(true, true);
          StubRoutines::aarch64::_string_indexof_linear_ll = ll_entry;

          const address uu_entry = generate_string_indexof_linear(false, false);
          StubRoutines::aarch64::_string_indexof_linear_uu = uu_entry;

          const address ul_entry = generate_string_indexof_linear(true, false);
          StubRoutines::aarch64::_string_indexof_linear_ul = ul_entry;
10319   }
10320 
10321   void inflate_and_store_2_fp_registers(bool generatePrfm,
10322       FloatRegister src1, FloatRegister src2) {
          // Emits code that interleaves the 16 bytes of src1 and of src2 with the
          // bytes of v0 (zip1 = low half, zip2 = high half) and stores the four
          // resulting 16-byte vectors to r1, post-incrementing r1 by 64.
          //   generatePrfm - when true, also emit a PSTL1STRM prefetch at
          //                  dst + SoftwarePrefetchHintDistance.
          // v1-v4 are overwritten. Each source is fully consumed before the
          // register it may alias is written (v3 is rewritten only after both
          // zips of src1, v4 only by the last zip of src2), so callers may pass
          // v3/v4 as sources.
          // assumes v0 holds zero so the interleave widens bytes to 16 bits - the
          // caller visible here documents "V0 = 0"; confirm for other callers.
10323     Register dst = r1;
10324     __ zip1(v1, __ T16B, src1, v0);
10325     __ zip2(v2, __ T16B, src1, v0);
10326     if (generatePrfm) {
10327       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
10328     }
10329     __ zip1(v3, __ T16B, src2, v0);
10330     __ zip2(v4, __ T16B, src2, v0);
10331     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
10332   }
10333 
10334   // R0 = src
10335   // R1 = dst
10336   // R2 = len
10337   // R3 = len >> 3
10338   // V0 = 0
10339   // v1 = loaded 8 bytes
10340   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
        //
        // Generates the large byte-array inflate stub, or returns the archived
        // copy when load_archive_data() finds one.
        // The emitted code interleaves source bytes with v0 and stores the result
        // at dst. It first handles the 8 bytes already in v1 plus 8 more loaded
        // here, then consumes 64 source bytes (8 octets) per loop iteration, using
        // a prefetching loop while the octet counter is above
        // large_loop_threshold and a plain loop afterwards.
        // Returns the entry address of the stub.
10341   address generate_large_byte_array_inflate() {
10342     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
10343     int entry_count = StubInfo::entry_count(stub_id);
10344     assert(entry_count == 1, "sanity check");
10345     address start = load_archive_data(stub_id);
10346     if (start != nullptr) {
10347       return start;
10348     }
10349     __ align(CodeEntryAlignment);
10350     StubCodeMark mark(this, stub_id);
10351     address entry = __ pc();
10352     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
10353     Register src = r0, dst = r1, len = r2, octetCounter = r3;
          // Octet count above which the prefetching loop is used.
10354     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
10355
10356     // do one more 8-byte read to have address 16-byte aligned in most cases
10357     // also use single store instruction
10358     __ ldrd(v2, __ post(src, 8));
          // Two octets (v1 from the caller, v2 just loaded) are handled here.
10359     __ sub(octetCounter, octetCounter, 2);
10360     __ zip1(v1, __ T16B, v1, v0);
10361     __ zip1(v2, __ T16B, v2, v0);
10362     __ st1(v1, v2, __ T16B, __ post(dst, 32));
          // Preload the first 64 source bytes for whichever loop is entered.
10363     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10364     __ subs(rscratch1, octetCounter, large_loop_threshold);
10365     __ br(__ LE, LOOP_START);
10366     __ b(LOOP_PRFM_START);
          // Prefetching loop: runs while octetCounter > large_loop_threshold.
10367     __ bind(LOOP_PRFM);
10368       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10369     __ bind(LOOP_PRFM_START);
10370       __ prfm(Address(src, SoftwarePrefetchHintDistance));
10371       __ sub(octetCounter, octetCounter, 8);
            // Flags are set here and consumed by the br(GT) after the two helper
            // calls, so the helpers' emitted code must not modify the flags.
10372       __ subs(rscratch1, octetCounter, large_loop_threshold);
10373       inflate_and_store_2_fp_registers(true, v3, v4);
10374       inflate_and_store_2_fp_registers(true, v5, v6);
10375       __ br(__ GT, LOOP_PRFM);
10376       __ cmp(octetCounter, (u1)8);
10377       __ br(__ LT, DONE);
          // Plain loop: runs while at least 8 octets remain.
10378     __ bind(LOOP);
10379       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10380       __ bind(LOOP_START);
10381       __ sub(octetCounter, octetCounter, 8);
10382       __ cmp(octetCounter, (u1)8);
10383       inflate_and_store_2_fp_registers(false, v3, v4);
10384       inflate_and_store_2_fp_registers(false, v5, v6);
10385       __ br(__ GE, LOOP);
10386     __ bind(DONE);
10387       __ ret(lr);
10388
10389     // record the stub entry and end
10390     store_archive_data(stub_id, entry, __ pc());
10391
10392     return entry;
10393   }
10394 
10395   /**
          *  Generates the one-block-per-iteration GHASH stub, or returns the
          *  archived copy when load_archive_data() finds one.
          *
10396    *  Arguments:
10397    *
10398    *  Input:
10399    *  c_rarg0   - current state address
10400    *  c_rarg1   - H key address
10401    *  c_rarg2   - data address
10402    *  c_rarg3   - number of blocks
10403    *
10404    *  Output:
10405    *  Updated state at c_rarg0
          *
          *  Returns the entry address of the stub.
10406    */
10407   address generate_ghash_processBlocks_small() {
10408     // Bafflingly, GCM uses little-endian for the byte order, but
10409     // big-endian for the bit order.  For example, the polynomial 1 is
10410     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
10411     //
10412     // So, we must either reverse the bytes in each word and do
10413     // everything big-endian or reverse the bits in each byte and do
10414     // it little-endian.  On AArch64 it's more idiomatic to reverse
10415     // the bits in each byte (we have an instruction, RBIT, to do
10416     // that) and keep the data in little-endian bit order through the
10417     // calculation, bit-reversing the inputs and outputs.
10418
10419     StubId stub_id = StubId::stubgen_ghash_processBlocks_small_id;
10420     int entry_count = StubInfo::entry_count(stub_id);
10421     assert(entry_count == 1, "sanity check");
10422     address start = load_archive_data(stub_id);
10423     if (start != nullptr) {
10424       return start;
10425     }
10426     __ align(CodeEntryAlignment);
10427     StubCodeMark mark(this, stub_id);
10428     Label polynomial; // local data generated at end of stub
10429     start = __ pc();
10430
10431     Register state   = c_rarg0;
10432     Register subkeyH = c_rarg1;
10433     Register data    = c_rarg2;
10434     Register blocks  = c_rarg3;
10435
10436     FloatRegister vzr = v30;
10437     __ eor(vzr, __ T16B, vzr, vzr); // zero register
10438
10439     __ adr(rscratch1, polynomial);
10440     __ ldrq(v24, rscratch1);    // The field polynomial
10441
10442     __ ldrq(v0, Address(state));
10443     __ ldrq(v1, Address(subkeyH));
10444
10445     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
10446     __ rbit(v0, __ T16B, v0);
10447     __ rev64(v1, __ T16B, v1);
10448     __ rbit(v1, __ T16B, v1);
10449
10450     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
10451     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
10452
10453     {
            // One 16-byte block per iteration.
            // NOTE(review): the counter is decremented and tested after the body,
            // so the body runs at least once; this appears to assume
            // blocks >= 1 on entry - confirm against callers.
10454       Label L_ghash_loop;
10455       __ bind(L_ghash_loop);
10456
10457       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
10458                                                  // reversing each byte
10459       __ rbit(v2, __ T16B, v2);
10460       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
10461
10462       // Multiply state in v2 by subkey in v1
10463       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
10464                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
10465                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
10466       // Reduce v7:v5 by the field polynomial
10467       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
10468
10469       __ sub(blocks, blocks, 1);
10470       __ cbnz(blocks, L_ghash_loop);
10471     }
10472
10473     // The bit-reversed result is at this point in v0
10474     __ rev64(v0, __ T16B, v0);
10475     __ rbit(v0, __ T16B, v0);
10476
10477     __ st1(v0, __ T16B, state);
10478     __ ret(lr);
10479
10480     // bind label and generate local polynomial data
          // Aligned to 16 bytes (wordSize * 2) and read above with a single ldrq.
10481     __ align(wordSize * 2);
10482     __ bind(polynomial);
10483     __ emit_int64(0x87);  // The low-order bits of the field
10484                           // polynomial (i.e. p = z^7+z^2+z+1)
10485                           // repeated in the low and high parts of a
10486                           // 128-bit vector
10487     __ emit_int64(0x87);
10488
10489     // record the stub entry and end
10490     store_archive_data(stub_id, start, __ pc());
10491
10492     return start;
10493   }
10494 
10495   address generate_ghash_processBlocks(address small) {
          // Generates the unrolled GHASH stub, or returns the archived copy when
          // load_archive_data() finds one.
          //   small - entry address of the one-block-at-a-time stub; the emitted
          //           code branches (does not call) to it, so that stub's ret
          //           returns to this stub's caller.
          // Register arguments of the emitted code: c_rarg0 = state, c_rarg1 =
          // subkeyH, c_rarg2 = data, c_rarg3 = blocks.
          // Returns the entry address of the stub.
10496     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
10497     int entry_count = StubInfo::entry_count(stub_id);
10498     assert(entry_count == 1, "sanity check");
10499     address start = load_archive_data(stub_id);
10500     if (start != nullptr) {
10501       return start;
10502     }
10503     Label polynomial;           // local data generated after stub
10504     __ align(CodeEntryAlignment);
10505     StubCodeMark mark(this, stub_id);
10506     start = __ pc();
10507
10508     Register state   = c_rarg0;
10509     Register subkeyH = c_rarg1;
10510     Register data    = c_rarg2;
10511     Register blocks  = c_rarg3;
10512
10513     const int unroll = 4;
10514
          // Fewer than 2 * unroll blocks: hand the whole job to the small stub.
10515     __ cmp(blocks, (unsigned char)(unroll * 2));
10516     __ br(__ LT, small);
10517
10518     if (unroll > 1) {
10519     // Save state before entering routine
            // v8-v15 are stored in two 64-byte stack slots and reloaded in the
            // reverse order after the wide routine.
10520       __ sub(sp, sp, 4 * 16);
10521       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
10522       __ sub(sp, sp, 4 * 16);
10523       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
10524     }
10525
10526     __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
10527
10528     if (unroll > 1) {
10529       // And restore state
10530       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
10531       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
10532     }
10533
          // Any blocks left in 'blocks' after the wide routine are finished by the
          // small stub.
10534     __ cmp(blocks, (unsigned char)0);
10535     __ br(__ GT, small);
10536
10537     __ ret(lr);
10538
10539     // bind label and generate polynomial data
10540     __ align(wordSize * 2);
10541     __ bind(polynomial);
10542     __ emit_int64(0x87);  // The low-order bits of the field
10543                           // polynomial (i.e. p = z^7+z^2+z+1)
10544                           // repeated in the low and high parts of a
10545                           // 128-bit vector
10546     __ emit_int64(0x87);
10547
10548     // record the stub entry and end
10549     store_archive_data(stub_id, start, __ pc());
10550
10551     return start;
10552   }
10553 
  // Emits one SIMD round of the Base64 encoder.
  //
  // Loads 3 * size source bytes from src (post-incrementing it), turns every
  // 3-byte group into four 6-bit indices, translates the indices with tbl
  // through the table registers starting at codec, and stores the resulting
  // 4 * size bytes to dst (post-incrementing it).
  //
  //   src   - source pointer, advanced by 3 * size
  //   dst   - destination pointer, advanced by 4 * size
  //   codec - first table register handed to tbl (4 registers are used)
  //   size  - 16 selects the T16B arrangement; any other value selects T8B
  //
  // Uses v4-v6 and v16-v23 as temporaries.
  void generate_base64_encode_simdround(Register src, Register dst,
        FloatRegister codec, u8 size) {

    FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
    FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
    FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    // De-interleaving load: byte 0/1/2 of each 3-byte group lands in in0/in1/in2.
    __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));

    // ind0 = top 6 bits of byte 0
    __ ushr(ind0, arrangement, in0,  2);

    // ind1 = low 2 bits of byte 0 followed by top 4 bits of byte 1
    // (in0 is overwritten here; ind0 has already been derived from it)
    __ ushr(ind1, arrangement, in1,  2);
    __ shl(in0,   arrangement, in0,  6);
    __ orr(ind1,  arrangement, ind1, in0);
    __ ushr(ind1, arrangement, ind1, 2);

    // ind2 = low 4 bits of byte 1 followed by top 2 bits of byte 2
    // (in1 is overwritten here; ind1 has already been derived from it)
    __ ushr(ind2, arrangement, in2,  4);
    __ shl(in1,   arrangement, in1,  4);
    __ orr(ind2,  arrangement, in1,  ind2);
    __ ushr(ind2, arrangement, ind2, 2);

    // ind3 = low 6 bits of byte 2 (shift left then right clears the top 2 bits)
    __ shl(ind3,  arrangement, in2,  2);
    __ ushr(ind3, arrangement, ind3, 2);

    // Translate each 6-bit index through the lookup table.
    __ tbl(out0,  arrangement, codec,  4, ind0);
    __ tbl(out1,  arrangement, codec,  4, ind1);
    __ tbl(out2,  arrangement, codec,  4, ind2);
    __ tbl(out3,  arrangement, codec,  4, ind3);

    // Interleaving store: out0..out3 supply output bytes 0..3 of each group.
    __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
  }
10587 
10588    /**
10589    *  Arguments:
10590    *
10591    *  Input:
10592    *  c_rarg0   - src_start
10593    *  c_rarg1   - src_offset
10594    *  c_rarg2   - src_length
10595    *  c_rarg3   - dest_start
10596    *  c_rarg4   - dest_offset
10597    *  c_rarg5   - isURL
10598    *
10599    */
10600   address generate_base64_encodeBlock() {
10601 
10602     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
10603     int entry_count = StubInfo::entry_count(stub_id);
10604     assert(entry_count == 1, "sanity check");
10605     address start = load_archive_data(stub_id);
10606     if (start != nullptr) {
10607       return start;
10608     }
10609     __ align(CodeEntryAlignment);
10610     StubCodeMark mark(this, stub_id);
10611     start = __ pc();
10612 
10613     Register src   = c_rarg0;  // source array
10614     Register soff  = c_rarg1;  // source start offset
10615     Register send  = c_rarg2;  // source end offset
10616     Register dst   = c_rarg3;  // dest array
10617     Register doff  = c_rarg4;  // position for writing to dest array
10618     Register isURL = c_rarg5;  // Base64 or URL character set
10619 
10620     // c_rarg6 and c_rarg7 are free to use as temps
10621     Register codec  = c_rarg6;
10622     Register length = c_rarg7;
10623 
10624     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
10625 
10626     __ add(src, src, soff);
10627     __ add(dst, dst, doff);
10628     __ sub(length, send, soff);
10629 
10630     // load the codec base address
10631     __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64));
10632     __ cbz(isURL, ProcessData);
10633     __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64URL));
10634 
10635     __ BIND(ProcessData);
10636 
10637     // too short to formup a SIMD loop, roll back
10638     __ cmp(length, (u1)24);
10639     __ br(Assembler::LT, Process3B);
10640 
10641     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
10642 
10643     __ BIND(Process48B);
10644     __ cmp(length, (u1)48);
10645     __ br(Assembler::LT, Process24B);
10646     generate_base64_encode_simdround(src, dst, v0, 16);
10647     __ sub(length, length, 48);
10648     __ b(Process48B);
10649 
10650     __ BIND(Process24B);
10651     __ cmp(length, (u1)24);
10652     __ br(Assembler::LT, SIMDExit);
10653     generate_base64_encode_simdround(src, dst, v0, 8);
10654     __ sub(length, length, 24);
10655 
10656     __ BIND(SIMDExit);
10657     __ cbz(length, Exit);
10658 
10659     __ BIND(Process3B);
10660     //  3 src bytes, 24 bits
10661     __ ldrb(r10, __ post(src, 1));
10662     __ ldrb(r11, __ post(src, 1));
10663     __ ldrb(r12, __ post(src, 1));
10664     __ orrw(r11, r11, r10, Assembler::LSL, 8);
10665     __ orrw(r12, r12, r11, Assembler::LSL, 8);
10666     // codec index
10667     __ ubfmw(r15, r12, 18, 23);
10668     __ ubfmw(r14, r12, 12, 17);
10669     __ ubfmw(r13, r12, 6,  11);
10670     __ andw(r12,  r12, 63);
10671     // get the code based on the codec
10672     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
10673     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
10674     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
10675     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
10676     __ strb(r15, __ post(dst, 1));
10677     __ strb(r14, __ post(dst, 1));
10678     __ strb(r13, __ post(dst, 1));
10679     __ strb(r12, __ post(dst, 1));
10680     __ sub(length, length, 3);
10681     __ cbnz(length, Process3B);
10682 
10683     __ BIND(Exit);
10684     __ ret(lr);
10685 
10686     // record the stub entry and end
10687     store_archive_data(stub_id, start, __ pc());
10688 
10689     return start;
10690   }
10691 
  // Emits one SIMD round of the Base64 decoder.
  //
  // Loads 4 * size encoded bytes from src (post-incrementing it), decodes
  // them through the two lookup tables and, when every input byte is legal,
  // stores 3 * size decoded bytes to dst (post-incrementing it).
  // When an illegal byte is found, only the output of the 4-byte groups
  // preceding the first illegal group is stored and control branches to Exit.
  //
  //   src    - source pointer, advanced by 4 * size
  //   dst    - destination pointer, advanced by the number of bytes stored
  //   codecL - first table register for the lower-half lookup (tbl, 4 registers)
  //   codecH - first table register for the higher-half lookup (tbx, 4 registers)
  //   size   - 16 selects the T16B arrangement; any other value selects T8B
  //   Exit   - label branched to when illegal input is detected
  //
  // v27 is read as a constant; generate_base64_decodeBlock fills every byte
  // lane of it with 63 before calling this helper.
  // Uses v16-v26, v28-v31, r10-r13 and rscratch2 as temporaries.
  void generate_base64_decode_simdround(Register src, Register dst,
        FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {

    FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
    FloatRegister out0 = v20, out1 = v21, out2 = v22;

    FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
    FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;

    Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    // De-interleaving load: byte 0..3 of each 4-byte group lands in in0..in3.
    __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));

    // we need unsigned saturating subtract, to make sure all input values
    // in range [0, 63] will have 0U value in the higher half lookup
    __ uqsubv(decH0, __ T16B, in0, v27);
    __ uqsubv(decH1, __ T16B, in1, v27);
    __ uqsubv(decH2, __ T16B, in2, v27);
    __ uqsubv(decH3, __ T16B, in3, v27);

    // lower half lookup
    __ tbl(decL0, arrangement, codecL, 4, in0);
    __ tbl(decL1, arrangement, codecL, 4, in1);
    __ tbl(decL2, arrangement, codecL, 4, in2);
    __ tbl(decL3, arrangement, codecL, 4, in3);

    // higher half lookup
    __ tbx(decH0, arrangement, codecH, 4, decH0);
    __ tbx(decH1, arrangement, codecH, 4, decH1);
    __ tbx(decH2, arrangement, codecH, 4, decH2);
    __ tbx(decH3, arrangement, codecH, 4, decH3);

    // combine lower and higher
    __ orr(decL0, arrangement, decL0, decH0);
    __ orr(decL1, arrangement, decL1, decH1);
    __ orr(decL2, arrangement, decL2, decH2);
    __ orr(decL3, arrangement, decL3, decH3);

    // check illegal inputs, value larger than 63 (maximum of 6 bits)
    // decH0..decH3 are reused as per-lane masks, in0..in3 as scratch:
    // in2 ends up holding the OR of the four masks, and rscratch2 is
    // non-zero iff any lane of in2 is set.
    __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
    __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
    __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
    __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
    __ orr(in0, arrangement, decH0, decH1);
    __ orr(in1, arrangement, decH2, decH3);
    __ orr(in2, arrangement, in0,   in1);
    __ umaxv(in3, arrangement, in2);
    __ umov(rscratch2, in3, __ B, 0);

    // get the data to output
    //   out0 = decL0 << 2 | decL1 >> 4
    //   out1 = decL1 << 4 | decL2 >> 2
    //   out2 = decL2 << 6 | decL3
    __ shl(out0,  arrangement, decL0, 2);
    __ ushr(out1, arrangement, decL1, 4);
    __ orr(out0,  arrangement, out0,  out1);
    __ shl(out1,  arrangement, decL1, 4);
    __ ushr(out2, arrangement, decL2, 2);
    __ orr(out1,  arrangement, out1,  out2);
    __ shl(out2,  arrangement, decL2, 6);
    __ orr(out2,  arrangement, out2,  decL3);

    __ cbz(rscratch2, NoIllegalData);

    // handle illegal input
    // r10 = mask bytes of the lower 8 lanes (one byte per 4-byte input group)
    __ umov(r10, in2, __ D, 0);
    if (size == 16) {
      __ cbnz(r10, ErrorInLowerHalf);

      // illegal input is in higher half, store the lower half now.
      __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));

      // continue with the mask and output bytes of the upper 8 lanes
      __ umov(r10, in2,  __ D, 1);
      __ umov(r11, out0, __ D, 1);
      __ umov(r12, out1, __ D, 1);
      __ umov(r13, out2, __ D, 1);
      __ b(StoreLegalData);

      __ BIND(ErrorInLowerHalf);
    }
    __ umov(r11, out0, __ D, 0);
    __ umov(r12, out1, __ D, 0);
    __ umov(r13, out2, __ D, 0);

    // Store the output of one group per iteration (lowest byte of r11/r12/r13),
    // then shift mask and data down by one byte. The loop is left through Exit
    // as soon as the lowest mask byte flags an illegal group.
    __ BIND(StoreLegalData);
    __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
    __ strb(r11, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ lsr(r10, r10, 8);
    __ lsr(r11, r11, 8);
    __ lsr(r12, r12, 8);
    __ lsr(r13, r13, 8);
    __ b(StoreLegalData);

    // All input legal: interleaving store of the full round.
    __ BIND(NoIllegalData);
    __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
  }
10789 
10790 
   /**
   *  Base64 decodeBlock stub.
   *
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - src_start
   *  c_rarg1   - src_offset
   *  c_rarg2   - src_end      (source end offset; the number of bytes to
   *                            process is src_end - src_offset, rounded
   *                            down to a multiple of 4)
   *  c_rarg3   - dest_start
   *  c_rarg4   - dest_offset
   *  c_rarg5   - isURL
   *  c_rarg6   - isMIME       (not read; the register is reused as a temp)
   *
   *  Output:
   *  c_rarg0   - number of bytes written to the destination
   *
   *  NOTE(review): the scalar loop decodes one 4-byte group before it tests
   *  its counter, so the stub presumably expects at least 4 source bytes -
   *  confirm against the caller.
   */
  address generate_base64_decodeBlock() {

    // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
    // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
    // titled "Base64 decoding".

    StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
    int entry_count = StubInfo::entry_count(stub_id);
    assert(entry_count == 1, "sanity check");
    address start = load_archive_data(stub_id);
    if (start != nullptr) {
      return start;
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();

    Register src    = c_rarg0;  // source array
    Register soff   = c_rarg1;  // source start offset
    Register send   = c_rarg2;  // source end offset
    Register dst    = c_rarg3;  // dest array
    Register doff   = c_rarg4;  // position for writing to dest array
    Register isURL  = c_rarg5;  // Base64 or URL character set
    Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation

    Register length = send;    // reuse send as length of source data to process

    // simd_codec shares c_rarg6 with the unused isMIME argument
    Register simd_codec   = c_rarg6;
    Register nosimd_codec = c_rarg7;

    Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;

    __ enter();

    __ add(src, src, soff);
    __ add(dst, dst, doff);

    // doff now remembers the initial write position so that the number of
    // bytes written can be computed at Exit
    __ mov(doff, dst);

    __ sub(length, send, soff);
    // clear the two low bits: round length down to a multiple of 4
    __ bfm(length, zr, 0, 1);

    __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForNoSIMD));
    __ cbz(isURL, ProcessData);
    __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForNoSIMD));

    __ BIND(ProcessData);
    // rscratch1 is the byte counter of the non-SIMD loop below
    __ mov(rscratch1, length);
    __ cmp(length, (u1)144); // 144 = 80 + 64
    __ br(Assembler::LT, Process4B);

    // In the MIME case, the line length cannot be more than 76
    // bytes (see RFC 2045). This is too short a block for SIMD
    // to be worthwhile, so we use non-SIMD here.
    // Presetting the counter to 79 makes the loop below decode exactly
    // 80 bytes (20 groups) and leave the counter at -1.
    __ movw(rscratch1, 79);

    __ BIND(Process4B);
    // load 4 encoded bytes and split them into r10..r13
    __ ldrw(r14, __ post(src, 4));
    __ ubfxw(r10, r14, 0,  8);
    __ ubfxw(r11, r14, 8,  8);
    __ ubfxw(r12, r14, 16, 8);
    __ ubfxw(r13, r14, 24, 8);
    // get the de-code
    __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
    __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
    __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
    __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
    // error detection, 255u indicates an illegal input
    __ orrw(r14, r10, r11);
    __ orrw(r15, r12, r13);
    __ orrw(r14, r14, r15);
    __ tbnz(r14, 7, Exit);
    // recover the data
    // r14[15:0] = r10(6 bits) : r11(6 bits) : r12[5:2]; rev16w swaps the two
    // bytes so that strh writes them in output order.
    // r13 = r12[1:0] : r13(6 bits) is the third output byte.
    __ lslw(r14, r10, 10);
    __ bfiw(r14, r11, 4, 6);
    __ bfmw(r14, r12, 2, 5);
    __ rev16w(r14, r14);
    __ bfiw(r13, r12, 6, 2);
    __ strh(r14, __ post(dst, 2));
    __ strb(r13, __ post(dst, 1));
    // non-simd loop
    __ subsw(rscratch1, rscratch1, 4);
    __ br(Assembler::GT, Process4B);

    // if the loop ran as the 80-byte pre-processing pass (counter preset
    // to 79 above), rscratch1 == -1 and the SIMD part follows;
    // otherwise, rscratch1 == 0 and all data has been processed.
    __ cbzw(rscratch1, Exit);
    __ sub(length, length, 80);

    __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForSIMD));
    __ cbz(isURL, SIMDEnter);
    __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForSIMD));

    __ BIND(SIMDEnter);
    // v0..v3: first 64 bytes of the SIMD table, v4..v7: the following 64 bytes
    __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
    __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
    // v27: constant 63 in every byte lane, used by the SIMD rounds
    __ mov(rscratch1, 63);
    __ dup(v27, __ T16B, rscratch1);

    // 64 source bytes per round
    __ BIND(Process64B);
    __ cmp(length, (u1)64);
    __ br(Assembler::LT, Process32B);
    generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
    __ sub(length, length, 64);
    __ b(Process64B);

    // 32 source bytes per round
    __ BIND(Process32B);
    __ cmp(length, (u1)32);
    __ br(Assembler::LT, SIMDExit);
    generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
    __ sub(length, length, 32);
    __ b(Process32B);

    __ BIND(SIMDExit);
    __ cbz(length, Exit);
    // fewer than 32 bytes left: finish them in the non-SIMD loop, which then
    // ends with rscratch1 == 0 and exits
    __ movw(rscratch1, length);
    __ b(Process4B);

    __ BIND(Exit);
    // return value: bytes written = current dst - initial dst (saved in doff)
    __ sub(c_rarg0, dst, doff);

    __ leave();
    __ ret(lr);

    // record the stub entry and end
    store_archive_data(stub_id, start, __ pc());

    return start;
  }
10933 
10934   // Support for spin waits.
10935   address generate_spin_wait() {
10936     StubId stub_id = StubId::stubgen_spin_wait_id;
10937     int entry_count = StubInfo::entry_count(stub_id);
10938     assert(entry_count == 1, "sanity check");
10939     address start = load_archive_data(stub_id);
10940     if (start != nullptr) {
10941       return start;
10942     }
10943     __ align(CodeEntryAlignment);
10944     StubCodeMark mark(this, stub_id);
10945     start = __ pc();
10946 
10947     __ spin_wait();
10948     __ ret(lr);
10949 
10950     // record the stub entry and end
10951     store_archive_data(stub_id, start, __ pc());
10952 
10953     return start;
10954   }
10955 
10956   void generate_lookup_secondary_supers_table_stub() {
10957     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10958     GrowableArray<address> entries;
10959     int entry_count = StubInfo::entry_count(stub_id);
10960     assert(entry_count == Klass::SECONDARY_SUPERS_TABLE_SIZE, "sanity check");
10961     address start = load_archive_data(stub_id, &entries);
10962     if (start != nullptr) {
10963       assert(entries.length() == Klass::SECONDARY_SUPERS_TABLE_SIZE - 1,
10964              "unexpected extra entry count %d", entries.length());
10965       StubRoutines::_lookup_secondary_supers_table_stubs[0] = start;
10966       for (int slot = 1; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10967         StubRoutines::_lookup_secondary_supers_table_stubs[slot] = entries.at(slot - 1);
10968       }
10969       return;
10970     }
10971 
10972     StubCodeMark mark(this, stub_id);
10973 
10974     const Register
10975       r_super_klass  = r0,
10976       r_array_base   = r1,
10977       r_array_length = r2,
10978       r_array_index  = r3,
10979       r_sub_klass    = r4,
10980       r_bitmap       = rscratch2,
10981       result         = r5;
10982     const FloatRegister
10983       vtemp          = v0;
10984 
10985     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10986       address next_entry = __ pc();
10987       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = next_entry;
10988       if (slot == 0) {
10989         start = next_entry;
10990       } else {
10991         entries.append(next_entry);
10992       }
10993       Label L_success;
10994       __ enter();
10995       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10996                                              r_array_base, r_array_length, r_array_index,
10997                                              vtemp, result, slot,
10998                                              /*stub_is_near*/true);
10999       __ leave();
11000       __ ret(lr);
11001     }
11002     // record the stub entry and end plus all the auxiliary entries
11003     store_archive_data(stub_id, start, __ pc(), &entries);
11004   }
11005 
11006   // Slow path implementation for UseSecondarySupersTable.
11007   address generate_lookup_secondary_supers_table_slow_path_stub() {
11008     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
11009     int entry_count = StubInfo::entry_count(stub_id);
11010     assert(entry_count == 1, "sanity check");
11011     address start = load_archive_data(stub_id);
11012     if (start != nullptr) {
11013       return start;
11014     }
11015     StubCodeMark mark(this, stub_id);
11016     start = __ pc();
11017     const Register
11018       r_super_klass  = r0,        // argument
11019       r_array_base   = r1,        // argument
11020       temp1          = r2,        // temp
11021       r_array_index  = r3,        // argument
11022       r_bitmap       = rscratch2, // argument
11023       result         = r5;        // argument
11024 
11025     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
11026     __ ret(lr);
11027 
11028     // record the stub entry and end
11029     store_archive_data(stub_id, start, __ pc());
11030 
11031     return start;
11032   }
11033 
11034 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11035 
11036   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
11037   //
11038   // If LSE is in use, generate LSE versions of all the stubs. The
11039   // non-LSE versions are in atomic_aarch64.S.
11040 
11041   // class AtomicStubMark records the entry point of a stub and the
11042   // stub pointer which will point to it. The stub pointer is set to
11043   // the entry point when ~AtomicStubMark() is called, which must be
11044   // after ICache::invalidate_range. This ensures safe publication of
11045   // the generated code.
11046   class AtomicStubMark {
11047     address _entry_point;
11048     aarch64_atomic_stub_t *_stub;
11049     MacroAssembler *_masm;
11050   public:
11051     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
11052       _masm = masm;
11053       __ align(32);
11054       _entry_point = __ pc();
11055       _stub = stub;
11056     }
11057     ~AtomicStubMark() {
11058       *_stub = (aarch64_atomic_stub_t)_entry_point;
11059     }
11060   };
11061 
11062   // NB: For memory_order_conservative we need a trailing membar after
11063   // LSE atomic operations but not a leading membar.
11064   //
11065   // We don't need a leading membar because a clause in the Arm ARM
11066   // says:
11067   //
11068   //   Barrier-ordered-before
11069   //
11070   //   Barrier instructions order prior Memory effects before subsequent
11071   //   Memory effects generated by the same Observer. A read or a write
11072   //   RW1 is Barrier-ordered-before a read or a write RW 2 from the same
11073   //   Observer if and only if RW1 appears in program order before RW 2
11074   //   and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
11075   //   instruction with both Acquire and Release semantics.
11076   //
11077   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
11078   // and Release semantics, therefore we don't need a leading
11079   // barrier. However, there is no corresponding Barrier-ordered-after
11080   // relationship, therefore we need a trailing membar to prevent a
11081   // later store or load from being reordered with the store in an
11082   // atomic instruction.
11083   //
11084   // This was checked by using the herd7 consistency model simulator
11085   // (http://diy.inria.fr/) with this test case:
11086   //
11087   // AArch64 LseCas
11088   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
11089   // P0 | P1;
11090   // LDR W4, [X2] | MOV W3, #0;
11091   // DMB LD       | MOV W4, #1;
11092   // LDR W3, [X1] | CASAL W3, W4, [X1];
11093   //              | DMB ISH;
11094   //              | STR W4, [X2];
11095   // exists
11096   // (0:X3=0 /\ 0:X4=1)
11097   //
11098   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
11099   // with the store to x in P1. Without the DMB in P1 this may happen.
11100   //
11101   // At the time of writing we don't know of any AArch64 hardware that
11102   // reorders stores in this way, but the Reference Manual permits it.
11103 
11104   void gen_cas_entry(Assembler::operand_size size,
11105                      atomic_memory_order order) {
11106     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
11107       exchange_val = c_rarg2;
11108     bool acquire, release;
11109     switch (order) {
11110       case memory_order_relaxed:
11111         acquire = false;
11112         release = false;
11113         break;
11114       case memory_order_release:
11115         acquire = false;
11116         release = true;
11117         break;
11118       default:
11119         acquire = true;
11120         release = true;
11121         break;
11122     }
11123     __ mov(prev, compare_val);
11124     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
11125     if (order == memory_order_conservative) {
11126       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11127     }
11128     if (size == Assembler::xword) {
11129       __ mov(r0, prev);
11130     } else {
11131       __ movw(r0, prev);
11132     }
11133     __ ret(lr);
11134   }
11135 
11136   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
11137     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11138     // If not relaxed, then default to conservative.  Relaxed is the only
11139     // case we use enough to be worth specializing.
11140     if (order == memory_order_relaxed) {
11141       __ ldadd(size, incr, prev, addr);
11142     } else {
11143       __ ldaddal(size, incr, prev, addr);
11144       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11145     }
11146     if (size == Assembler::xword) {
11147       __ mov(r0, prev);
11148     } else {
11149       __ movw(r0, prev);
11150     }
11151     __ ret(lr);
11152   }
11153 
11154   void gen_swpal_entry(Assembler::operand_size size) {
11155     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11156     __ swpal(size, incr, prev, addr);
11157     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11158     if (size == Assembler::xword) {
11159       __ mov(r0, prev);
11160     } else {
11161       __ movw(r0, prev);
11162     }
11163     __ ret(lr);
11164   }
11165 
11166   void generate_atomic_entry_points() {
11167     if (! UseLSE) {
11168       return;
11169     }
11170     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
11171     GrowableArray<address> entries;
11172     int entry_count = StubInfo::entry_count(stub_id);
11173     address start = load_archive_data(stub_id, &entries);
11174     if (start != nullptr) {
11175       assert(entries.length() == entry_count - 1,
11176              "unexpected extra entry count %d", entries.length());
11177       aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)start;
11178       int idx = 0;
11179       aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11180       aarch64_atomic_fetch_add_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11181       aarch64_atomic_fetch_add_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11182       aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11183       aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11184       aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11185       aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11186       aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11187       aarch64_atomic_cmpxchg_1_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11188       aarch64_atomic_cmpxchg_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11189       aarch64_atomic_cmpxchg_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11190       aarch64_atomic_cmpxchg_4_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11191       aarch64_atomic_cmpxchg_8_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11192       aarch64_atomic_cmpxchg_4_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11193       aarch64_atomic_cmpxchg_8_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11194       assert(idx == entries.length(), "sanity!");
11195       return;
11196     }
11197 
11198     __ align(CodeEntryAlignment);
11199     StubCodeMark mark(this, stub_id);
11200     start = __ pc();
11201     address end;
11202     {
11203     // ADD, memory_order_conservative
11204     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
11205     gen_ldadd_entry(Assembler::word, memory_order_conservative);
11206 
11207     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
11208     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
11209 
11210     // ADD, memory_order_relaxed
11211     AtomicStubMark mark_fetch_add_4_relaxed
11212       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
11213     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
11214 
11215     AtomicStubMark mark_fetch_add_8_relaxed
11216       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
11217     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
11218 
11219     // XCHG, memory_order_conservative
11220     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
11221     gen_swpal_entry(Assembler::word);
11222 
11223     AtomicStubMark mark_xchg_8(_masm, &aarch64_atomic_xchg_8_impl);
11224     gen_swpal_entry(Assembler::xword);
11225 
11226     // CAS, memory_order_conservative
11227     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
11228     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
11229 
11230     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
11231     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
11232 
11233     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
11234     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
11235 
11236     // CAS, memory_order_relaxed
11237     AtomicStubMark mark_cmpxchg_1_relaxed
11238       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
11239     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
11240 
11241     AtomicStubMark mark_cmpxchg_4_relaxed
11242       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
11243     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
11244 
11245     AtomicStubMark mark_cmpxchg_8_relaxed
11246       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
11247     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
11248 
11249     AtomicStubMark mark_cmpxchg_4_release
11250       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
11251     gen_cas_entry(MacroAssembler::word, memory_order_release);
11252 
11253     AtomicStubMark mark_cmpxchg_8_release
11254       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
11255     gen_cas_entry(MacroAssembler::xword, memory_order_release);
11256 
11257     AtomicStubMark mark_cmpxchg_4_seq_cst
11258       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
11259     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
11260 
11261     AtomicStubMark mark_cmpxchg_8_seq_cst
11262       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
11263     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
11264 
11265     end = __ pc();
11266 
11267     ICache::invalidate_range(start, end - start);
11268     // exit block to force update of AtomicStubMark targets
11269     }
11270 
11271     assert(start == (address)aarch64_atomic_fetch_add_4_impl,
11272            "atomic stub should be at start of buffer");
11273     // record the stub start and end plus all the entries saved by the
11274     // AtomicStubMark destructor
11275     entries.append((address)aarch64_atomic_fetch_add_8_impl);
11276     entries.append((address)aarch64_atomic_fetch_add_4_relaxed_impl);
11277     entries.append((address)aarch64_atomic_fetch_add_8_relaxed_impl);
11278     entries.append((address)aarch64_atomic_xchg_4_impl);
11279     entries.append((address)aarch64_atomic_xchg_8_impl);
11280     entries.append((address)aarch64_atomic_cmpxchg_1_impl);
11281     entries.append((address)aarch64_atomic_cmpxchg_4_impl);
11282     entries.append((address)aarch64_atomic_cmpxchg_8_impl);
11283     entries.append((address)aarch64_atomic_cmpxchg_1_relaxed_impl);
11284     entries.append((address)aarch64_atomic_cmpxchg_4_relaxed_impl);
11285     entries.append((address)aarch64_atomic_cmpxchg_8_relaxed_impl);
11286     entries.append((address)aarch64_atomic_cmpxchg_4_release_impl);
11287     entries.append((address)aarch64_atomic_cmpxchg_8_release_impl);
11288     entries.append((address)aarch64_atomic_cmpxchg_4_seq_cst_impl);
11289     entries.append((address)aarch64_atomic_cmpxchg_8_seq_cst_impl);
11290 
11291     assert(entries.length() == entry_count - 1,
11292            "unexpected extra entry count %d", entries.length());
11293 
11294     store_archive_data(stub_id, start, end, &entries);
11295   }
11296 #endif // LINUX
11297 
11298   address generate_cont_thaw(Continuation::thaw_kind kind) {
          // Emits the thaw sequence shared by the continuation stubs in this
          // file and returns the pc of its first instruction.  `kind` selects
          // the variant:
          //  - the return-barrier variants first move sp to the value stored at
          //    JavaThread::cont_entry_offset() and preserve r0/v0 across both
          //    runtime calls;
          //  - the return-barrier-exception variant ends by jumping to the
          //    handler obtained from
          //    SharedRuntime::exception_handler_for_return_address instead of
          //    returning into the thawed frame.
          // This helper creates no StubCodeMark and stores no archive data; the
          // generate_cont_* wrappers in this file do that around the call.
11299     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
11300     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
11301 
11302     address start = __ pc();
11303 
11304     if (return_barrier) {
            // Point sp at the continuation entry recorded in the thread.
11305       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
11306       __ mov(sp, rscratch1);
11307     }
11308     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
11309 
11310     if (return_barrier) {
11311       // preserve possible return value from a method returning to the return barrier
            // (v0 is moved through rscratch1 so both values go out in one stp)
11312       __ fmovd(rscratch1, v0);
11313       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
11314     }
11315 
          // First runtime call: Continuation::prepare_thaw(thread, return_barrier).
11316     __ movw(c_rarg1, (return_barrier ? 1 : 0));
11317     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
11318     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
11319 
11320     if (return_barrier) {
11321       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
11322       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
11323       __ fmovd(v0, rscratch1);
11324     }
11325     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
11326 
11327 
11328     Label thaw_success;
11329     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
          // On 0, jump (not call) to the StackOverflowError thrower.
11330     __ cbnz(rscratch2, thaw_success);
11331     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
11332     __ br(rscratch1);
11333     __ bind(thaw_success);
11334 
11335     // make room for the thawed frames
11336     __ sub(rscratch1, sp, rscratch2);
11337     __ andr(rscratch1, rscratch1, -16); // align
11338     __ mov(sp, rscratch1);
11339 
11340     if (return_barrier) {
11341       // save original return value -- again
11342       __ fmovd(rscratch1, v0);
11343       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
11344     }
11345 
11346     // If we want, we can templatize thaw by kind, and have three different entries
11347     __ movw(c_rarg1, (uint32_t)kind);
11348 
          // Second runtime call: the thaw entry itself, passed (thread, kind).
11349     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
11350     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
11351 
11352     if (return_barrier) {
11353       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
11354       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
11355       __ fmovd(v0, rscratch1);
11356     } else {
11357       __ mov(r0, zr); // return 0 (success) from doYield
11358     }
11359 
11360     // we're now on the yield frame (which is in an address above us b/c sp has been pushed down)
11361     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
11362     __ mov(rfp, sp);
11363 
11364     if (return_barrier_exception) {
            // The word above the rfp spill is the return address; it is the
            // lookup key for the exception handler.
11365       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
11366       __ authenticate_return_address(c_rarg1);
11367       __ verify_oop(r0);
11368       // save return value containing the exception oop in callee-saved R19
11369       __ mov(r19, r0);
11370 
11371       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
11372 
11373       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
11374       // __ reinitialize_ptrue();
11375 
11376       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
11377 
11378       __ mov(r1, r0); // the exception handler
11379       __ mov(r0, r19); // restore return value containing the exception oop
11380       __ verify_oop(r0);
11381 
11382       __ leave();
            // lr is copied to r3 after leave(), i.e. as the exception pc.
11383       __ mov(r3, lr);
11384       __ br(r1); // the exception handler
11385     } else {
11386       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
11387       __ leave();
11388       __ ret(lr);
11389     }
11390 
11391     return start;
11392   }
11393 
11394   address generate_cont_thaw() {
          // Stub wrapper for the Continuation::thaw_top variant.
          // Returns nullptr when continuations are disabled, the address given
          // by load_archive_data() when that is non-null, and otherwise the
          // start of freshly emitted code.
11395     if (!Continuations::enabled()) return nullptr;
11396 
11397     StubId stub_id = StubId::stubgen_cont_thaw_id;
11398     int entry_count = StubInfo::entry_count(stub_id);
11399     assert(entry_count == 1, "sanity check");
11400     address start = load_archive_data(stub_id);
11401     if (start != nullptr) {
11402       return start;
11403     }
11404     StubCodeMark mark(this, stub_id);
11405     start = __ pc();
          // The helper's return value is not needed: `start` was captured just
          // above, before any code was emitted.
11406     generate_cont_thaw(Continuation::thaw_top);
11407 
11408     // record the stub start and end
11409     store_archive_data(stub_id, start, __ pc());
11410 
11411     return start;
11412   }
11413 
11414   address generate_cont_returnBarrier() {
          // Stub wrapper for the Continuation::thaw_return_barrier variant.
          // Returns nullptr when continuations are disabled, the address given
          // by load_archive_data() when that is non-null, and otherwise the
          // start of freshly emitted code.
11415     if (!Continuations::enabled()) return nullptr;
11416 
11417     // TODO: will probably need multiple return barriers depending on return type
11418     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
11419     int entry_count = StubInfo::entry_count(stub_id);
11420     assert(entry_count == 1, "sanity check");
11421     address start = load_archive_data(stub_id);
11422     if (start != nullptr) {
11423       return start;
11424     }
11425     StubCodeMark mark(this, stub_id);
11426     start = __ pc();
11427 
11428     generate_cont_thaw(Continuation::thaw_return_barrier);
11429 
11430     // record the stub start and end
11431     store_archive_data(stub_id, start, __ pc());
11432 
11433     return start;
11434   }
11435 
11436   address generate_cont_returnBarrier_exception() {
          // Stub wrapper for the Continuation::thaw_return_barrier_exception
          // variant.  Returns nullptr when continuations are disabled, the
          // address given by load_archive_data() when that is non-null, and
          // otherwise the start of freshly emitted code.
11437     if (!Continuations::enabled()) return nullptr;
11438 
11439     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
11440     int entry_count = StubInfo::entry_count(stub_id);
11441     assert(entry_count == 1, "sanity check");
11442     address start = load_archive_data(stub_id);
11443     if (start != nullptr) {
11444       return start;
11445     }
11446     StubCodeMark mark(this, stub_id);
11447     start = __ pc();
11448 
11449     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
11450 
11451     // record the stub start and end
11452     store_archive_data(stub_id, start, __ pc());
11453 
11454     return start;
11455   }
11456 
11457   address generate_cont_preempt_stub() {
          // Generates the continuation preemption stub.  Returns nullptr when
          // continuations are disabled and the address given by
          // load_archive_data() when that is non-null.  The emitted code resets
          // sp to the continuation entry and then either unwinds the
          // enterSpecial frame and returns, or -- when the thread's
          // preemption_cancelled byte is set -- clears that byte and jumps to
          // the address stored at ContinuationEntry::thaw_call_pc_address().
11458     if (!Continuations::enabled()) return nullptr;
11459     StubId stub_id = StubId::stubgen_cont_preempt_id;
11460     int entry_count = StubInfo::entry_count(stub_id);
11461     assert(entry_count == 1, "sanity check");
11462     address start = load_archive_data(stub_id);
11463     if (start != nullptr) {
11464       return start;
11465     }
11466     StubCodeMark mark(this, stub_id);
11467     start = __ pc();
11468 
11469     __ reset_last_Java_frame(true);
11470 
11471     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
11472     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
11473     __ mov(sp, rscratch2);
11474 
11475     Label preemption_cancelled;
11476     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
11477     __ cbnz(rscratch1, preemption_cancelled);
11478 
11479     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
11480     SharedRuntime::continuation_enter_cleanup(_masm);
11481     __ leave();
11482     __ ret(lr);
11483 
11484     // We acquired the monitor after freezing the frames so call thaw to continue execution.
11485     __ bind(preemption_cancelled);
          // Clear the flag, set rfp to sp + ContinuationEntry::size(), then jump
          // through the pointer loaded from thaw_call_pc_address().
11486     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
11487     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
11488     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
11489     __ ldr(rscratch1, Address(rscratch1));
11490     __ br(rscratch1);
11491 
11492     // record the stub start and end
11493     store_archive_data(stub_id, start, __ pc());
11494 
11495     return start;
11496   }
11497 
11498   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
11499   // are represented as long[5], with BITS_PER_LIMB = 26.
11500   // Pack five 26-bit limbs into three 64-bit registers.
        //
        // src points at the five limbs (loaded as 64-bit words at offsets
        // 0..4 * sizeof(jlong)).  Result layout, with l0..l4 the limbs:
        //   dest0 = l0 + (l1 << 26) + (l2 << 52)          (low 12 bits of l2)
        //   dest1 = (l2 >> 12) + (l3 << 14) + (l4 << 40)  (low 24 bits of l4)
        //   dest2 = l4 >> 24
        // dest2 may be noreg; the top bits of l4 are then not stored, and
        // ASSERT builds emit a check that l4 >> 24 is zero.
        // Clobbers rscratch1 and rscratch2.  The fields are combined with add,
        // so the layout above presumably relies on every limb being below 2^26.
11501   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
11502     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
11503     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
11504     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
11505     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
11506 
11507     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
11508     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
11509     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
11510     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
11511 
11512     if (dest2->is_valid()) {
11513       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
11514     } else {
11515 #ifdef ASSERT
            // Two-register form: verify at run time that nothing is lost.
11516       Label OK;
11517       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
11518       __ br(__ EQ, OK);
11519       __ stop("high bits of Poly1305 integer should be zero");
11520       __ should_not_reach_here();
11521       __ bind(OK);
11522 #endif
11523     }
11524   }
11525 
11526   // As above, but return only a 128-bit integer, packed into two
11527   // 64-bit registers.
        // Forwards to the four-argument overload with dest2 == noreg.
11528   void pack_26(Register dest0, Register dest1, Register src) {
11529     pack_26(dest0, dest1, noreg, src);
11530   }
11531 
11532   // Multiply and multiply-accumulate unsigned 64-bit registers.
        // wide_mul: prod_hi:prod_lo = n * m (mul gives the low 64 bits, umulh
        // the high 64 bits).  prod_lo is written before n and m are read
        // again, so it must not alias either input.
11533   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
11534     __ mul(prod_lo, n, m);
11535     __ umulh(prod_hi, n, m);
11536   }
11537   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
          // sum_hi:sum_lo += n * m.  The product is formed in
          // rscratch2:rscratch1 (both clobbered).  The final add is adc, not
          // adcs, so a carry out of sum_hi is not recorded.
11538     wide_mul(rscratch1, rscratch2, n, m);
11539     __ adds(sum_lo, sum_lo, rscratch1);
11540     __ adc(sum_hi, sum_hi, rscratch2);
11541   }
11542 
11543   // Poly1305, RFC 7539
11544 
11545   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
11546   // description of the tricks used to simplify and accelerate this
11547   // computation.
11548 
        // Generates the Poly1305 block-processing stub and returns its entry
        // (or the address given by load_archive_data() when that is non-null).
        // The stub takes, in the first four registers handed out by the
        // iterator below: the input pointer, the byte length, the accumulator
        // (long[5] limbs) and the key r (long[5] limbs).  It consumes the
        // input in 16-byte blocks while length >= 16 and writes the
        // accumulator back as five 26-bit limbs.  If length < 16 on entry the
        // stub branches straight to DONE and the accumulator is not rewritten.
11549   address generate_poly1305_processBlocks() {
11550     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
11551     int entry_count = StubInfo::entry_count(stub_id);
11552     assert(entry_count == 1, "sanity check");
11553     address start = load_archive_data(stub_id);
11554     if (start != nullptr) {
11555       return start;
11556     }
11557     __ align(CodeEntryAlignment);
11558     StubCodeMark mark(this, stub_id);
11559     start = __ pc();
          // NOTE(review): `here` is never bound or referenced in this function.
11560     Label here;
11561     __ enter();
11562     RegSet callee_saved = RegSet::range(r19, r28);
11563     __ push(callee_saved, sp);
11564 
          // Registers are handed out one at a time from this iterator;
          // r18_tls and the two scratch registers are excluded.
11565     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
11566 
11567     // Arguments
11568     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
11569 
11570     // R_n is the 128-bit randomly-generated key, packed into two
11571     // registers.  The caller passes this key to us as long[5], with
11572     // BITS_PER_LIMB = 26.
11573     const Register R_0 = *++regs, R_1 = *++regs;
11574     pack_26(R_0, R_1, r_start);
11575 
11576     // RR_n is (R_n >> 2) * 5
11577     const Register RR_0 = *++regs, RR_1 = *++regs;
11578     __ lsr(RR_0, R_0, 2);
11579     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
11580     __ lsr(RR_1, R_1, 2);
11581     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
11582 
11583     // U_n is the current checksum
11584     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
11585     pack_26(U_0, U_1, U_2, acc_start);
11586 
11587     static constexpr int BLOCK_LENGTH = 16;
11588     Label DONE, LOOP;
11589 
11590     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
11591     __ br(Assembler::LT, DONE); {
11592       __ bind(LOOP);
11593 
11594       // S_n is to be the sum of U_n and the next block of data
11595       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
11596       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
11597       __ adds(S_0, U_0, S_0);
11598       __ adcs(S_1, U_1, S_1);
11599       __ adc(S_2, U_2, zr);
            // Adds 1 to the third word, i.e. 2^128 to the sum -- presumably the
            // bit Poly1305 appends to each full block (confirm against RFC).
11600       __ add(S_2, S_2, 1);
11601 
11602       const Register U_0HI = *++regs, U_1HI = *++regs;
11603 
11604       // NB: this logic depends on some of the special properties of
11605       // Poly1305 keys. In particular, because we know that the top
11606       // four bits of R_0 and R_1 are zero, we can add together
11607       // partial products without any risk of needing to propagate a
11608       // carry out.
11609       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
11610       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
            // U_2 = S_2 * (R_0 & 3)
11611       __ andr(U_2, R_0, 3);
11612       __ mul(U_2, S_2, U_2);
11613 
11614       // Recycle registers S_0, S_1, S_2
11615       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
11616 
11617       // Partial reduction mod 2**130 - 5
11618       __ adds(U_1, U_0HI, U_1);
11619       __ adc(U_2, U_1HI, U_2);
11620       // Sum now in U_2:U_1:U_0.
11621       // Dead: U_0HI, U_1HI.
11622       regs = (regs.remaining() + U_0HI + U_1HI).begin();
11623 
11624       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
11625 
11626       // First, U_2:U_1:U_0 += (U_2 >> 2)
            // (rscratch1 keeps U_2 >> 2; U_2 itself is cut down to its low 2 bits)
11627       __ lsr(rscratch1, U_2, 2);
11628       __ andr(U_2, U_2, (u8)3);
11629       __ adds(U_0, U_0, rscratch1);
11630       __ adcs(U_1, U_1, zr);
11631       __ adc(U_2, U_2, zr);
11632       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
11633       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
11634       __ adcs(U_1, U_1, zr);
11635       __ adc(U_2, U_2, zr);
11636 
11637       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
11638       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
            // ~LT is the negated condition: loop while length >= BLOCK_LENGTH.
11639       __ br(~ Assembler::LT, LOOP);
11640     }
11641 
11642     // Further reduce modulo 2^130 - 5
11643     __ lsr(rscratch1, U_2, 2);
11644     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = (U_2 >> 2) * 5
11645     __ adds(U_0, U_0, rscratch1); // U_0 += (U_2 >> 2) * 5
11646     __ adcs(U_1, U_1, zr);
11647     __ andr(U_2, U_2, (u1)3);
11648     __ adc(U_2, U_2, zr);
11649 
11650     // Unpack the sum into five 26-bit limbs and write to memory.
          // limb0 = U_0[25:0], limb1 = U_0[51:26],
          // limb2 = U_0[63:52] | U_1[13:0] << 12, limb3 = U_1[39:14],
          // limb4 = U_1[63:40] | U_2[2:0] << 24
11651     __ ubfiz(rscratch1, U_0, 0, 26);
11652     __ ubfx(rscratch2, U_0, 26, 26);
11653     __ stp(rscratch1, rscratch2, Address(acc_start));
11654     __ ubfx(rscratch1, U_0, 52, 12);
11655     __ bfi(rscratch1, U_1, 12, 14);
11656     __ ubfx(rscratch2, U_1, 14, 26);
11657     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
11658     __ ubfx(rscratch1, U_1, 40, 24);
11659     __ bfi(rscratch1, U_2, 24, 3);
11660     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
11661 
11662     __ bind(DONE);
11663     __ pop(callee_saved, sp);
11664     __ leave();
11665     __ ret(lr);
11666 
11667     // record the stub start and end
11668     store_archive_data(stub_id, start, __ pc());
11669 
11670     return start;
11671   }
11672 
11673   // exception handler for upcall stubs
        // Returns the stub entry (or the address given by load_archive_data()
        // when that is non-null).  The emitted code verifies the oop in r0,
        // calls UpcallLinker::handle_uncaught_exception and is not expected to
        // continue past that call.
11674   address generate_upcall_stub_exception_handler() {
11675     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
11676     int entry_count = StubInfo::entry_count(stub_id);
11677     assert(entry_count == 1, "sanity check");
11678     address start = load_archive_data(stub_id);
11679     if (start != nullptr) {
11680       return start;
11681     }
11682     StubCodeMark mark(this, stub_id);
11683     start = __ pc();
11684 
11685     // Native caller has no idea how to handle exceptions,
11686     // so we just crash here. Up to callee to catch exceptions.
11687     __ verify_oop(r0);
11688     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
11689     __ blr(rscratch1);
11690     __ should_not_reach_here();
11691 
11692     // record the stub start and end
11693     store_archive_data(stub_id, start, __ pc());
11694 
11695     return start;
11696   }
11697 
11698   // load Method* target of MethodHandle
11699   // j_rarg0 = jobject receiver
11700   // rmethod = result
        // The emitted code follows receiver.form -> .vmentry -> .method ->
        // .vmtarget, stores the result in the thread's callee_target slot as
        // well as leaving it in rmethod, and returns.  rscratch1 and rscratch2
        // are used as temporaries.
11701   address generate_upcall_stub_load_target() {
11702     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
11703     int entry_count = StubInfo::entry_count(stub_id);
11704     assert(entry_count == 1, "sanity check");
11705     address start = load_archive_data(stub_id);
11706     if (start != nullptr) {
11707       return start;
11708     }
11709     StubCodeMark mark(this, stub_id);
11710     start = __ pc();
11711 
11712     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
11713       // Load target method from receiver
11714     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
11715     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
11716     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
          // The last hop is loaded as T_ADDRESS rather than as a heap oop.
11717     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
11718                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
11719                       noreg, noreg);
11720     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
11721 
11722     __ ret(lr);
11723 
11724     // record the stub start and end
11725     store_archive_data(stub_id, start, __ pc());
11726 
11727     return start;
11728   }
11729 
11730 #undef __
11731 #define __ masm->
11732 
11733   class MontgomeryMultiplyGenerator : public MacroAssembler {
11734 
11735     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
11736       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
11737 
11738     RegSet _toSave;
11739     bool _squaring;
11740 
11741   public:
11742     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
11743       : MacroAssembler(as->code()), _squaring(squaring) {
            // Shares the code buffer of `as` and assigns every symbolic
            // register of the generator by walking r0..r26 (r18_tls excluded)
            // in iterator order.  When `squaring` is true Pb_base aliases
            // Pa_base and takes no register of its own, so every later name
            // lands one register earlier than in the multiply case.
11744 
11745       // Register allocation
11746 
11747       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
11748       Pa_base = *regs;       // Argument registers
11749       if (squaring)
11750         Pb_base = Pa_base;
11751       else
11752         Pb_base = *++regs;
11753       Pn_base = *++regs;
11754       Rlen= *++regs;
11755       inv = *++regs;
11756       Pm_base = *++regs;
11757 
11758                           // Working registers:
11759       Ra =  *++regs;        // The current digit of a, b, n, and m.
11760       Rb =  *++regs;
11761       Rm =  *++regs;
11762       Rn =  *++regs;
11763 
11764       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
11765       Pb =  *++regs;
11766       Pm =  *++regs;
11767       Pn =  *++regs;
11768 
11769       t0 =  *++regs;        // Three registers which form a
11770       t1 =  *++regs;        // triple-precision accumulator.
11771       t2 =  *++regs;
11772 
11773       Ri =  *++regs;        // Inner and outer loop indexes.
11774       Rj =  *++regs;
11775 
11776       Rhi_ab = *++regs;     // Product registers: low and high parts
11777       Rlo_ab = *++regs;     // of a*b and m*n.
11778       Rhi_mn = *++regs;
11779       Rlo_mn = *++regs;
11780 
11781       // r19 and up are callee-saved.
            // *regs is the last register handed out (Rlo_mn); Pm_base is added
            // to the save set as well.
11782       _toSave = RegSet::range(r19, *regs) + Pm_base;
11783     }
11784 
11785   private:
11786     void save_regs() {
            // Push the register set computed in the constructor (_toSave).
11787       push(_toSave, sp);
11788     }
11789 
11790     void restore_regs() {
            // Pop the same register set that save_regs() pushed.
11791       pop(_toSave, sp);
11792     }
11793 
11794     template <typename T>
11795     void unroll_2(Register count, T block) {
            // Emits a loop, unrolled by two, around the code produced by
            // `block` (a pointer to a no-argument member function, invoked
            // twice at generation time).  At run time the body executes
            // `count` times for a non-negative count: an odd count enters at
            // the second copy, a zero count skips the loop.  `count` is
            // destroyed and the flags are clobbered by the subs.
11796       Label loop, end, odd;
11797       tbnz(count, 0, odd);
11798       cbz(count, end);
11799       align(16);
11800       bind(loop);
11801       (this->*block)();
11802       bind(odd);
11803       (this->*block)();
11804       subs(count, count, 2);
11805       br(Assembler::GT, loop);
11806       bind(end);
11807     }
11808 
11809     template <typename T>
11810     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
            // Same two-way unrolled loop as the overload without extra
            // arguments, for a `block` member function taking (d, s, tmp);
            // the three registers are passed to both emitted copies.  `count`
            // is destroyed and the flags are clobbered by the subs.
11811       Label loop, end, odd;
11812       tbnz(count, 0, odd);
11813       cbz(count, end);
11814       align(16);
11815       bind(loop);
11816       (this->*block)(d, s, tmp);
11817       bind(odd);
11818       (this->*block)(d, s, tmp);
11819       subs(count, count, 2);
11820       br(Assembler::GT, loop);
11821       bind(end);
11822     }
11823 
11824     void pre1(RegisterOrConstant i) {
            // Sets up the four digit pointers and loads the first digits;
            // `i` is a word index (scaled by LogBytesPerWord in the
            // addressing).  Also clears the pending m*n product registers.
11825       block_comment("pre1");
11826       // Pa = Pa_base;
11827       // Pb = Pb_base + i;
11828       // Pm = Pm_base;
11829       // Pn = Pn_base + i;
11830       // Ra = *Pa;
11831       // Rb = *Pb;
11832       // Rm = *Pm;
11833       // Rn = *Pn;
11834       ldr(Ra, Address(Pa_base));
11835       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11836       ldr(Rm, Address(Pm_base));
11837       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11838       lea(Pa, Address(Pa_base));
11839       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11840       lea(Pm, Address(Pm_base));
11841       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11842 
11843       // Zero the m*n result.
11844       mov(Rhi_mn, zr);
11845       mov(Rlo_mn, zr);
11846     }
11847 
11848     // The core multiply-accumulate step of a Montgomery
11849     // multiplication.  The idea is to schedule operations as a
11850     // pipeline so that instructions with long latencies (loads and
11851     // multiplies) have time to complete before their results are
11852     // used.  This most benefits in-order implementations of the
11853     // architecture but out-of-order ones also benefit.
          //
          // Each step adds the m*n product left pending by the previous step
          // and this step's a*b product into t2:t1:t0, and leaves a new m*n
          // product pending in Rhi_mn:Rlo_mn for whoever runs next.
11854     void step() {
11855       block_comment("step");
11856       // MACC(Ra, Rb, t0, t1, t2);
11857       // Ra = *++Pa;
11858       // Rb = *--Pb;
11859       umulh(Rhi_ab, Ra, Rb);
11860       mul(Rlo_ab, Ra, Rb);
11861       ldr(Ra, pre(Pa, wordSize));
11862       ldr(Rb, pre(Pb, -wordSize));
11863       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11864                                        // previous iteration.
11865       // MACC(Rm, Rn, t0, t1, t2);
11866       // Rm = *++Pm;
11867       // Rn = *--Pn;
11868       umulh(Rhi_mn, Rm, Rn);
11869       mul(Rlo_mn, Rm, Rn);
11870       ldr(Rm, pre(Pm, wordSize));
11871       ldr(Rn, pre(Pn, -wordSize));
11872       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11873     }
11874 
11875     void post1() {
            // Accumulates the last a*b product and the pending m*n product,
            // computes and stores the new digit Rm = t0 * inv at *Pm, folds
            // in Rm * Rn and shifts the accumulator down one word
            // (t0 = t1, t1 = t2, t2 = 0).
11876       block_comment("post1");
11877 
11878       // MACC(Ra, Rb, t0, t1, t2);
11879       // Ra = *++Pa;
11880       // Rb = *--Pb;
11881       umulh(Rhi_ab, Ra, Rb);
11882       mul(Rlo_ab, Ra, Rb);
11883       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11884       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11885 
11886       // *Pm = Rm = t0 * inv;
11887       mul(Rm, t0, inv);
11888       str(Rm, Address(Pm));
11889 
11890       // MACC(Rm, Rn, t0, t1, t2);
11891       // t0 = t1; t1 = t2; t2 = 0;
11892       umulh(Rhi_mn, Rm, Rn);
11893 
11894 #ifndef PRODUCT
11895       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11896       {
11897         mul(Rlo_mn, Rm, Rn);
11898         add(Rlo_mn, t0, Rlo_mn);
11899         Label ok;
11900         cbz(Rlo_mn, ok); {
11901           stop("broken Montgomery multiply");
11902         } bind(ok);
11903       }
11904 #endif
11905       // We have very carefully set things up so that
11906       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11907       // the lower half of Rm * Rn because we know the result already:
11908       // it must be -t0.  t0 + (-t0) must generate a carry iff
11909       // t0 != 0.  So, rather than do a mul and an adds we just set
11910       // the carry flag iff t0 is nonzero.
11911       //
11912       // mul(Rlo_mn, Rm, Rn);
11913       // adds(zr, t0, Rlo_mn);
11914       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11915       adcs(t0, t1, Rhi_mn);
11916       adc(t1, t2, zr);
11917       mov(t2, zr);
11918     }
11919 
11920     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
            // Positions the digit pointers from word indexes i and len, then
            // performs the first pre-increment/pre-decrement loads.  Rj is
            // left holding i - len; the pending m*n product is cleared.
11921       block_comment("pre2");
11922       // Pa = Pa_base + i-len;
11923       // Pb = Pb_base + len;
11924       // Pm = Pm_base + i-len;
11925       // Pn = Pn_base + len;
11926 
11927       if (i.is_register()) {
11928         sub(Rj, i.as_register(), len);
11929       } else {
11930         mov(Rj, i.as_constant());
11931         sub(Rj, Rj, len);
11932       }
11933       // Rj == i-len
11934 
11935       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11936       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11937       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11938       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11939 
11940       // Ra = *++Pa;
11941       // Rb = *--Pb;
11942       // Rm = *++Pm;
11943       // Rn = *--Pn;
11944       ldr(Ra, pre(Pa, wordSize));
11945       ldr(Rb, pre(Pb, -wordSize));
11946       ldr(Rm, pre(Pm, wordSize));
11947       ldr(Rn, pre(Pn, -wordSize));
11948 
11949       mov(Rhi_mn, zr);
11950       mov(Rlo_mn, zr);
11951     }
11952 
11953     void post2(RegisterOrConstant i, RegisterOrConstant len) {
            // Adds the pending m*n product, stores the finished low word to
            // Pm_base[i-len] and shifts the accumulator down one word.
            // When i is a constant, len is read with as_constant() too, so it
            // must also be a constant in that case.
11954       block_comment("post2");
11955       if (i.is_constant()) {
11956         mov(Rj, i.as_constant()-len.as_constant());
11957       } else {
11958         sub(Rj, i.as_register(), len);
11959       }
11960 
11961       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11962 
11963       // As soon as we know the least significant digit of our result,
11964       // store it.
11965       // Pm_base[i-len] = t0;
11966       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11967 
11968       // t0 = t1; t1 = t2; t2 = 0;
            // (the carry from the adds above feeds the adcs below; the str in
            // between is not a flag-setting instruction)
11969       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11970       adc(t1, t2, zr);
11971       mov(t2, zr);
11972     }
11973 
11974     // A carry in t0 after Montgomery multiplication means that we
11975     // should subtract multiples of n from our result in m.  We'll
11976     // keep doing that until there is no carry.
          //
          // len is the number of words in m and n.  Clobbers Rm, Rn, t1, t2
          // and the flags; t0 is zero on exit.
11977     void normalize(RegisterOrConstant len) {
11978       block_comment("normalize");
11979       // while (t0)
11980       //   t0 = sub(Pm_base, Pn_base, t0, len);
11981       Label loop, post, again;
11982       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11983       cbz(t0, post); {
11984         bind(again); {
11985           mov(i, zr);
11986           mov(cnt, len);
11987           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11988           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11989           subs(zr, zr, zr); // set carry flag, i.e. no borrow
11990           align(16);
11991           bind(loop); {
                  // Only sbcs writes the flags in this loop, so the borrow is
                  // carried from one word to the next.
                  // NOTE(review): the loads below run after i is incremented,
                  // so the final iteration also reads word [len] of both
                  // arrays; those values are not used afterwards.
11992             sbcs(Rm, Rm, Rn);
11993             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11994             add(i, i, 1);
11995             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11996             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11997             sub(cnt, cnt, 1);
11998           } cbnz(cnt, loop);
                // Propagate the final borrow into the carry word t0.
11999           sbc(t0, t0, zr);
12000         } cbnz(t0, again);
12001       } bind(post);
12002     }
12003 
12004     // Move memory at s to d, reversing words.
12005     //    Increments d to end of copied memory
12006     //    Destroys tmp1, tmp2
12007     //    Preserves len
12008     //    Leaves s pointing to the address which was in d at start
          // len counts 64-bit words.  The copy walks s backwards from
          // s + len words while d advances.  tmp1 and tmp2 must be registers
          // below r19 (asserted).
12009     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
12010       assert(tmp1->encoding() < r19->encoding(), "register corruption");
12011       assert(tmp2->encoding() < r19->encoding(), "register corruption");
12012 
12013       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
            // unroll_2 destroys its counter, so count in a copy of len.
12014       mov(tmp1, len);
12015       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
            // d is now one past the copy; step back len words to its start.
12016       sub(s, d, len, ext::uxtw, LogBytesPerWord);
12017     }
12018     // where
          // reverse1 copies one 64-bit word: s is pre-decremented, the word is
          // rotated by 32 bits (its two 32-bit halves swap) and stored at d,
          // which is post-incremented.
12019     void reverse1(Register d, Register s, Register tmp) {
12020       ldr(tmp, pre(s, -wordSize));
12021       ror(tmp, tmp, 32);
12022       str(tmp, post(d, wordSize));
12023     }
12024 
12025     void step_squaring() {
12026       // An extra ACC
            // step() has already added Rhi_ab:Rlo_ab once; the acc below adds
            // the same a*b product a second time.
12027       step();
12028       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12029     }
12030 
12031     void last_squaring(RegisterOrConstant i) {
            // When bit 0 of i is clear, accumulates one more Ra * Rb product;
            // otherwise emits a skip over it.  i must be a register here
            // (as_register() is called unconditionally).
12032       Label dont;
12033       // if ((i & 1) == 0) {
12034       tbnz(i.as_register(), 0, dont); {
12035         // MACC(Ra, Rb, t0, t1, t2);
12036         // Ra = *++Pa;
12037         // Rb = *--Pb;
12038         umulh(Rhi_ab, Ra, Rb);
12039         mul(Rlo_ab, Ra, Rb);
12040         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12041       } bind(dont);
12042     }
12043 
12044     void extra_step_squaring() {
            // The m*n half of step() on its own: add the pending product,
            // form the next one and advance Pm / Pn.  No a*b work is done.
12045       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
12046 
12047       // MACC(Rm, Rn, t0, t1, t2);
12048       // Rm = *++Pm;
12049       // Rn = *--Pn;
12050       umulh(Rhi_mn, Rm, Rn);
12051       mul(Rlo_mn, Rm, Rn);
12052       ldr(Rm, pre(Pm, wordSize));
12053       ldr(Rn, pre(Pn, -wordSize));
12054     }
12055 
12056     void post1_squaring() {
            // Same as post1() except that no a*b product is formed or
            // accumulated here; only the pending m*n product is added before
            // the new digit Rm = t0 * inv is computed and stored.
12057       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
12058 
12059       // *Pm = Rm = t0 * inv;
12060       mul(Rm, t0, inv);
12061       str(Rm, Address(Pm));
12062 
12063       // MACC(Rm, Rn, t0, t1, t2);
12064       // t0 = t1; t1 = t2; t2 = 0;
12065       umulh(Rhi_mn, Rm, Rn);
12066 
12067 #ifndef PRODUCT
12068       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
12069       {
12070         mul(Rlo_mn, Rm, Rn);
12071         add(Rlo_mn, t0, Rlo_mn);
12072         Label ok;
12073         cbz(Rlo_mn, ok); {
12074           stop("broken Montgomery multiply");
12075         } bind(ok);
12076       }
12077 #endif
12078       // We have very carefully set things up so that
12079       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
12080       // the lower half of Rm * Rn because we know the result already:
12081       // it must be -t0.  t0 + (-t0) must generate a carry iff
12082       // t0 != 0.  So, rather than do a mul and an adds we just set
12083       // the carry flag iff t0 is nonzero.
12084       //
12085       // mul(Rlo_mn, Rm, Rn);
12086       // adds(zr, t0, Rlo_mn);
12087       subs(zr, t0, 1); // Set carry iff t0 is nonzero
12088       adcs(t0, t1, Rhi_mn);
12089       adc(t1, t2, zr);
12090       mov(t2, zr);
12091     }
12092 
12093     void acc(Register Rhi, Register Rlo,
12094              Register t0, Register t1, Register t2) {
            // t2:t1:t0 += Rhi:Rlo, with the carry propagated through all three
            // words.  The t0/t1/t2 parameters shadow the members of the same
            // name.
12095       adds(t0, t0, Rlo);
12096       adcs(t1, t1, Rhi);
12097       adc(t2, t2, zr);
12098     }
12099 
12100   public:
12101     /**
12102      * Fast Montgomery multiplication.  The derivation of the
12103      * algorithm is in A Cryptographic Library for the Motorola
12104      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
12105      *
12106      * Arguments:
12107      *
12108      * Inputs for multiplication:
12109      *   c_rarg0   - int array elements a
12110      *   c_rarg1   - int array elements b
12111      *   c_rarg2   - int array elements n (the modulus)
12112      *   c_rarg3   - int length
12113      *   c_rarg4   - int inv
12114      *   c_rarg5   - int array elements m (the result)
12115      *
12116      * Inputs for squaring:
12117      *   c_rarg0   - int array elements a
12118      *   c_rarg1   - int array elements n (the modulus)
12119      *   c_rarg2   - int length
12120      *   c_rarg3   - int inv
12121      *   c_rarg4   - int array elements m (the result)
12122      *
12123      */
12124     address generate_multiply() {  // Emits the stub at the current pc and returns its entry address
12125       Label argh, nothing;
12126
12127       align(CodeEntryAlignment);
12128       address entry = pc();
12129
12130       cbzw(Rlen, nothing);  // Zero length: return straight away, before any frame is built
12131
12132       enter();
12133
12134       // Make room.
12135       cmpw(Rlen, 512);  // More than 512 ints would need more than 512 * 16 = 8192 bytes of scratch
12136       br(Assembler::HI, argh);
12137       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));  // Ra = sp - len * 4 * sizeof(jint)
12138       andr(sp, Ra, -2 * wordSize);  // Round the new sp down to a 2 * wordSize boundary
12139
12140       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
12141
12142       {
12143         // Copy input args, reversing as we go.  We use Ra as a
12144         // temporary variable.
12145         reverse(Ra, Pa_base, Rlen, t0, t1);
12146         if (!_squaring)
12147           reverse(Ra, Pb_base, Rlen, t0, t1);
12148         reverse(Ra, Pn_base, Rlen, t0, t1);
12149       }
12150
12151       // Push all call-saved registers and also Pm_base which we'll need
12152       // at the end.
12153       save_regs();
12154
12155 #ifndef PRODUCT
12156       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
12157       {
12158         ldr(Rn, Address(Pn_base, 0));
12159         mul(Rlo_mn, Rn, inv);
12160         subs(zr, Rlo_mn, -1);
12161         Label ok;
12162         br(EQ, ok); {
12163           stop("broken inverse in Montgomery multiply");
12164         } bind(ok);
12165       }
12166 #endif
12167
12168       mov(Pm_base, Ra);  // From here Pm_base is the value left in Ra; the caller's Pm_base was pushed by save_regs
12169
12170       mov(t0, zr);  // Clear the triple-precision accumulator t0, t1, t2
12171       mov(t1, zr);
12172       mov(t2, zr);
12173
12174       block_comment("for (int i = 0; i < len; i++) {");
12175       mov(Ri, zr); {
12176         Label loop, end;
12177         cmpw(Ri, Rlen);  // Guard evaluated once up front; the loop itself is bottom-tested
12178         br(Assembler::GE, end);
12179
12180         bind(loop);
12181         pre1(Ri);
12182
12183         block_comment("  for (j = i; j; j--) {"); {
12184           movw(Rj, Ri);
12185           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12186         } block_comment("  } // j");
12187
12188         post1();
12189         addw(Ri, Ri, 1);
12190         cmpw(Ri, Rlen);
12191         br(Assembler::LT, loop);
12192         bind(end);
12193         block_comment("} // i");
12194       }
12195
12196       block_comment("for (int i = len; i < 2*len; i++) {");
12197       mov(Ri, Rlen); {
12198         Label loop, end;
12199         cmpw(Ri, Rlen, Assembler::LSL, 1);  // Compare i against 2 * len
12200         br(Assembler::GE, end);
12201
12202         bind(loop);
12203         pre2(Ri, Rlen);
12204
12205         block_comment("  for (j = len*2-i-1; j; j--) {"); {
12206           lslw(Rj, Rlen, 1);
12207           subw(Rj, Rj, Ri);
12208           subw(Rj, Rj, 1);
12209           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12210         } block_comment("  } // j");
12211
12212         post2(Ri, Rlen);
12213         addw(Ri, Ri, 1);
12214         cmpw(Ri, Rlen, Assembler::LSL, 1);
12215         br(Assembler::LT, loop);
12216         bind(end);
12217       }
12218       block_comment("} // i");
12219
12220       normalize(Rlen);
12221
12222       mov(Ra, Pm_base);  // Save Pm_base in Ra
12223       restore_regs();  // Restore caller's Pm_base
12224
12225       // Copy our result into caller's Pm_base
12226       reverse(Pm_base, Ra, Rlen, t0, t1);
12227
12228       leave();
12229       bind(nothing);  // The zero-length path joins here, after leave(), because it never called enter()
12230       ret(lr);
12231
12232       // handler for error case
12233       bind(argh);  // Placed after the ret so it is only reached by the branch taken when len > 512
12234       stop("MontgomeryMultiply total_allocation must be <= 8192");
12235
12236       return entry;
12237     }
12238     // In C, approximately:
12239 
12240     // void
12241     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
12242     //                     julong Pn_base[], julong Pm_base[],
12243     //                     julong inv, int len) {
12244     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12245     //   julong *Pa, *Pb, *Pn, *Pm;
12246     //   julong Ra, Rb, Rn, Rm;
12247 
12248     //   int i;
12249 
12250     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12251 
12252     //   for (i = 0; i < len; i++) {
12253     //     int j;
12254 
12255     //     Pa = Pa_base;
12256     //     Pb = Pb_base + i;
12257     //     Pm = Pm_base;
12258     //     Pn = Pn_base + i;
12259 
12260     //     Ra = *Pa;
12261     //     Rb = *Pb;
12262     //     Rm = *Pm;
12263     //     Rn = *Pn;
12264 
12265     //     int iters = i;
12266     //     for (j = 0; iters--; j++) {
12267     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12268     //       MACC(Ra, Rb, t0, t1, t2);
12269     //       Ra = *++Pa;
12270     //       Rb = *--Pb;
12271     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12272     //       MACC(Rm, Rn, t0, t1, t2);
12273     //       Rm = *++Pm;
12274     //       Rn = *--Pn;
12275     //     }
12276 
12277     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
12278     //     MACC(Ra, Rb, t0, t1, t2);
12279     //     *Pm = Rm = t0 * inv;
12280     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12281     //     MACC(Rm, Rn, t0, t1, t2);
12282 
12283     //     assert(t0 == 0, "broken Montgomery multiply");
12284 
12285     //     t0 = t1; t1 = t2; t2 = 0;
12286     //   }
12287 
12288     //   for (i = len; i < 2*len; i++) {
12289     //     int j;
12290 
12291     //     Pa = Pa_base + i-len;
12292     //     Pb = Pb_base + len;
12293     //     Pm = Pm_base + i-len;
12294     //     Pn = Pn_base + len;
12295 
12296     //     Ra = *++Pa;
12297     //     Rb = *--Pb;
12298     //     Rm = *++Pm;
12299     //     Rn = *--Pn;
12300 
12301     //     int iters = len*2-i-1;
12302     //     for (j = i-len+1; iters--; j++) {
12303     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12304     //       MACC(Ra, Rb, t0, t1, t2);
12305     //       Ra = *++Pa;
12306     //       Rb = *--Pb;
12307     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12308     //       MACC(Rm, Rn, t0, t1, t2);
12309     //       Rm = *++Pm;
12310     //       Rn = *--Pn;
12311     //     }
12312 
12313     //     Pm_base[i-len] = t0;
12314     //     t0 = t1; t1 = t2; t2 = 0;
12315     //   }
12316 
12317     //   while (t0)
12318     //     t0 = sub(Pm_base, Pn_base, t0, len);
12319     // }
12320 
12321     /**
12322      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
12323      * multiplies than Montgomery multiplication so it should be up to
12324      * 25% faster.  However, its loop control is more complex and it
12325      * may actually run slower on some machines.
12326      *
12327      * Arguments:
12328      *
12329      * Inputs:
12330      *   c_rarg0   - int array elements a
12331      *   c_rarg1   - int array elements n (the modulus)
12332      *   c_rarg2   - int length
12333      *   c_rarg3   - int inv
12334      *   c_rarg4   - int array elements m (the result)
12335      *
12336      */
12337     address generate_square() {  // Emits the squaring stub at the current pc and returns its entry address
12338       Label argh;
12339       // NOTE(review): unlike generate_multiply() there is no early exit for Rlen == 0 and no non-PRODUCT check of inv.
12340       align(CodeEntryAlignment);
12341       address entry = pc();
12342
12343       enter();
12344
12345       // Make room.
12346       cmpw(Rlen, 512);  // More than 512 ints would need more than 512 * 16 = 8192 bytes of scratch
12347       br(Assembler::HI, argh);
12348       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));  // Ra = sp - len * 4 * sizeof(jint)
12349       andr(sp, Ra, -2 * wordSize);  // Round the new sp down to a 2 * wordSize boundary
12350
12351       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
12352
12353       {
12354         // Copy input args, reversing as we go.  We use Ra as a
12355         // temporary variable.
12356         reverse(Ra, Pa_base, Rlen, t0, t1);
12357         reverse(Ra, Pn_base, Rlen, t0, t1);
12358       }
12359
12360       // Push all call-saved registers and also Pm_base which we'll need
12361       // at the end.
12362       save_regs();
12363
12364       mov(Pm_base, Ra);  // From here Pm_base is the value left in Ra; the caller's Pm_base was pushed by save_regs
12365
12366       mov(t0, zr);  // Clear the triple-precision accumulator t0, t1, t2
12367       mov(t1, zr);
12368       mov(t2, zr);
12369
12370       block_comment("for (int i = 0; i < len; i++) {");
12371       mov(Ri, zr); {
12372         Label loop, end;
12373         bind(loop);
12374         cmp(Ri, Rlen);  // Tested at the top here as well as at the bottom of the loop
12375         br(Assembler::GE, end);
12376
12377         pre1(Ri);
12378
12379         block_comment("for (j = (i+1)/2; j; j--) {"); {
12380           add(Rj, Ri, 1);
12381           lsr(Rj, Rj, 1);
12382           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
12383         } block_comment("  } // j");
12384
12385         last_squaring(Ri);
12386
12387         block_comment("  for (j = i/2; j; j--) {"); {
12388           lsr(Rj, Ri, 1);
12389           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
12390         } block_comment("  } // j");
12391
12392         post1_squaring();
12393         add(Ri, Ri, 1);
12394         cmp(Ri, Rlen);
12395         br(Assembler::LT, loop);
12396
12397         bind(end);
12398         block_comment("} // i");
12399       }
12400
12401       block_comment("for (int i = len; i < 2*len; i++) {");
12402       mov(Ri, Rlen); {
12403         Label loop, end;
12404         bind(loop);
12405         cmp(Ri, Rlen, Assembler::LSL, 1);  // Compare i against 2 * len
12406         br(Assembler::GE, end);
12407
12408         pre2(Ri, Rlen);
12409
12410         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
12411           lsl(Rj, Rlen, 1);
12412           sub(Rj, Rj, Ri);
12413           sub(Rj, Rj, 1);
12414           lsr(Rj, Rj, 1);
12415           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
12416         } block_comment("  } // j");
12417
12418         last_squaring(Ri);
12419
12420         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
12421           lsl(Rj, Rlen, 1);
12422           sub(Rj, Rj, Ri);
12423           lsr(Rj, Rj, 1);
12424           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
12425         } block_comment("  } // j");
12426
12427         post2(Ri, Rlen);
12428         add(Ri, Ri, 1);
12429         cmp(Ri, Rlen, Assembler::LSL, 1);
12430
12431         br(Assembler::LT, loop);
12432         bind(end);
12433         block_comment("} // i");
12434       }
12435
12436       normalize(Rlen);
12437
12438       mov(Ra, Pm_base);  // Save Pm_base in Ra
12439       restore_regs();  // Restore caller's Pm_base
12440
12441       // Copy our result into caller's Pm_base
12442       reverse(Pm_base, Ra, Rlen, t0, t1);
12443
12444       leave();
12445       ret(lr);
12446
12447       // handler for error case
12448       bind(argh);  // Placed after the ret so it is only reached by the branch taken when len > 512
12449       stop("MontgomeryMultiply total_allocation must be <= 8192");
12450
12451       return entry;
12452     }
12453     // In C, approximately:
12454 
12455     // void
12456     // montgomery_square(julong Pa_base[], julong Pn_base[],
12457     //                   julong Pm_base[], julong inv, int len) {
12458     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12459     //   julong *Pa, *Pb, *Pn, *Pm;
12460     //   julong Ra, Rb, Rn, Rm;
12461 
12462     //   int i;
12463 
12464     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12465 
12466     //   for (i = 0; i < len; i++) {
12467     //     int j;
12468 
12469     //     Pa = Pa_base;
12470     //     Pb = Pa_base + i;
12471     //     Pm = Pm_base;
12472     //     Pn = Pn_base + i;
12473 
12474     //     Ra = *Pa;
12475     //     Rb = *Pb;
12476     //     Rm = *Pm;
12477     //     Rn = *Pn;
12478 
12479     //     int iters = (i+1)/2;
12480     //     for (j = 0; iters--; j++) {
12481     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12482     //       MACC2(Ra, Rb, t0, t1, t2);
12483     //       Ra = *++Pa;
12484     //       Rb = *--Pb;
12485     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12486     //       MACC(Rm, Rn, t0, t1, t2);
12487     //       Rm = *++Pm;
12488     //       Rn = *--Pn;
12489     //     }
12490     //     if ((i & 1) == 0) {
12491     //       assert(Ra == Pa_base[j], "must be");
12492     //       MACC(Ra, Ra, t0, t1, t2);
12493     //     }
12494     //     iters = i/2;
12495     //     assert(iters == i-j, "must be");
12496     //     for (; iters--; j++) {
12497     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12498     //       MACC(Rm, Rn, t0, t1, t2);
12499     //       Rm = *++Pm;
12500     //       Rn = *--Pn;
12501     //     }
12502 
12503     //     *Pm = Rm = t0 * inv;
12504     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12505     //     MACC(Rm, Rn, t0, t1, t2);
12506 
12507     //     assert(t0 == 0, "broken Montgomery multiply");
12508 
12509     //     t0 = t1; t1 = t2; t2 = 0;
12510     //   }
12511 
12512     //   for (i = len; i < 2*len; i++) {
12513     //     int start = i-len+1;
12514     //     int end = start + (len - start)/2;
12515     //     int j;
12516 
12517     //     Pa = Pa_base + i-len;
12518     //     Pb = Pa_base + len;
12519     //     Pm = Pm_base + i-len;
12520     //     Pn = Pn_base + len;
12521 
12522     //     Ra = *++Pa;
12523     //     Rb = *--Pb;
12524     //     Rm = *++Pm;
12525     //     Rn = *--Pn;
12526 
12527     //     int iters = (2*len-i-1)/2;
12528     //     assert(iters == end-start, "must be");
12529     //     for (j = start; iters--; j++) {
12530     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12531     //       MACC2(Ra, Rb, t0, t1, t2);
12532     //       Ra = *++Pa;
12533     //       Rb = *--Pb;
12534     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12535     //       MACC(Rm, Rn, t0, t1, t2);
12536     //       Rm = *++Pm;
12537     //       Rn = *--Pn;
12538     //     }
12539     //     if ((i & 1) == 0) {
12540     //       assert(Ra == Pa_base[j], "must be");
12541     //       MACC(Ra, Ra, t0, t1, t2);
12542     //     }
12543     //     iters =  (2*len-i)/2;
12544     //     assert(iters == len-j, "must be");
12545     //     for (; iters--; j++) {
12546     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12547     //       MACC(Rm, Rn, t0, t1, t2);
12548     //       Rm = *++Pm;
12549     //       Rn = *--Pn;
12550     //     }
12551     //     Pm_base[i-len] = t0;
12552     //     t0 = t1; t1 = t2; t2 = 0;
12553     //   }
12554 
12555     //   while (t0)
12556     //     t0 = sub(Pm_base, Pn_base, t0, len);
12557     // }
12558   };
12559 
12560   // Initialization
12561   void generate_preuniverse_stubs() {  // Called for BlobId::stubgen_preuniverse_id; deliberately emits nothing
12562     // preuniverse stubs are not needed for aarch64
12563   }
12564 
12565   void generate_initial_stubs() {  // Called for BlobId::stubgen_initial_id
12566     // Generate initial stubs and initializes the entry points
12567
12568     // Entry points that exist in all platforms.  Note: This is code
12569     // that could be shared among different platforms - however the
12570     // benefit seems to be smaller than the disadvantage of having a
12571     // much more complicated generator structure. See also comment in
12572     // stubRoutines.hpp.
12573
12574     StubRoutines::_forward_exception_entry = generate_forward_exception();
12575
12576     StubRoutines::_call_stub_entry =
12577       generate_call_stub(StubRoutines::_call_stub_return_address);
12578
12579     // is referenced by megamorphic call
12580     StubRoutines::_catch_exception_entry = generate_catch_exception();
12581
12582     // Initialize table for copy memory (arraycopy) check.
12583     if (UnsafeMemoryAccess::_table == nullptr) {  // Create the table only if it does not exist yet
12584       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
12585     }
12586
12587     if (UseCRC32Intrinsics) {
12588       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
12589     }
12590
12591     if (UseCRC32CIntrinsics) {
12592       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
12593     }
12594
12595     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
12596       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
12597     }
12598
12599     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
12600       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
12601     }
12602     // The two float16 conversion stubs are generated only when both intrinsics are available.
12603     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
12604         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
12605       StubRoutines::_hf2f = generate_float16ToFloat();
12606       StubRoutines::_f2hf = generate_floatToFloat16();
12607     }
12608   }
12609 
12610   void generate_continuation_stubs() {  // Called for BlobId::stubgen_continuation_id
12611     // Continuation stubs: all four are generated unconditionally and their entries recorded in StubRoutines.
12612     StubRoutines::_cont_thaw          = generate_cont_thaw();
12613     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
12614     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
12615     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
12616   }
12617 
12618   void generate_final_stubs() {  // Called for BlobId::stubgen_final_id
12619     // support for verify_oop (must happen after universe_init)
12620     if (VerifyOops) {
12621       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
12622     }
12623
12624     // arraycopy stubs used by compilers
12625     generate_arraycopy_stubs();
12626
12627     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
12628
12629     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
12630
12631     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
12632     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
12633
12634 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
12635     // Only on Linux builds compiled without __ARM_FEATURE_ATOMICS.
12636     generate_atomic_entry_points();
12637
12638 #endif // LINUX
12639
12640 #ifdef COMPILER2
12641     if (UseSecondarySupersTable) {
12642       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
12643       if (! InlineSecondarySupersTest) {
12644         generate_lookup_secondary_supers_table_stub();
12645       }
12646     }
12647 #endif
12648
12649     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_setMemory)) {
12650       StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
12651     }
12652
12653     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
12654   }
12655 
12656   void generate_compiler_stubs() {  // Called for BlobId::stubgen_compiler_id; the body is empty unless COMPILER2 is defined
12657 #ifdef COMPILER2
12658
12659     if (UseSVE == 0) {
12660       generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
12661     }
12662
12663     // array equals stub for large arrays.
12664     if (!UseSimpleArrayEquals) {
12665       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
12666     }
12667
12668     // arrays_hashcode stub for large arrays, one per element type.
12669     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
12670     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
12671     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
12672     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
12673     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
12674
12675     // byte_array_inflate stub for large arrays.
12676     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
12677
12678     // countPositives stub for large arrays.
12679     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
12680
12681     generate_compare_long_strings();
12682
12683     generate_string_indexof_stubs();
12684
12685     if (UseMultiplyToLenIntrinsic) {
12686       StubRoutines::_multiplyToLen = generate_multiplyToLen();
12687     }
12688
12689     if (UseSquareToLenIntrinsic) {
12690       StubRoutines::_squareToLen = generate_squareToLen();
12691     }
12692
12693     if (UseMulAddIntrinsic) {
12694       StubRoutines::_mulAdd = generate_mulAdd();
12695     }
12696
12697     if (UseSIMDForBigIntegerShiftIntrinsics) {
12698       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
12699       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
12700     }
12701
12702     if (UseMontgomeryMultiplyIntrinsic) {
12703       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
12704       address start = load_archive_data(stub_id);  // presumably non-null when an archived copy exists; confirm in load_archive_data
12705       if (start == nullptr) {
12706         // we have to generate it
12707         StubCodeMark mark(this, stub_id);
12708         MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
12709         start = g.generate_multiply();
12710         // record the stub start and end
12711         store_archive_data(stub_id, start, _masm->pc());
12712       }
12713       StubRoutines::_montgomeryMultiply = start;
12714     }
12715
12716     if (UseMontgomerySquareIntrinsic) {
12717       StubId stub_id = StubId::stubgen_montgomerySquare_id;
12718       address start = load_archive_data(stub_id);  // same load-or-generate pattern as the multiply stub above
12719       if (start == nullptr) {
12720         // we have to generate it
12721         StubCodeMark mark(this, stub_id);
12722         MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
12723         // We use generate_multiply() rather than generate_square()
12724         // because it's faster for the sizes of modulus we care about.
12725         start = g.generate_multiply();
12726         // record the stub start and end
12727         store_archive_data(stub_id, start, _masm->pc());
12728       }
12729       StubRoutines::_montgomerySquare = start;
12730     }
12731
12732     if (UseChaCha20Intrinsics) {
12733       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
12734     }
12735
12736     if (UseKyberIntrinsics) {
12737       StubRoutines::_kyberNtt = generate_kyberNtt();
12738       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
12739       StubRoutines::_kyberNttMult = generate_kyberNttMult();
12740       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
12741       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
12742       StubRoutines::_kyber12To16 = generate_kyber12To16();
12743       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
12744     }
12745
12746     if (UseDilithiumIntrinsics) {
12747       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
12748       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
12749       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
12750       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
12751       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
12752     }
12753
12754     if (UseBASE64Intrinsics) {
12755         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
12756         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
12757     }
12758
12759     // data cache line writeback
12760     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
12761     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
12762
12763     if (UseAESIntrinsics) {
12764       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
12765       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
12766       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
12767       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
12768       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
12769     }
12770     if (UseGHASHIntrinsics) {
12771       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
12772       StubRoutines::aarch64::_ghash_processBlocks_small = generate_ghash_processBlocks_small();
12773       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(StubRoutines::aarch64::_ghash_processBlocks_small);
12774     }
12775     if (UseAESIntrinsics && UseGHASHIntrinsics) {
12776       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
12777     }
12778
12779     if (UseMD5Intrinsics) {
12780       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
12781       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
12782     }
12783     if (UseSHA1Intrinsics) {
12784       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
12785       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
12786     }
12787     if (UseSHA256Intrinsics) {
12788       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
12789       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
12790     }
12791     if (UseSHA512Intrinsics) {
12792       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
12793       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
12794     }
12795     if (UseSHA3Intrinsics && UseSIMDForSHA3Intrinsic) {  // SIMD variants, plus the double-Keccak stub
12796       StubRoutines::_double_keccak         = generate_double_keccak();
12797       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
12798       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
12799     } else if (UseSHA3Intrinsics) {  // GPR variants; no double-Keccak stub on this path
12800       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
12801       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
12802     }
12803
12804     if (UsePoly1305Intrinsics) {
12805       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
12806     }
12807
12808     // generate Adler32 intrinsics code
12809     if (UseAdler32Intrinsics) {
12810       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
12811     }
12812
12813 #endif // COMPILER2
12814   }
12815 
12816  public:
12817   StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {  // Runs the generator matching blob_id
12818     switch(blob_id) {
12819     case BlobId::stubgen_preuniverse_id:
12820       generate_preuniverse_stubs();
12821       break;
12822     case BlobId::stubgen_initial_id:
12823       generate_initial_stubs();
12824       break;
12825      case BlobId::stubgen_continuation_id:
12826       generate_continuation_stubs();
12827       break;
12828     case BlobId::stubgen_compiler_id:
12829       generate_compiler_stubs();
12830       break;
12831     case BlobId::stubgen_final_id:
12832       generate_final_stubs();
12833       break;
12834     default:  // Any other blob id is reported through fatal()
12835       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12836       break;
12837     };
12838   }
12839 
12840 #if INCLUDE_CDS
12841   static void init_AOTAddressTable(GrowableArray<address>& external_addresses) {  // Appends the tables listed below to external_addresses
12842     // external data defined in this file
12843 #define ADD(addr) external_addresses.append((address)(addr));
12844     ADD(_sha256_round_consts);
12845     ADD(_sha512_round_consts);
12846     ADD(_sha3_round_consts);
12847     ADD(_double_keccak_round_consts);
12848     ADD(_encodeBlock_toBase64);
12849     ADD(_encodeBlock_toBase64URL);
12850     ADD(_decodeBlock_fromBase64ForNoSIMD);
12851     ADD(_decodeBlock_fromBase64URLForNoSIMD);
12852     ADD(_decodeBlock_fromBase64ForSIMD);
12853     ADD(_decodeBlock_fromBase64URLForSIMD);
12854 #undef ADD
12855   }
12856 #endif // INCLUDE_CDS
12857 }; // end class declaration
12858 
12859 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {  // Free-function entry point for stub generation
12860   StubGenerator g(code, blob_id, stub_data);  // The constructor dispatches on blob_id and generates the stubs
12861 }
12862 
12863 #if INCLUDE_CDS
12864 void StubGenerator_init_AOTAddressTable(GrowableArray<address>& addresses) {  // Free-function wrapper, compiled only when INCLUDE_CDS is set
12865   StubGenerator::init_AOTAddressTable(addresses);  // Forwards to the static member, which appends to addresses
12866 }
12867 #endif // INCLUDE_CDS
12868 
12869 #if defined (LINUX)
12870 
12871 // Define pointers to atomic stubs and initialize them to point to the
12872 // code in atomic_aarch64.S.
12873 // Each use declares aarch64_atomic_<OPNAME>_<SIZE><RELAXED>_default_impl and defines the matching _impl pointer set to it.
12874 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
12875   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12876     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
12877   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12878     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
12879
12880 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12881 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12882 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12883 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12884 DEFAULT_ATOMIC_OP(xchg, 4, )
12885 DEFAULT_ATOMIC_OP(xchg, 8, )
12886 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12887 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12888 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12889 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12890 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12891 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12892 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12893 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12894 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12895 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12896
12897 #undef DEFAULT_ATOMIC_OP
12898 
12899 #endif // LINUX